From 39eb95a82440e1e614533b702e1cda814e01f69c Mon Sep 17 00:00:00 2001 From: Luke Street Date: Wed, 3 Aug 2022 18:10:19 -0400 Subject: [PATCH] Update soxr to 0.1.3; make wasm compatible --- CMakeLists.txt | 2 +- soxr/.gitignore | 2 + soxr/CMakeLists.txt | 304 +- soxr/INSTALL | 98 +- soxr/LICENCE | 5 +- soxr/NEWS | 9 + soxr/README | 4 +- soxr/TODO | 6 +- soxr/cmake/Modules/FindCFlags.cmake | 35 + soxr/cmake/Modules/FindLibAVCodec.cmake | 20 +- soxr/cmake/Modules/FindLibAVUtil.cmake | 23 + soxr/cmake/Modules/FindOpenMP.cmake | 115 - soxr/cmake/Modules/FindSIMD.cmake | 94 - soxr/cmake/Modules/FindSIMD32.cmake | 54 + soxr/cmake/Modules/FindSIMD64.cmake | 29 + soxr/cmake/Modules/SetSystemProcessor.cmake | 37 + soxr/cmake/Modules/TestBigEndian.cmake | 15 - soxr/dist | 12 + soxr/examples/1-single-block.c | 2 +- soxr/examples/1a-lsr.c | 2 +- soxr/examples/3-options-input-fn.c | 11 +- soxr/examples/4-split-channels.c | 25 +- soxr/examples/CMakeLists.txt | 27 +- soxr/examples/examples-common.h | 14 +- soxr/go | 18 + soxr/go.bat | 27 + soxr/inst-check | 25 + soxr/inst-check-soxr | 52 + soxr/inst-check-soxr-lsr | 52 + soxr/lsr-tests/CMakeLists.txt | 50 + soxr/lsr-tests/COPYING | 340 + soxr/lsr-tests/README | 8 + soxr/lsr-tests/calc_snr.c | 242 + soxr/lsr-tests/callback_hang_test.c | 131 + soxr/lsr-tests/callback_test.c | 243 + soxr/lsr-tests/cmake/Modules/FindFFTW.cmake | 23 + .../lsr-tests/cmake/Modules/Findsndfile.cmake | 23 + soxr/lsr-tests/config.h.in | 24 + soxr/lsr-tests/downsample_test.c | 61 + soxr/lsr-tests/float_cast.h | 281 + soxr/lsr-tests/float_short_test.c | 192 + soxr/lsr-tests/misc_test.c | 175 + soxr/lsr-tests/multi_channel_test.c | 364 + soxr/lsr-tests/multichan_throughput_test.c | 216 + soxr/lsr-tests/reset_test.c | 238 + soxr/lsr-tests/simple_test.c | 117 + soxr/lsr-tests/sndfile-resample.c | 332 + soxr/lsr-tests/snr_bw_test.c | 401 ++ soxr/lsr-tests/termination_test.c | 339 + soxr/lsr-tests/throughput_test.c | 212 + soxr/lsr-tests/util.c | 230 + 
soxr/lsr-tests/util.h | 50 + soxr/lsr-tests/varispeed_test.c | 152 + soxr/msvc/README | 22 + soxr/msvc/example1.vcproj | 82 + soxr/msvc/libsoxr.sln | 29 + soxr/msvc/libsoxr.vcproj | 97 + soxr/msvc/soxr-config.h | 30 + soxr/multi-arch | 31 + soxr/soxr-config.h.in | 53 +- soxr/src/CMakeLists.txt | 149 +- soxr/src/aliases.h | 4 +- soxr/src/avfft32.c | 32 +- soxr/src/avfft32s.c | 37 +- soxr/src/cb_t.h | 26 + soxr/src/ccrw2.h | 4 +- soxr/src/cr-core.c | 316 + soxr/src/cr.c | 600 ++ soxr/src/cr.h | 178 + soxr/src/cr32.c | 8 + soxr/src/cr32s.c | 8 + soxr/src/cr64.c | 8 + soxr/src/cr64s.c | 8 + soxr/src/data-io.c | 59 +- soxr/src/dev32s.h | 54 + soxr/src/dev64s.h | 42 + soxr/src/fft4g.c | 16 +- soxr/src/fft4g32.c | 45 +- soxr/src/fft4g32s.c | 33 +- soxr/src/fft4g64.c | 43 +- soxr/src/fifo.h | 5 +- soxr/src/filter.c | 60 +- soxr/src/filter.h | 15 +- soxr/src/filters.h | 151 - soxr/src/half-coefs.h | 75 + soxr/src/half-fir.h | 58 +- soxr/src/half_coefs.h | 57 - soxr/src/internal.h | 66 +- soxr/src/libsoxr-dev.src.in | 2 - soxr/src/libsoxr.src.in | 1 - soxr/src/lsr.c | 114 - soxr/src/math-wrap.h | 31 + soxr/src/pffft-avx.h | 40 + soxr/src/pffft-wrap.c | 110 + soxr/src/pffft.c | 641 +- soxr/src/pffft.h | 56 +- soxr/src/pffft32.c | 39 +- soxr/src/pffft32s.c | 37 +- soxr/src/pffft64s.c | 34 + soxr/src/poly-fir.h | 200 +- soxr/src/poly-fir0.h | 60 +- soxr/src/rate.h | 726 -- soxr/src/rate32.c | 9 - soxr/src/rate32s.c | 9 - soxr/src/rate64.c | 9 - soxr/src/rdft.h | 8 +- soxr/src/rdft_t.h | 40 + soxr/src/rint-clip.h | 51 +- soxr/src/rint.h | 136 +- soxr/src/simd-dev.h | 5 - soxr/src/simd.h | 16 - soxr/src/soxr-lsr.c | 198 + soxr/src/soxr-lsr.h | 6 +- soxr/src/soxr.c | 368 +- soxr/src/soxr.h | 28 +- soxr/src/sse2neon.h | 6292 ----------------- soxr/src/std-types.h | 48 + soxr/src/{simd.c => util-simd.c} | 29 +- soxr/src/util32s.c | 8 + soxr/src/util32s.h | 23 + soxr/src/util64s.c | 8 + soxr/src/util64s.h | 23 + soxr/src/vr-coefs.c | 3 + soxr/src/vr-coefs.h | 3 + soxr/src/vr32.c | 72 
+- soxr/src/vr32s.c | 665 -- soxr/tests/CMakeLists.txt | 40 +- soxr/tests/bandwidth-test | 9 +- soxr/tests/cmp-test.cmake | 22 +- soxr/tests/eg-test | 5 +- soxr/tests/io-test | 18 +- soxr/tests/large-ratio-test | 23 +- soxr/tests/phase-test | 9 +- soxr/tests/q-test | 9 +- soxr/tests/scripts | 7 +- soxr/tests/throughput-test | 11 + soxr/tests/throughput-test.bat | 5 + soxr/tests/throughput.c | 141 + soxr/tests/time-test | 35 +- soxr/tests/vector-cmp.c | 81 +- soxr/tests/vector-gen.c | 81 +- 141 files changed, 9179 insertions(+), 9529 deletions(-) create mode 100644 soxr/.gitignore create mode 100644 soxr/cmake/Modules/FindCFlags.cmake create mode 100644 soxr/cmake/Modules/FindLibAVUtil.cmake delete mode 100644 soxr/cmake/Modules/FindOpenMP.cmake delete mode 100644 soxr/cmake/Modules/FindSIMD.cmake create mode 100644 soxr/cmake/Modules/FindSIMD32.cmake create mode 100644 soxr/cmake/Modules/FindSIMD64.cmake create mode 100644 soxr/cmake/Modules/SetSystemProcessor.cmake delete mode 100644 soxr/cmake/Modules/TestBigEndian.cmake create mode 100644 soxr/dist create mode 100644 soxr/go create mode 100644 soxr/go.bat create mode 100644 soxr/inst-check create mode 100644 soxr/inst-check-soxr create mode 100644 soxr/inst-check-soxr-lsr create mode 100644 soxr/lsr-tests/CMakeLists.txt create mode 100644 soxr/lsr-tests/COPYING create mode 100644 soxr/lsr-tests/README create mode 100644 soxr/lsr-tests/calc_snr.c create mode 100644 soxr/lsr-tests/callback_hang_test.c create mode 100644 soxr/lsr-tests/callback_test.c create mode 100644 soxr/lsr-tests/cmake/Modules/FindFFTW.cmake create mode 100644 soxr/lsr-tests/cmake/Modules/Findsndfile.cmake create mode 100644 soxr/lsr-tests/config.h.in create mode 100644 soxr/lsr-tests/downsample_test.c create mode 100644 soxr/lsr-tests/float_cast.h create mode 100644 soxr/lsr-tests/float_short_test.c create mode 100644 soxr/lsr-tests/misc_test.c create mode 100644 soxr/lsr-tests/multi_channel_test.c create mode 100644 
soxr/lsr-tests/multichan_throughput_test.c create mode 100644 soxr/lsr-tests/reset_test.c create mode 100644 soxr/lsr-tests/simple_test.c create mode 100644 soxr/lsr-tests/sndfile-resample.c create mode 100644 soxr/lsr-tests/snr_bw_test.c create mode 100644 soxr/lsr-tests/termination_test.c create mode 100644 soxr/lsr-tests/throughput_test.c create mode 100644 soxr/lsr-tests/util.c create mode 100644 soxr/lsr-tests/util.h create mode 100644 soxr/lsr-tests/varispeed_test.c create mode 100644 soxr/msvc/README create mode 100644 soxr/msvc/example1.vcproj create mode 100644 soxr/msvc/libsoxr.sln create mode 100644 soxr/msvc/libsoxr.vcproj create mode 100644 soxr/msvc/soxr-config.h create mode 100644 soxr/multi-arch create mode 100644 soxr/src/cb_t.h create mode 100644 soxr/src/cr-core.c create mode 100644 soxr/src/cr.c create mode 100644 soxr/src/cr.h create mode 100644 soxr/src/cr32.c create mode 100644 soxr/src/cr32s.c create mode 100644 soxr/src/cr64.c create mode 100644 soxr/src/cr64s.c create mode 100644 soxr/src/dev32s.h create mode 100644 soxr/src/dev64s.h delete mode 100644 soxr/src/filters.h create mode 100644 soxr/src/half-coefs.h delete mode 100644 soxr/src/half_coefs.h delete mode 100644 soxr/src/libsoxr-dev.src.in delete mode 100644 soxr/src/libsoxr.src.in delete mode 100644 soxr/src/lsr.c create mode 100644 soxr/src/math-wrap.h create mode 100644 soxr/src/pffft-avx.h create mode 100644 soxr/src/pffft-wrap.c create mode 100644 soxr/src/pffft64s.c delete mode 100644 soxr/src/rate.h delete mode 100644 soxr/src/rate32.c delete mode 100644 soxr/src/rate32s.c delete mode 100644 soxr/src/rate64.c create mode 100644 soxr/src/rdft_t.h delete mode 100644 soxr/src/simd-dev.h delete mode 100644 soxr/src/simd.h create mode 100644 soxr/src/soxr-lsr.c delete mode 100644 soxr/src/sse2neon.h create mode 100644 soxr/src/std-types.h rename soxr/src/{simd.c => util-simd.c} (69%) create mode 100644 soxr/src/util32s.c create mode 100644 soxr/src/util32s.h create mode 100644 
soxr/src/util64s.c create mode 100644 soxr/src/util64s.h delete mode 100644 soxr/src/vr32s.c create mode 100644 soxr/tests/throughput-test create mode 100644 soxr/tests/throughput-test.bat create mode 100644 soxr/tests/throughput.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 85eab0a..584a36e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -73,7 +73,7 @@ endif () set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}") -add_subdirectory(soxr/src) +add_subdirectory(soxr) add_library(boo lib/audiodev/Common.hpp diff --git a/soxr/.gitignore b/soxr/.gitignore new file mode 100644 index 0000000..ac1dff9 --- /dev/null +++ b/soxr/.gitignore @@ -0,0 +1,2 @@ +Release*/ +Debug*/ diff --git a/soxr/CMakeLists.txt b/soxr/CMakeLists.txt index 61bd596..76950ae 100644 --- a/soxr/CMakeLists.txt +++ b/soxr/CMakeLists.txt @@ -1,30 +1,36 @@ -# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net # Licence for this file: LGPL v2.1 See LICENCE for details. 
-cmake_minimum_required (VERSION 2.8 FATAL_ERROR) -cmake_policy(SET CMP0075 OLD) - -#project (soxr C) -#set (DESCRIPTION_SUMMARY "High quality, one-dimensional sample-rate conversion library") +cmake_minimum_required (VERSION 3.1 FATAL_ERROR) +project (soxr C) +set (DESCRIPTION_SUMMARY + "High quality, one-dimensional sample-rate conversion library") +cmake_policy(SET CMP0075 NEW) +cmake_policy(SET CMP0115 OLD) +cmake_policy(SET CMP0127 OLD) # Release versioning: set (PROJECT_VERSION_MAJOR 0) set (PROJECT_VERSION_MINOR 1) -set (PROJECT_VERSION_PATCH 2) +set (PROJECT_VERSION_PATCH 3) # For shared-object; if, since the last public release: -# * library code changed at all: ++revision -# * interfaces changed at all: ++current, revision = 0 -# * interfaces added: ++age -# * interfaces removed: age = 0 +# 1) library code changed at all: ++revision +# 2) interfaces changed at all: ++current, revision = 0 +# 3) interfaces added: ++age +# 4) interfaces removed: age = 0 set (SO_VERSION_CURRENT 1) -set (SO_VERSION_REVISION 1) +set (SO_VERSION_REVISION 2) set (SO_VERSION_AGE 1) +math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}") +math (EXPR SO_VERSION_MINOR "${SO_VERSION_AGE}") +math (EXPR SO_VERSION_PATCH "${SO_VERSION_REVISION}") + # Main options: @@ -32,112 +38,147 @@ set (SO_VERSION_AGE 1) include (CMakeDependentOption) if (NOT CMAKE_BUILD_TYPE) - set (CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE) + set (CMAKE_BUILD_TYPE Release CACHE STRING + "Build type, one of: None Debug Release RelWithDebInfo MinSizeRel." FORCE) endif () -#option (BUILD_TESTS "Build sanity-tests." ON) -#option (BUILD_SHARED_LIBS "Build shared libraries." ON) -#option (BUILD_EXAMPLES "Build examples." OFF) -option (WITH_LSR_BINDINGS "Include a `libsamplerate'-like interface." ON) -cmake_dependent_option (WITH_SINGLE_PRECISION "Build with single precision (for up to 20-bit accuracy)." 
ON - "WITH_DOUBLE_PRECISION" ON) -cmake_dependent_option (WITH_DOUBLE_PRECISION "Build with double precision (for up to 32-bit accuracy)." ON - "WITH_SINGLE_PRECISION" ON) -cmake_dependent_option (WITH_SIMD "Use SIMD (for faster single precision)." ON - "WITH_SINGLE_PRECISION" OFF) -cmake_dependent_option (WITH_AVFFT "Use libavcodec (LGPL) for SIMD DFT." OFF - "WITH_SIMD;NOT WITH_PFFFT" OFF) -cmake_dependent_option (WITH_PFFFT "Use PFFFT (BSD-like licence) for SIMD DFT." ON - "WITH_SIMD;NOT WITH_AVFFT" OFF) -option (SOXR_SILENT "Disable debug messages, even in debug mode" OFF) -if (UNIX) - if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/lsr-tests) - cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF - "WITH_LSR_BINDINGS" OFF) - endif () -endif () +option (BUILD_TESTS "Build sanity-tests." OFF) +option (BUILD_EXAMPLES "Build examples." OFF) +option (WITH_OPENMP "Include OpenMP threading." OFF) +option (WITH_LSR_BINDINGS "Include a `libsamplerate'-like interface." OFF) + +cmake_dependent_option (BUILD_SHARED_LIBS + "Build shared (dynamic) soxr libraries." OFF + "NOT WITH_DEV_GPROF" OFF) +cmake_dependent_option (WITH_VR32 + "Include HQ variable-rate resampling engine." ON + "WITH_CR32 OR WITH_CR64 OR WITH_CR32S OR WITH_CR64S OR NOT DEFINED WITH_VR32" ON) +cmake_dependent_option (WITH_CR32 + "Include HQ constant-rate resampling engine." ON + "WITH_VR32 OR WITH_CR64 OR WITH_CR32S OR WITH_CR64S" ON) +cmake_dependent_option (WITH_CR64 + "Include VHQ constant-rate resampling engine." ON + "WITH_VR32 OR WITH_CR32 OR WITH_CR32S OR WITH_CR64S" ON) +cmake_dependent_option (WITH_CR64S + "Include VHQ SIMD constant-rate resampling engine." ON + "WITH_VR32 OR WITH_CR32 OR WITH_CR32S OR WITH_CR64" ON) +cmake_dependent_option (WITH_CR32S + "Include HQ SIMD constant-rate resampling engine." ON + "WITH_VR32 OR WITH_CR64 OR WITH_CR32 OR WITH_CR64S" ON) +cmake_dependent_option (WITH_PFFFT + "Use PFFFT (BSD-like licence) for HQ SIMD DFT." 
ON + "WITH_CR32S;NOT WITH_AVFFT" OFF) +cmake_dependent_option (WITH_AVFFT + "Use libavcodec (LGPL) for HQ SIMD DFT." OFF + "WITH_CR32S;NOT WITH_PFFFT" OFF) +cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF + "UNIX;NOT CMAKE_CROSSCOMPILING;EXISTS ${PROJECT_SOURCE_DIR}/lsr-tests;WITH_LSR_BINDINGS" OFF) + +option (WITH_HI_PREC_CLOCK "Enable high-precision time-base." ON) +option (WITH_FLOAT_STD_PREC_CLOCK + "Use floating-point for standard-precision time-base." OFF) +option (WITH_DEV_TRACE "Enable developer trace capability." ON) +option (WITH_DEV_GPROF "Enable developer grpof output." OFF) +mark_as_advanced (WITH_HI_PREC_CLOCK WITH_FLOAT_STD_PREC_CLOCK + WITH_DEV_TRACE WITH_DEV_GPROF) # Introspection: -list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules) +list (APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) include (CheckFunctionExists) include (CheckIncludeFiles) include (CheckLibraryExists) -include (TestBigEndian) +include (SetSystemProcessor) +if (NOT EMSCRIPTEN) + include(TestBigEndian) +endif () + +set_system_processor () check_library_exists (m pow "" NEED_LIBM) if (NEED_LIBM) set (CMAKE_REQUIRED_LIBRARIES "m;${CMAKE_REQUIRED_LIBRARIES}") - link_libraries (m) + set (LIBM_LIBRARIES m) endif () -#if (WITH_OPENMP) -# find_package (OpenMP) -# if (OPENMP_FOUND) -# set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") -# set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") -# set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}") -# endif () -#endif () - -if (WITH_SIMD) - find_package (SIMD) - if (SIMD_FOUND) - set (HAVE_SIMD 1) - endif () +if (${BUILD_EXAMPLES}) + project (${PROJECT_NAME}) # Adds c++ compiler endif () -if (WITH_SINGLE_PRECISION) - set (HAVE_SINGLE_PRECISION 1) +if (WITH_OPENMP) + find_package (OpenMP) + if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} 
${OpenMP_CXX_FLAGS}") + if (MINGW) # Is this still needed? + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}") + endif () + endif() endif () -if (WITH_DOUBLE_PRECISION) - set (HAVE_DOUBLE_PRECISION 1) +if (WITH_CR32S) + find_package (SIMD32) + set (WITH_CR32S ${SIMD32_FOUND}) +endif () + +if (WITH_CR64S) + find_package (SIMD64) + set (WITH_CR64S ${SIMD64_FOUND}) endif () if (WITH_AVFFT) - find_package (LibAVCodec) + find_package (LibAVCodec REQUIRED) if (AVCODEC_FOUND) include_directories (${AVCODEC_INCLUDE_DIRS}) - link_libraries (${AVCODEC_LIBRARIES}) - set (HAVE_AVFFT 1) + set (LIBS ${LIBS} ${AVCODEC_LIBRARIES}) endif () endif () -if (SOXR_SILENT) - add_definitions (-DSOXR_SILENT=1) +if (WITH_AVFFT OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND SIMD32_FOUND AND WITH_CR32)) + find_package (LibAVUtil) + if (AVUTIL_FOUND) + include_directories (${AVUTIL_INCLUDE_DIRS}) + set (LIBS ${LIBS} ${AVUTIL_LIBRARIES}) + endif () endif () check_function_exists (lrint HAVE_LRINT) check_include_files (fenv.h HAVE_FENV_H) -test_big_endian (WORDS_BIGENDIAN) - -macro (make_exist) - foreach (x ${ARGN}) - if (NOT ${x}) - set (${x} 0) - endif () - endforeach () -endmacro () - -make_exist (HAVE_LRINT HAVE_FENV_H WORDS_BIGENDIAN HAVE_SIMD) -make_exist (HAVE_SINGLE_PRECISION HAVE_DOUBLE_PRECISION HAVE_AVFFT) +check_include_files (stdbool.h HAVE_STDBOOL_H) +check_include_files (stdint.h HAVE_STDINT_H) +if (EMSCRIPTEN) + set(HAVE_BIGENDIAN OFF) +else() + test_big_endian (HAVE_BIGENDIAN) +endif() # Compiler configuration: -if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX) - set (PROJECT_CXX_FLAGS "-Wconversion -Wall -W -Wundef -Wcast-align -Wpointer-arith -Wno-long-long") - set (PROJECT_C_FLAGS "${PROJECT_CXX_FLAGS} -Wnested-externs -Wmissing-prototypes -Wstrict-prototypes") +if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang") + set 
(PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -Wconversion -Wall -Wextra \ + -pedantic -Wundef -Wpointer-arith -Wno-long-long") + if (CMAKE_C_COMPILER_ID STREQUAL "Clang") + set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -Wno-keyword-macro") + endif () + if (WITH_DEV_GPROF) + set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -pg") + endif () + # Can use std=c89, but gnu89 should give faster sinf, cosf, etc.: + set (PROJECT_C_FLAGS "${PROJECT_CXX_FLAGS} \ + -std=gnu89 -Wnested-externs -Wmissing-prototypes -Wstrict-prototypes") if (CMAKE_BUILD_TYPE STREQUAL "Release") set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s") # strip endif () - cmake_dependent_option (VISIBILITY_HIDDEN "Build with -fvisibility=hidden." ON + cmake_dependent_option (VISIBILITY_HIDDEN + "Build shared libraries with -fvisibility=hidden." ON "BUILD_SHARED_LIBS" OFF) + mark_as_advanced (VISIBILITY_HIDDEN) if (VISIBILITY_HIDDEN) add_definitions (-fvisibility=hidden -DSOXR_VISIBILITY) endif () @@ -145,9 +186,14 @@ endif () if (MSVC) add_definitions (-D_USE_MATH_DEFINES -D_CRT_SECURE_NO_WARNINGS) - option (ENABLE_STATIC_RUNTIME "Visual Studio, link with runtime statically." OFF) - if (ENABLE_STATIC_RUNTIME) - foreach (flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO) + option (BUILD_SHARED_RUNTIME "MSVC, link with runtime dynamically." 
ON) + if (NOT BUILD_SHARED_RUNTIME) + foreach (flag_var + CMAKE_C_FLAGS CMAKE_CXX_FLAGS + CMAKE_C_FLAGS_DEBUG CMAKE_CXX_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_MINSIZEREL + CMAKE_C_FLAGS_RELWITHDEBINFO CMAKE_CXX_FLAGS_RELWITHDEBINFO) string (REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}") endforeach () endif () @@ -161,8 +207,9 @@ endif () # Build configuration: -if (${BUILD_SHARED_LIBS} AND ${CMAKE_SYSTEM_NAME} STREQUAL Windows) # Allow exes to find dlls: - set (BIN ${CMAKE_CURRENT_BINARY_DIR}/bin/) +if (${BUILD_SHARED_LIBS} AND ${CMAKE_SYSTEM_NAME} STREQUAL Windows) + # Allow exes to find dlls: + set (BIN ${PROJECT_BINARY_DIR}/bin/) set (EXAMPLES_BIN ${BIN}) set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${BIN}) set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${BIN}) @@ -179,6 +226,10 @@ if (BUILD_SHARED_LIBS) endif () endif () +if (CMAKE_BUILD_TYPE STREQUAL "None") # As used by some distros. + add_definitions (-DNDEBUG) +endif () + # Installation configuration: @@ -194,7 +245,7 @@ if (NOT DEFINED INCLUDE_INSTALL_DIR) endif () if (NOT DEFINED DOC_INSTALL_DIR) if (UNIX) - set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/share/doc/libsoxr") + set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/share/doc/lib${PROJECT_NAME}") else () set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/doc") endif () @@ -202,25 +253,24 @@ endif () if (APPLE) option (BUILD_FRAMEWORK "Build an OS X framework." 
OFF) - set (FRAMEWORK_INSTALL_DIR "/Library/Frameworks" CACHE STRING "Directory to install frameworks to.") + set (FRAMEWORK_INSTALL_DIR + "/Library/Frameworks" CACHE STRING "Directory to install frameworks to.") endif () # Top-level: -set (PROJECT_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}) -math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}") -math (EXPR SO_VERSION_MINOR "${SO_VERSION_AGE}") -math (EXPR SO_VERSION_PATCH "${SO_VERSION_REVISION}") +set (PROJECT_VERSION + ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}) set (SO_VERSION ${SO_VERSION_MAJOR}.${SO_VERSION_MINOR}.${SO_VERSION_PATCH}) configure_file ( - ${CMAKE_CURRENT_SOURCE_DIR}/soxr-config.h.in - ${CMAKE_CURRENT_BINARY_DIR}/soxr-config.h) -include_directories (${CMAKE_CURRENT_BINARY_DIR}) + ${PROJECT_SOURCE_DIR}/${PROJECT_NAME}-config.h.in + ${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config.h) +include_directories (${PROJECT_BINARY_DIR}) -if (BUILD_TESTS OR BUILD_LSR_TESTS) +if (NOT CMAKE_CROSSCOMPILING AND (BUILD_TESTS OR BUILD_LSR_TESTS)) enable_testing () endif () @@ -234,7 +284,7 @@ install (FILES # Subdirectories: -include_directories (${CMAKE_CURRENT_SOURCE_DIR}/src) +include_directories (${PROJECT_SOURCE_DIR}/src) add_subdirectory (src) if (BUILD_TESTS) @@ -249,55 +299,45 @@ endif () -# Rough-and-ready distclean for anyone still doing in-tree builds: +# GNU Autotools compatibility; 'make check': -#if (UNIX) -# add_custom_target (distclean -# COMMAND make clean && rm -rf -# CMakeCache.txt -# CMakeFiles -# cmake_install.cmake -# CPackConfig.cmake -# CPackSourceConfig.cmake -# deinstall.cmake -# Makefile -# soxr-config.h -# src/CMakeFiles -# src/cmake_install.cmake -# src/libsoxr-dev.src -# src/libsoxr-lsr.pc -# src/libsoxr.pc -# src/libsoxr.src -# src/Makefile) -#endif () +add_custom_target (check COMMAND ${CMAKE_CTEST_COMMAND}) + + + +# GNU Autotools compatibility; 'make distclean': + +if (UNIX) + add_custom_target 
(distclean COMMAND make clean && find . + \\! -path \\*/Modules/\\* \\! -name cmp-test.cmake -a -name \\*.cmake + -o -name CMakeFiles -o -name Makefile -o -name CMakeCache.txt -o -name + Testing -o -name cmake_install.cmake -o -name install_manifest.txt -o + -path ./soxr-config.h -o -name config.h -o -name \\*.pc -o -name \\*.s32 + | xargs rm -rf) +endif () # Deinstallation: -#configure_file ( -# "${CMAKE_CURRENT_SOURCE_DIR}/deinstall.cmake.in" -# "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake" -# IMMEDIATE @ONLY) +configure_file ( + "${CMAKE_CURRENT_SOURCE_DIR}/deinstall.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake" + IMMEDIATE @ONLY) -#add_custom_target (deinstall -# COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake") +add_custom_target (deinstall + COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake") # Packaging: -#if (UNIX) -# set (CPACK_PACKAGE_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}") -# set (CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}") -# set (CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}") -# -# set (CPACK_SOURCE_GENERATOR "TGZ") -# set (CPACK_SOURCE_IGNORE_FILES "dist;/lsr-tests/;/Debug/;/Release/;/cpack/;\\\\.swp$;\\\\.gitignore;/\\\\.git/") -# -# include (CPack) -# -# if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cpack) -# add_subdirectory (cpack) -# endif () -#endif () +if (UNIX) + set (CPACK_PACKAGE_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}") + set (CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}") + set (CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}") + set (CPACK_SOURCE_GENERATOR "TXZ") + set (CPACK_SOURCE_IGNORE_FILES + "dist;/lsr-tests/;/Debug.*/;/Release.*/;\\\\.swp$;\\\\.git.*;/\\\\.git/") + include (CPack) +endif () diff --git a/soxr/INSTALL b/soxr/INSTALL index c2c7675..5599870 100644 --- a/soxr/INSTALL +++ b/soxr/INSTALL @@ -1,11 +1,12 @@ -SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +SoX Resampler Library Copyright (c) 2007-16 
robs@users.sourceforge.net INSTALLATION GUIDE CONTENTS * Standard build * Build customisation -* Cross-compiling with mingw (linux host) +* Cross-compilation * Integration with other build systems +* Run-time configuration @@ -20,7 +21,7 @@ STANDARD BUILD * A 'make' utility (most compiler installations already have one of these). - * CMake: http://www.cmake.org/cmake/resources/software.html + * CMake v3.0 or newer: https://cmake.org/download/ 2. Build: @@ -30,7 +31,7 @@ STANDARD BUILD go (on MS-Windows with nmake) or - ./go (on unix-like systems) + ./go (on Unix-like systems) This should build the library and run a few sanity tests. @@ -38,14 +39,14 @@ STANDARD BUILD 3. Installation: Note that this step may need to be performed by a system - adminstrator. Enter: + administrator. Enter: nmake install (on MS-Windows) or - cd Release; make install (on unix) + cd Release; make install (on Unix-like) -4. Configuration: +4. Preparation for use: To use the library you may need to set up appropriate paths to the library and its header file in your development environment. @@ -60,38 +61,74 @@ STANDARD BUILD BUILD CUSTOMISATION -If it is necessary to customise the build, then steps 2 and 3 above may be -substituted as follows. Change directory to the one containing this file, -then enter commands along the lines of: +If it is necessary to customise the build, then steps 2 and 3 above should be +substituted as follows: change directory to the one containing this file, then +enter commands along the lines: mkdir build cd build - cmake [OPTIONS] .. + cmake -Wno-dev -DCMAKE_BUILD_TYPE=Release [OPTIONS] .. make make test sudo make install +N.B. The CMAKE_BUILD_TYPE to use for library deployment is Release. + To list help on the available options, enter: cmake -LH .. Options, if given, should be preceded with '-D', e.g. - cmake -DWITH_SIMD:BOOL=OFF .. 
+ -DBUILD_SHARED_LIBS:BOOL=OFF -CROSS-COMPILING WITH MINGW (LINUX HOST) +Resampling engines -For example: +As available on a given system, options for including up-to five resampling +‘engines’ are available (per above) as follows: + + WITH_CR32: for constant-rate HQ resampling, + WITH_CR32S: SIMD variant of previous, + WITH_CR64: for constant-rate VHQ resampling, + WITH_CR64S: SIMD variant of previous, + WITH_VR32: for variable-rate HQ resampling. + +By default, these options are all set to ON. + +When both SIMD and non-SIMD engine variants are included, run-time selection +is automatic (based on CPU capability) for x86 CPUs, and can be automatic for +ARM CPUs if the 3rd-party library `libavutil' is available at libsoxr +build-time. Which engine has been selected for a specific configuration and +invocation of the library can be checked using example #3, which reports it. +See also Run-time Configuration, below. + + + +CROSS-COMPILATION + +E.g. targeting a Linux ARM system: + + mkdir build + cd build + cmake -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc \ + .. +or, also building the examples (one of which uses C++): + + cmake -DCMAKE_SYSTEM_NAME=Linux \ + -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc \ + -DCMAKE_CXX_COMPILER=arm-linux-gnueabi-g++ \ + -DBUILD_EXAMPLES=1 \ + .. + +E.g. with Mingw (Linux host), using a tool-chain file: mkdir build cd build cmake -DCMAKE_TOOLCHAIN_FILE=~/Toolchain-x86_64-mingw-w64-mingw32.cmake \ -DCMAKE_INSTALL_PREFIX=install \ - -DHAVE_WORDS_BIGENDIAN_EXITCODE=1 \ - -DBUILD_TESTS=0 \ - -DBUILD_EXAMPLES=1 \ .. make @@ -117,7 +154,30 @@ INTEGRATION WITH OTHER BUILD SYSTEMS Autotools-based systems might find it useful to create a file called `configure' in the directory containing this file, consisting of the line: - cmake -DBUILD_SHARED_LIBS=OFF . + cmake -DBUILD_SHARED_LIBS=OFF . (or with other build options as required). -For MS visual studio, see msvc/README +For MS Visual Studio, see msvc/README. 
+ + + +RUN-TIME CONFIGURATION + +The libsoxr API structure ‘soxr_runtime_spec_t’ allows application developers +to optimise some aspects of libsoxr’s operation for a particular application. +Optimal performance however, might depend on an individual end-user’s run- +time system and the end-user’s preferences. Hence environment variables are +available to set (override) run-time parameters as follows: + + Env. variable Equivalent soxr_runtime_spec_t item (see soxr.h) + ------------------ ----------------------------------- + SOXR_COEFS_SIZE coef_size_kbytes + SOXR_COEF_INTERP SOXR_COEF_INTERP_xxx + SOXR_LARGE_DFT_SIZE log2_large_dft_size + SOXR_MIN_DFT_SIZE log2_min_dft_size + SOXR_NUM_THREADS num_threads + +Additionally, the SOXR_USE_SIMD32 and SOXR_USE_SIMD64 boolean environment +variables can be used to override automatic selection (or to provide manual +selection where automatic selection is not available) between SIMD and +non-SIMD engine variants. diff --git a/soxr/LICENCE b/soxr/LICENCE index 1c61878..43e5a71 100644 --- a/soxr/LICENCE +++ b/soxr/LICENCE @@ -1,4 +1,4 @@ -SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by @@ -11,8 +11,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License -along with this library; if not, write to the Free Software Foundation, -Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +along with this library; if not, see . Notes diff --git a/soxr/NEWS b/soxr/NEWS index f388974..9e7c298 100644 --- a/soxr/NEWS +++ b/soxr/NEWS @@ -1,3 +1,12 @@ +Version 0.1.3 (2018-02-24) + * SIMD enhancements: SSE, AVX, Neon. 
+ * Improve support for clang, ARM, and cross-compilation. + * Provide env. var. override of runtime parameters. + * Build fix re cmake variables AVCODEC_INCLUDE_DIRS & AVUTIL_INCLUDE_DIRS. + * Build options WITH_SINGLE_PRECISION, WITH_DOUBLE_PRECISION & WITH_SIMD have + been removed; replacement options are detailed in INSTALL, `Resampling + engines'. + Version 0.1.2 (2015-09-05) * Fix conversion failure when I/O types differ but I/O rates don't. * Fix #defines for interpolation order selection. diff --git a/soxr/README b/soxr/README index 06f11e6..7f9a7af 100644 --- a/soxr/README +++ b/soxr/README @@ -1,4 +1,4 @@ -SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net The SoX Resampler library `libsoxr' performs one-dimensional sample-rate conversion -- it may be used, for example, to resample PCM-encoded audio. @@ -46,7 +46,7 @@ size configuration parameters may be used to reduce this figure). For build and installation instructions, see the file `INSTALL'; for copyright and licensing information, see the file `LICENCE'. -For support and new versions, see http://soxr.sourceforge.net +For support and new versions, see https://soxr.sourceforge.net ________ ¹ For example, multi-channel resampling can utilise multiple CPU-cores. ² Bit-perfect within practical occupied-bandwidth limits. diff --git a/soxr/TODO b/soxr/TODO index 1c4a31b..2d1bc19 100644 --- a/soxr/TODO +++ b/soxr/TODO @@ -1,3 +1,3 @@ -* SOXR_ALLOW_ALIASING -* Explicit flush API fn, perhaps. -* More SIMD. +* vr32s +* vr32 with 1-delay-clear +* fir_to_phase with RDFT32 diff --git a/soxr/cmake/Modules/FindCFlags.cmake b/soxr/cmake/Modules/FindCFlags.cmake new file mode 100644 index 0000000..f118727 --- /dev/null +++ b/soxr/cmake/Modules/FindCFlags.cmake @@ -0,0 +1,35 @@ +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. 
+ +# - Function to find C compiler feature flags + +include (CheckCSourceCompiles) +include (FindPackageHandleStandardArgs) + +function (FindCFlags PKG_NAME PKG_DESC TRIAL_C_FLAGS TEST_C_SOURCE) + +foreach (TRIAL_C_FLAG ${TRIAL_C_FLAGS}) + message (STATUS "Trying ${PKG_NAME} C flags: ${TRIAL_C_FLAG}") + unset (DETECT_${PKG_NAME}_C_FLAGS CACHE) #displayed by check_c_source_compiles + + set (TMP "${CMAKE_REQUIRED_FLAGS}") + set (CMAKE_REQUIRED_FLAGS "${TRIAL_C_FLAG}") + check_c_source_compiles ("${TEST_C_SOURCE}" DETECT_${PKG_NAME}_C_FLAGS) + set (CMAKE_REQUIRED_FLAGS "${TMP}") + + if (DETECT_${PKG_NAME}_C_FLAGS) + set (DETECTED_C_FLAGS "${TRIAL_C_FLAG}") + break () + endif () +endforeach () + +# N.B. Will not overwrite existing cache variable: +set (${PKG_NAME}_C_FLAGS "${DETECTED_C_FLAGS}" + CACHE STRING "C compiler flags for ${PKG_DESC}") + +find_package_handle_standard_args ( + ${PKG_NAME} DEFAULT_MSG ${PKG_NAME}_C_FLAGS ${PKG_NAME}_C_FLAGS) +mark_as_advanced (${PKG_NAME}_C_FLAGS) +set (${PKG_NAME}_FOUND ${${PKG_NAME}_FOUND} PARENT_SCOPE) + +endfunction () diff --git a/soxr/cmake/Modules/FindLibAVCodec.cmake b/soxr/cmake/Modules/FindLibAVCodec.cmake index add33c3..f1bbf89 100644 --- a/soxr/cmake/Modules/FindLibAVCodec.cmake +++ b/soxr/cmake/Modules/FindLibAVCodec.cmake @@ -1,23 +1,23 @@ -# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net # Licence for this file: LGPL v2.1 See LICENCE for details. # - Find AVCODEC -# Find the native installation of this package: includes and libraries. +# Find the installation of this package: include-dirs and libraries. # -# AVCODEC_INCLUDES - where to find headers for this package. -# AVCODEC_LIBRARIES - List of libraries when using this package. -# AVCODEC_FOUND - True if this package can be found. +# AVCODEC_INCLUDE_DIRS - where to find headers for this package. +# AVCODEC_LIBRARIES - libraries to link to when using this package. 
+# AVCODEC_FOUND - true iff this package can be found. -if (AVCODEC_INCLUDES) +if (AVCODEC_INCLUDE_DIRS) set (AVCODEC_FIND_QUIETLY TRUE) -endif (AVCODEC_INCLUDES) +endif () -find_path (AVCODEC_INCLUDES libavcodec/avcodec.h) +find_path (AVCODEC_INCLUDE_DIRS libavcodec/avcodec.h) find_library (AVCODEC_LIBRARIES NAMES avcodec) include (FindPackageHandleStandardArgs) find_package_handle_standard_args ( - AVCODEC DEFAULT_MSG AVCODEC_LIBRARIES AVCODEC_INCLUDES) + AVCODEC DEFAULT_MSG AVCODEC_LIBRARIES AVCODEC_INCLUDE_DIRS) -mark_as_advanced (AVCODEC_LIBRARIES AVCODEC_INCLUDES) +mark_as_advanced (AVCODEC_LIBRARIES AVCODEC_INCLUDE_DIRS) diff --git a/soxr/cmake/Modules/FindLibAVUtil.cmake b/soxr/cmake/Modules/FindLibAVUtil.cmake new file mode 100644 index 0000000..464e6cf --- /dev/null +++ b/soxr/cmake/Modules/FindLibAVUtil.cmake @@ -0,0 +1,23 @@ +# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. + +# - Find AVUTIL +# Find the installation of this package: includes and libraries. +# +# AVUTIL_INCLUDE_DIRS - where to find headers for this package. +# AVUTIL_LIBRARIES - libraries to link to when using this package. +# AVUTIL_FOUND - true iff this package can be found. + +if (AVUTIL_INCLUDE_DIRS) + set (AVUTIL_FIND_QUIETLY TRUE) +endif () + +find_path (AVUTIL_INCLUDE_DIRS libavutil/cpu.h) + +find_library (AVUTIL_LIBRARIES NAMES avutil) + +include (FindPackageHandleStandardArgs) +find_package_handle_standard_args ( + AVUTIL DEFAULT_MSG AVUTIL_LIBRARIES AVUTIL_INCLUDE_DIRS) + +mark_as_advanced (AVUTIL_LIBRARIES AVUTIL_INCLUDE_DIRS) diff --git a/soxr/cmake/Modules/FindOpenMP.cmake b/soxr/cmake/Modules/FindOpenMP.cmake deleted file mode 100644 index eef8422..0000000 --- a/soxr/cmake/Modules/FindOpenMP.cmake +++ /dev/null @@ -1,115 +0,0 @@ -# - Finds OpenMP support -# This module can be used to detect OpenMP support in a compiler. 
-# If the compiler supports OpenMP, the flags required to compile with -# openmp support are set. -# -# The following variables are set: -# OpenMP_C_FLAGS - flags to add to the C compiler for OpenMP support -# OPENMP_FOUND - true if openmp is detected -# -# Supported compilers can be found at http://openmp.org/wp/openmp-compilers/ -# -# Modifications for soxr: -# * don't rely on presence of C++ compiler -# * support MINGW -# -#============================================================================= -# Copyright 2009 Kitware, Inc. -# Copyright 2008-2009 André Rigland Brodtkorb -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -include (CheckCSourceCompiles) -include (FindPackageHandleStandardArgs) - -set (OpenMP_C_FLAG_CANDIDATES - #Gnu - "-fopenmp" - #Microsoft Visual Studio - "/openmp" - #Intel windows - "-Qopenmp" - #Intel - "-openmp" - #Empty, if compiler automatically accepts openmp - " " - #Sun - "-xopenmp" - #HP - "+Oopenmp" - #IBM XL C/c++ - "-qsmp" - #Portland Group - "-mp" -) - -# sample openmp source code to test -set (OpenMP_C_TEST_SOURCE -" -#include -int main() { -#ifdef _OPENMP - return 0; -#else - breaks_on_purpose -#endif -} -") -# if these are set then do not try to find them again, -# by avoiding any try_compiles for the flags -if (DEFINED OpenMP_C_FLAGS) - set (OpenMP_C_FLAG_CANDIDATES) -endif (DEFINED OpenMP_C_FLAGS) - -# check c compiler -foreach (FLAG ${OpenMP_C_FLAG_CANDIDATES}) - set (SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") - set (CMAKE_REQUIRED_FLAGS "${FLAG}") - unset (OpenMP_FLAG_DETECTED CACHE) - message (STATUS "Try OpenMP C flag = [${FLAG}]") - check_c_source_compiles ("${OpenMP_C_TEST_SOURCE}" OpenMP_FLAG_DETECTED) - set (CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}") - if (OpenMP_FLAG_DETECTED) - set (OpenMP_C_FLAGS_INTERNAL "${FLAG}") - break () - endif (OpenMP_FLAG_DETECTED) -endforeach (FLAG ${OpenMP_C_FLAG_CANDIDATES}) - -set (OpenMP_C_FLAGS "${OpenMP_C_FLAGS_INTERNAL}" - CACHE STRING "C compiler flags for OpenMP parallization") - -# handle the standard arguments for find_package -find_package_handle_standard_args 
(OpenMP DEFAULT_MSG - OpenMP_C_FLAGS) - -if (MINGW) - set (OpenMP_SHARED_LINKER_FLAGS "${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}") - set (OpenMP_EXE_LINKER_FLAGS "${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}") -endif () - -mark_as_advanced (OpenMP_C_FLAGS OpenMP_SHARED_LINKER_FLAGS OpenMP_EXE_LINKER_FLAGS) diff --git a/soxr/cmake/Modules/FindSIMD.cmake b/soxr/cmake/Modules/FindSIMD.cmake deleted file mode 100644 index 6ac51cb..0000000 --- a/soxr/cmake/Modules/FindSIMD.cmake +++ /dev/null @@ -1,94 +0,0 @@ -# - Finds SIMD support -# -# The following variables are set: -# SIMD_C_FLAGS - flags to add to the C compiler for this package. -# SIMD_FOUND - true if support for this package is found. -# -#============================================================================= -# Based on FindOpenMP.cmake, which is: -# -# Copyright 2009 Kitware, Inc. -# Copyright 2008-2009 André Rigland Brodtkorb -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# * The names of Kitware, Inc., the Insight Consortium, or the names of -# any consortium members, or of any contributors, may not be used to -# endorse or promote products derived from this software without -# specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS'' -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -# ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR -# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -include (CheckCSourceCompiles) -include (FindPackageHandleStandardArgs) - -if (WIN32) # Safety for when mixed lib/app compilers (but performance hit) - set (GCC_WIN32_SIMD_OPTS "-mincoming-stack-boundary=2") -endif () - -set (SIMD_C_FLAG_CANDIDATES - # x64 - " " - # Microsoft Visual Studio x86 - "/arch:SSE /fp:fast -D__SSE__" - # Gcc x86 - "-msse -mfpmath=sse ${GCC_WIN32_SIMD_OPTS}" - # Gcc x86 (old versions) - "-msse -mfpmath=sse" -) - -set (SIMD_C_TEST_SOURCE -" -#include -int main() -{ - __m128 a, b; - float vals[4] = {0}; - a = _mm_loadu_ps (vals); - b = a; - b = _mm_add_ps (a,b); - _mm_storeu_ps (vals,b); - return 0; -} -") - -if (DEFINED SIMD_C_FLAGS) - set (SIMD_C_FLAG_CANDIDATES) -endif () - -foreach (FLAG ${SIMD_C_FLAG_CANDIDATES}) - set (SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}") - set (CMAKE_REQUIRED_FLAGS "${FLAG}") - unset (SIMD_FLAG_DETECTED CACHE) - message (STATUS "Try SIMD C flag = [${FLAG}]") - check_c_source_compiles ("${SIMD_C_TEST_SOURCE}" SIMD_FLAG_DETECTED) - set (CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}") - if (SIMD_FLAG_DETECTED) - set (SIMD_C_FLAGS_INTERNAL "${FLAG}") - break () - endif () -endforeach () - -set (SIMD_C_FLAGS "${SIMD_C_FLAGS_INTERNAL}" - CACHE STRING "C compiler flags for SIMD vectorization") - -find_package_handle_standard_args (SIMD DEFAULT_MSG SIMD_C_FLAGS SIMD_C_FLAGS) -mark_as_advanced (SIMD_C_FLAGS) diff --git a/soxr/cmake/Modules/FindSIMD32.cmake 
b/soxr/cmake/Modules/FindSIMD32.cmake new file mode 100644 index 0000000..9e42373 --- /dev/null +++ b/soxr/cmake/Modules/FindSIMD32.cmake @@ -0,0 +1,54 @@ +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. + +# - Finds SIMD32 support +# +# The following variables are set: +# SIMD32_C_FLAGS - flags to add to the C compiler for this package. +# SIMD32_FOUND - true if support for this package is found. + +if (DEFINED SIMD32_C_FLAGS) + set (TRIAL_C_FLAGS) +elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm") + set (TRIAL_C_FLAGS + # Gcc + "-mfpu=neon-vfpv4 -mcpu=cortex-a7" + "-mfpu=neon -mfloat-abi=hard" + "-mfpu=neon -mfloat-abi=softfp" + "-mfpu=neon -mfloat-abi=soft" + ) + set (TEST_C_SOURCE " + #include + int main(int c, char * * v) { + float32x4_t a = vdupq_n_f32((float)c), b = vdupq_n_f32((float)!!v); + return !vgetq_lane_u32(vceqq_f32(a,b),0); + } + ") +else () + if (WIN32) # Safety for when mixed lib/app compilers (but performance hit) + set (GCC_WIN32_SIMD32_OPTS "-mincoming-stack-boundary=2") + endif () + + set (TRIAL_C_FLAGS + # x64 + " " + # MSVC x86 + "/arch:SSE /fp:fast -D__SSE__" + # Gcc x86 + "-msse -mfpmath=sse ${GCC_WIN32_SIMD32_OPTS}" + # Gcc x86 (old versions) + "-msse -mfpmath=sse" + ) + set (TEST_C_SOURCE " + #include + int main(int c, char * * v) { + __m128 a = _mm_set_ss((float)c), b = _mm_set_ss((float)!!v); + return _mm_comineq_ss(a,b); + } + ") +endif () + +include (FindCFlags) + +FindCFlags ("SIMD32" "FLOAT-32 (single-precision) SIMD vectorization" + "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}") diff --git a/soxr/cmake/Modules/FindSIMD64.cmake b/soxr/cmake/Modules/FindSIMD64.cmake new file mode 100644 index 0000000..d412644 --- /dev/null +++ b/soxr/cmake/Modules/FindSIMD64.cmake @@ -0,0 +1,29 @@ +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. 
+ +# - Finds SIMD64 support +# +# The following variables are set: +# SIMD64_C_FLAGS - flags to add to the C compiler for this package. +# SIMD64_FOUND - true if support for this package is found. + +if (DEFINED SIMD64_C_FLAGS OR CMAKE_SYSTEM_PROCESSOR MATCHES "^arm") + set (TRIAL_C_FLAGS) +else () + set (TRIAL_C_FLAGS + "-mavx" # Gcc + "/arch:AVX" # MSVC + ) + set (TEST_C_SOURCE " + #ifndef __AVX__ + #error + #endif + #include + int main() {return 0;} + ") +endif () + +include (FindCFlags) + +FindCFlags ("SIMD64" "FLOAT-64 (double-precision) SIMD vectorization" + "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}") diff --git a/soxr/cmake/Modules/SetSystemProcessor.cmake b/soxr/cmake/Modules/SetSystemProcessor.cmake new file mode 100644 index 0000000..8e2c292 --- /dev/null +++ b/soxr/cmake/Modules/SetSystemProcessor.cmake @@ -0,0 +1,37 @@ +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. + +# Sets CMAKE_SYSTEM_PROCESSOR for cross-compiling. + +macro (set_system_processor) + if (CMAKE_CROSSCOMPILING) + if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "unknown") + unset(CMAKE_SYSTEM_PROCESSOR) + endif () + if (NOT DEFINED CMAKE_SYSTEM_PROCESSOR) + include (CheckCSourceCompiles) + set (CPU_LINES + "#if defined __x86_64__ || defined _M_X64 /*\;x86_64\;*/" + "#if defined __i386__ || defined _M_IX86 /*\;x86_32\;*/" + "#if defined __arm__ || defined _M_ARM /*\;arm\;*/" + ) + foreach (CPU_LINE ${CPU_LINES}) + string (CONCAT CPU_SOURCE "${CPU_LINE}" " + int main() {return 0;} + #endif + ") + unset (SYSTEM_PROCESSOR_DETECTED CACHE) + check_c_source_compiles ("${CPU_SOURCE}" SYSTEM_PROCESSOR_DETECTED) + if (SYSTEM_PROCESSOR_DETECTED) + list (GET CPU_LINE 1 CMAKE_SYSTEM_PROCESSOR) + message (STATUS "CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}") + break () + endif () + endforeach () + endif () + + # N.B. 
Will not overwrite existing cache variable: + set (CMAKE_SYSTEM_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}" + CACHE STRING "Target system processor") + endif () +endmacro () diff --git a/soxr/cmake/Modules/TestBigEndian.cmake b/soxr/cmake/Modules/TestBigEndian.cmake deleted file mode 100644 index d80df20..0000000 --- a/soxr/cmake/Modules/TestBigEndian.cmake +++ /dev/null @@ -1,15 +0,0 @@ -# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net -# Licence for this file: LGPL v2.1 See LICENCE for details. - -# - Macro to determine endian type -# test_big_endian (VARIABLE) -# VARIABLE - variable to store the result to - -macro (test_big_endian VARIABLE) - if ("${HAVE_${VARIABLE}}" MATCHES "^${HAVE_${VARIABLE}}$") - include (CheckCSourceRuns) - check_c_source_runs ("int main() {union {long i; char c[sizeof(long)];} - const u = {1}; return !!u.c[0];}" HAVE_${VARIABLE}) - set (${VARIABLE} "${HAVE_${VARIABLE}}" CACHE INTERNAL "1 if system is big endian" FORCE) - endif () -endmacro () diff --git a/soxr/dist b/soxr/dist new file mode 100644 index 0000000..ee68b30 --- /dev/null +++ b/soxr/dist @@ -0,0 +1,12 @@ +#!/bin/sh +set -e +# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. + +# Makes the distribution tarball + +test $# = 1 -o `git status -s|wc -c` = 0 +rm -rf Release +./go -j4 +cd Release +make package_source diff --git a/soxr/examples/1-single-block.c b/soxr/examples/1-single-block.c index 3fb9201..3b919b4 100644 --- a/soxr/examples/1-single-block.c +++ b/soxr/examples/1-single-block.c @@ -25,7 +25,7 @@ const float in[] = { /* Input: 12 cycles of a sine wave with freq. = irate/4 */ int main(int argc, char const * arg[]) { - double irate = argc > 1? atof(arg[1]) : 1; /* Default to upsampling */ + double irate = argc > 1? atof(arg[1]) : 1; /* Default to interpolation */ double orate = argc > 2? atof(arg[2]) : 2; /* by a factor of 2. 
*/ size_t olen = (size_t)(AL(in) * orate / irate + .5); /* Assay output len. */ diff --git a/soxr/examples/1a-lsr.c b/soxr/examples/1a-lsr.c index e42e530..6b50a8f 100644 --- a/soxr/examples/1a-lsr.c +++ b/soxr/examples/1a-lsr.c @@ -12,7 +12,7 @@ float in[] = { /* Input: 12 cycles of a sine wave with freq. = irate/4 */ int main(int argc, char const * arg[]) { - double irate = argc > 1? atof(arg[1]) : 1; /* Default to upsampling */ + double irate = argc > 1? atof(arg[1]) : 1; /* Default to interpolation */ double orate = argc > 2? atof(arg[2]) : 2; /* by a factor of 2. */ size_t olen = (size_t)(AL(in) * orate / irate + .5); /* Assay output len. */ diff --git a/soxr/examples/3-options-input-fn.c b/soxr/examples/3-options-input-fn.c index 38fbb0d..afd43b9 100644 --- a/soxr/examples/3-options-input-fn.c +++ b/soxr/examples/3-options-input-fn.c @@ -1,4 +1,4 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ /* Example 3: extends example 2 with multiple channels, multiple datatypes, @@ -14,7 +14,7 @@ * OUTPUT-RATE Ditto * NUM-CHANNELS Number of interleaved channels * IN-DATATYPE# 0:float32 1:float64 2:int32 3:int16 - * OUT-DATATYPE# Ditto + * OUT-DATATYPE# Ditto; or 11 for un-dithered int16 * Q-RECIPE Quality recipe (in hex) See soxr.h * Q-FLAGS Quality flags (in hex) See soxr.h * PASSBAND-END % @@ -42,7 +42,7 @@ static size_t input_fn(input_context_t * p, soxr_cbuf_t * buf, size_t len) int main(int n, char const * arg[]) { - char const * const arg0 = n? --n, *arg++ : ""; + char const * const arg0 = n? --n, *arg++ : "", * engine = ""; double const irate = n? --n, atof(*arg++) : 96000.; double const orate = n? --n, atof(*arg++) : 44100.; unsigned const chans = n? 
--n, (unsigned)atoi(*arg++) : 1; @@ -94,6 +94,7 @@ int main(int n, char const * arg[]) } if (!error) { /* If all is well, run the resampler: */ + engine = soxr_engine(soxr); USE_STD_STDIO; /* Resample in blocks: */ do odone = soxr_output(soxr, obuf, olen); @@ -106,8 +107,8 @@ int main(int n, char const * arg[]) soxr_delete(soxr); free(obuf), free(ibuf); /* Diagnostics: */ - fprintf(stderr, "%-26s %s; %lu clips; I/O: %s\n", + fprintf(stderr, "%-26s %s; %lu clips; I/O: %s (%s)\n", arg0, soxr_strerror(error), (long unsigned)clips, - ferror(stdin) || ferror(stdout)? strerror(errno) : "no error"); + ferror(stdin) || ferror(stdout)? strerror(errno) : "no error", engine); return !!error; } diff --git a/soxr/examples/4-split-channels.c b/soxr/examples/4-split-channels.c index d6448aa..a9022ce 100644 --- a/soxr/examples/4-split-channels.c +++ b/soxr/examples/4-split-channels.c @@ -1,4 +1,4 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ /* Example 4: variant of examples 2 & 3, demonstrating I/O with split channels. @@ -13,6 +13,8 @@ * * Note also (not shown in the examples) that split/interleaved channels may * be used for input and output independently. + * + * Arguments are as example 3. */ #include @@ -73,13 +75,17 @@ int main(int n, char const * arg[]) double const orate = n? --n, atof(*arg++) : 44100.; unsigned const chans = n? --n, (unsigned)atoi(*arg++) : 1; soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0; - soxr_datatype_t const otype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0; + unsigned const ospec = n? --n, (soxr_datatype_t)atoi(*arg++) : 0; unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ; unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0; + double const passband_end = n? --n, atof(*arg++) : 0; + double const stopband_begin = n? 
--n, atof(*arg++) : 0; + double const phase_response = n? --n, atof(*arg++) : -1; int const use_threads = n? --n, atoi(*arg++) : 1; + soxr_datatype_t const otype = ospec & 3; - soxr_quality_spec_t const q_spec = soxr_quality_spec(q_recipe, q_flags); - soxr_io_spec_t const io_spec=soxr_io_spec(itype|SOXR_SPLIT, otype|SOXR_SPLIT); + soxr_quality_spec_t q_spec = soxr_quality_spec(q_recipe, q_flags); + soxr_io_spec_t io_spec=soxr_io_spec(itype|SOXR_SPLIT, otype|SOXR_SPLIT); soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads); /* Allocate resampling input and output buffers in proportion to the input @@ -102,11 +108,18 @@ int main(int n, char const * arg[]) size_t odone, written, need_input = 1, clips = 0; soxr_error_t error; + soxr_t soxr; + unsigned i; - soxr_t soxr = soxr_create( + /* Overrides (if given): */ + if (passband_end > 0) q_spec.passband_end = passband_end / 100; + if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100; + if (phase_response >=0) q_spec.phase_response = phase_response; + io_spec.flags = ospec & ~7u; + + soxr = soxr_create( irate, orate, chans, &error, &io_spec, &q_spec, &runtime_spec); - unsigned i; for (i = 0; i < chans; ++i) { ibuf_ptrs[i] = iptr; obuf_ptrs[i] = optr; diff --git a/soxr/examples/CMakeLists.txt b/soxr/examples/CMakeLists.txt index 8107a4e..c8c17c9 100644 --- a/soxr/examples/CMakeLists.txt +++ b/soxr/examples/CMakeLists.txt @@ -1,25 +1,23 @@ -# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net # Licence for this file: LGPL v2.1 See LICENCE for details. 
-if (${BUILD_EXAMPLES}) - project (soxr) # Adds c++ compiler - file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/[1-9]-*.[cC]) -elseif (${BUILD_TESTS}) - file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/3*.c) -endif () +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}") +set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PROJECT_CXX_FLAGS}") +link_libraries (${PROJECT_NAME} ${LIBM_LIBRARIES}) if (${BUILD_EXAMPLES} OR ${BUILD_TESTS}) + set (SOURCES 3-options-input-fn) if (${WITH_LSR_BINDINGS}) - set (LSR_SOURCES 1a-lsr.c) + set (LSR_SOURCES 1a-lsr) endif () endif () -if (NOT BUILD_SHARED_LIBS AND OPENMP_FOUND) - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_C_FLAGS}") +if (${BUILD_EXAMPLES}) + list (APPEND SOURCES 1-single-block 2-stream 4-split-channels) + if (${WITH_VR32}) + list (APPEND SOURCES 5-variable-rate) + endif () endif () -set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}") -set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PROJECT_CXX_FLAGS}") -link_libraries (soxr) foreach (fe ${SOURCES} ${LSR_SOURCES}) get_filename_component (f ${fe} NAME_WE) @@ -34,4 +32,5 @@ if (${BUILD_TESTS} AND ${WITH_LSR_BINDINGS}) endif () file (GLOB INSTALL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.[cCh]) -install (FILES ${INSTALL_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/README DESTINATION ${DOC_INSTALL_DIR}/examples) +install (FILES ${INSTALL_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/README + DESTINATION ${DOC_INSTALL_DIR}/examples) diff --git a/soxr/examples/examples-common.h b/soxr/examples/examples-common.h index 585fac3..fc8ed82 100644 --- a/soxr/examples/examples-common.h +++ b/soxr/examples/examples-common.h @@ -1,4 +1,4 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ /* Common includes etc. for the examples. 
*/ @@ -17,10 +17,7 @@ #include #include #define USE_STD_STDIO _setmode(_fileno(stdout), _O_BINARY), \ - _setmode(_fileno(stdin ), _O_BINARY); - /* Sometimes missing, so ensure that it is defined: */ - #undef M_PI - #define M_PI 3.14159265358979323846 + _setmode(_fileno(stdin ), _O_BINARY) #else #define USE_STD_STDIO #endif @@ -38,8 +35,13 @@ #endif #undef min -#undef max #define min(x,y) ((x)<(y)?(x):(y)) + +#undef max #define max(x,y) ((x)>(y)?(x):(y)) +#undef AL #define AL(a) (sizeof(a)/sizeof((a)[0])) /* Array Length */ + +#undef M_PI /* Sometimes missing, so ensure that it is defined: */ +#define M_PI 3.14159265358979323846 diff --git a/soxr/go b/soxr/go new file mode 100644 index 0000000..7fba810 --- /dev/null +++ b/soxr/go @@ -0,0 +1,18 @@ +#!/bin/sh +set -e + +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. + +case "$1" in -j*) j="$1"; shift;; esac # Support -jX for parallel build/test + +test x"$1" = x && build=Release || build="$1" + +rm -f CMakeCache.txt # Prevent interference from any in-tree build + +mkdir -p "$build" +cd "$build" + +cmake -Wno-dev -DCMAKE_BUILD_TYPE="$build" .. +make $j +ctest $j || echo "FAILURE details in $build/Testing/Temporary/LastTest.log" diff --git a/soxr/go.bat b/soxr/go.bat new file mode 100644 index 0000000..aabff75 --- /dev/null +++ b/soxr/go.bat @@ -0,0 +1,27 @@ +@echo off +rem SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net +rem Licence for this file: LGPL v2.1 See LICENCE for details. + +set build=%1 +if x%build% == x set build=Release + +rem Prevent interference from any in-tree build +del/f CMakeCache.txt + +mkdir %build% +cd %build% + +cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=%build% -Wno-dev .. 
+if errorlevel 1 goto end + +nmake +if errorlevel 1 goto end + +nmake test +if errorlevel 1 goto error +goto end + +:error +echo FAILURE details in Testing\Temporary\LastTest.log + +:end diff --git a/soxr/inst-check b/soxr/inst-check new file mode 100644 index 0000000..8cf64b7 --- /dev/null +++ b/soxr/inst-check @@ -0,0 +1,25 @@ +#!/bin/sh +set -e +# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. + +# Sanity-check of library installed on unix-like system + +# This script checks the installation of the entire library (including lsr). +# +# Distros using three separate packages can do the following (in order): +# +# * Install soxr pkg (i.e. basically, just the shared object) +# * ./inst-check-soxr +# * Install soxr-lsr pkg (i.e. basically, just the shared object) +# * ./inst-check-soxr-lsr +# * Install the -dev pkg (i.e. examples, headers, & pkg-config) +# * ./inst-check PATH-OF-INSTALLED-EXAMPLES-DIR (e.g. /usr/share/doc/libsoxr/examples) + +# Where are the example source files: +src=$1 +test x$src = x && src=/usr/local/share/doc/libsoxr/examples + +dir="$(dirname $(readlink -f $0))" +$dir/inst-check-soxr $src +$dir/inst-check-soxr-lsr $src diff --git a/soxr/inst-check-soxr b/soxr/inst-check-soxr new file mode 100644 index 0000000..5f923b8 --- /dev/null +++ b/soxr/inst-check-soxr @@ -0,0 +1,52 @@ +#!/bin/sh +set -e +# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. 
+ +# Sanity-check of sub-library installed on unix-like system + +arg="$1" # path to installed examples (if dev pkg installed); otherwise omitted +dir="$(dirname $(readlink -f $0))" + +# Find the examples: +src="$arg" +test x"$src" = x && src="$dir/examples" +cd $src + +# Somewhere to put the binaries: +tmp=`mktemp -d` + +build_examples() { + if [ x"$arg" = x ]; then + echo "Examples in `pwd`; using local headers:" # for when dev pkg not installed + libs=-l$1 + cflags=-I$dir/src + else + echo "Examples in `pwd`; using pkg-config:" + libs=$(pkg-config --libs $1) + cflags=$(pkg-config --cflags $1) + fi + for f in ?$2-*.[cC]; do + cc=cc; echo $f | grep -q C$ && cc=c++ + out=$tmp/`echo $f | sed "s/.[cC]$//"` + cmd="$cc $cflags -o $out $f $libs -lm" + echo $cmd; $cmd + done +} + +# Determine library: +if [ `basename $0` = inst-check-soxr ]; then + build_examples soxr + gen="dd if=/dev/urandom count=1000" + $tmp/1-single-block 1 2 . + $gen 2> /dev/null | $tmp/2-stream 2>&1 >$tmp/stdout + $gen 2> /dev/null | $tmp/3-options-input-fn 6 7 2 2 0 2>&1 >$tmp/stdout + $gen 2> /dev/null | $tmp/4-split-channels 7 6 2 2 3 2>&1 >$tmp/stdout # Clipping expected here + $gen 2> /dev/null | $tmp/5-variable-rate 2>&1 >$tmp/stdout +else + build_examples soxr-lsr a # lsr has 'a' suffix on example number. + $tmp/1a-lsr 1 2 . +fi + +# Tidy up: +rm -rf $tmp diff --git a/soxr/inst-check-soxr-lsr b/soxr/inst-check-soxr-lsr new file mode 100644 index 0000000..5f923b8 --- /dev/null +++ b/soxr/inst-check-soxr-lsr @@ -0,0 +1,52 @@ +#!/bin/sh +set -e +# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. 
+ +# Sanity-check of sub-library installed on unix-like system + +arg="$1" # path to installed examples (if dev pkg installed); otherwise omitted +dir="$(dirname $(readlink -f $0))" + +# Find the examples: +src="$arg" +test x"$src" = x && src="$dir/examples" +cd $src + +# Somewhere to put the binaries: +tmp=`mktemp -d` + +build_examples() { + if [ x"$arg" = x ]; then + echo "Examples in `pwd`; using local headers:" # for when dev pkg not installed + libs=-l$1 + cflags=-I$dir/src + else + echo "Examples in `pwd`; using pkg-config:" + libs=$(pkg-config --libs $1) + cflags=$(pkg-config --cflags $1) + fi + for f in ?$2-*.[cC]; do + cc=cc; echo $f | grep -q C$ && cc=c++ + out=$tmp/`echo $f | sed "s/.[cC]$//"` + cmd="$cc $cflags -o $out $f $libs -lm" + echo $cmd; $cmd + done +} + +# Determine library: +if [ `basename $0` = inst-check-soxr ]; then + build_examples soxr + gen="dd if=/dev/urandom count=1000" + $tmp/1-single-block 1 2 . + $gen 2> /dev/null | $tmp/2-stream 2>&1 >$tmp/stdout + $gen 2> /dev/null | $tmp/3-options-input-fn 6 7 2 2 0 2>&1 >$tmp/stdout + $gen 2> /dev/null | $tmp/4-split-channels 7 6 2 2 3 2>&1 >$tmp/stdout # Clipping expected here + $gen 2> /dev/null | $tmp/5-variable-rate 2>&1 >$tmp/stdout +else + build_examples soxr-lsr a # lsr has 'a' suffix on example number. + $tmp/1a-lsr 1 2 . +fi + +# Tidy up: +rm -rf $tmp diff --git a/soxr/lsr-tests/CMakeLists.txt b/soxr/lsr-tests/CMakeLists.txt new file mode 100644 index 0000000..4f718f7 --- /dev/null +++ b/soxr/lsr-tests/CMakeLists.txt @@ -0,0 +1,50 @@ +# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. 
+ +list (APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules) + +find_package (FFTW) +if (FFTW_FOUND) + include_directories (${FFTW_INCLUDE_DIRS}) + link_libraries (${FFTW_LIBRARIES}) + set (HAVE_FFTW3 1) +endif () + +find_package (sndfile) +if (SNDFILE_FOUND) + include_directories (${SNDFILE_INCLUDE_DIRS}) + link_libraries (${SNDFILE_LIBRARIES}) + set (HAVE_SNDFILE 1) +endif () + +check_function_exists (lrintf HAVE_LRINTF) +check_function_exists (alarm HAVE_ALARM) +check_function_exists (signal HAVE_SIGNAL) +check_include_files (sys/times.h HAVE_SYS_TIMES_H) + +configure_file (${CMAKE_CURRENT_SOURCE_DIR}/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/config.h) +include_directories (${CMAKE_CURRENT_BINARY_DIR}) + +add_library (tests_lib STATIC util calc_snr) + +link_libraries (tests_lib ${PROJECT_NAME}-lsr ${LIBM_LIBRARIES}) + +enable_testing () + +set (tests + callback_hang_test callback_test downsample_test + float_short_test misc_test multi_channel_test + reset_test simple_test termination_test varispeed_test) +if (WITH_CR64 OR WITH_CR64S) + set (tests ${tests} snr_bw_test) +endif () + +foreach (test ${tests}) + add_executable (${test} ${test}) + add_test (lsr-${test} ${BIN}${test}) + set_property (TEST lsr-${test} PROPERTY ENVIRONMENT "SOXR_LSR_STRICT=1") +endforeach () + +add_executable (multichan_throughput_test multichan_throughput_test) +add_executable (throughput_test throughput_test ) +add_executable (sndfile-resample sndfile-resample) diff --git a/soxr/lsr-tests/COPYING b/soxr/lsr-tests/COPYING new file mode 100644 index 0000000..d60c31a --- /dev/null +++ b/soxr/lsr-tests/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + , 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/soxr/lsr-tests/README b/soxr/lsr-tests/README new file mode 100644 index 0000000..f468446 --- /dev/null +++ b/soxr/lsr-tests/README @@ -0,0 +1,8 @@ +The C source and header files in this directory have been copied from +the `libsamplerate' project and are copyrighted by its authors -- see +the notices within the files and the file `COPYING' for details. + +They are used here to test libsoxr's optional libsamplerate-like +wrapper. The only modifications made are to the file `snr_bw_test.c' to +remove reliance on certain frequency response troughs that are specific +to libsamplerate. diff --git a/soxr/lsr-tests/calc_snr.c b/soxr/lsr-tests/calc_snr.c new file mode 100644 index 0000000..ddfc04c --- /dev/null +++ b/soxr/lsr-tests/calc_snr.c @@ -0,0 +1,242 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. 
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include "util.h"
+
+#if (HAVE_FFTW3 == 1)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <fftw3.h>
+
+#define MAX_SPEC_LEN (1<<18)
+#define MAX_PEAKS 10
+
+static void log_mag_spectrum (double *input, int len, double *magnitude) ;
+static void smooth_mag_spectrum (double *magnitude, int len) ;
+static double find_snr (const double *magnitude, int len, int expected_peaks) ;
+
+typedef struct
+{ double peak ;
+ int index ;
+} PEAK_DATA ;
+
+double
+calculate_snr (float *data, int len, int expected_peaks)
+{ static double magnitude [MAX_SPEC_LEN] ;
+ static double datacopy [MAX_SPEC_LEN] ;
+
+ double snr = 200.0 ;
+ int k ;
+
+ if (len > MAX_SPEC_LEN)
+ { printf ("%s : line %d : data length too large.\n", __FILE__, __LINE__) ;
+ exit (1) ;
+ } ;
+
+ for (k = 0 ; k < len ; k++)
+ datacopy [k] = data [k] ;
+
+ /* Pad the data just a little to speed up the FFT. */
+ while ((len & 0x1F) && len < MAX_SPEC_LEN)
+ { datacopy [len] = 0.0 ;
+ len ++ ;
+ } ;
+
+ log_mag_spectrum (datacopy, len, magnitude) ;
+ smooth_mag_spectrum (magnitude, len / 2) ;
+
+ snr = find_snr (magnitude, len, expected_peaks) ;
+
+ return snr ;
+} /* calculate_snr */
+
+/*==============================================================================
+** There is a slight problem with trying to measure SNR with the method used
+** here; the side lobes of the windowed FFT can look like a noise/aliasing peak.
+** The solution is to smooth the magnitude spectrum by wiping out troughs
+** between adjacent peaks as done here.
+** This removes side lobe peaks without affecting noise/aliasing peaks.
+*/
+
+static void linear_smooth (double *mag, PEAK_DATA *larger, PEAK_DATA *smaller) ;
+
+static void
+smooth_mag_spectrum (double *mag, int len)
+{ PEAK_DATA peaks [2] ;
+
+ int k ;
+
+ memset (peaks, 0, sizeof (peaks)) ;
+
+ /* Find first peak. */
+ for (k = 1 ; k < len - 1 ; k++)
+ { if (mag [k - 1] < mag [k] && mag [k] >= mag [k + 1])
+ { peaks [0].peak = mag [k] ;
+ peaks [0].index = k ;
+ break ;
+ } ;
+ } ;
+
+ /* Find subsequent peaks and smooth between peaks. */
+ for (k = peaks [0].index + 1 ; k < len - 1 ; k++)
+ { if (mag [k - 1] < mag [k] && mag [k] >= mag [k + 1])
+ { peaks [1].peak = mag [k] ;
+ peaks [1].index = k ;
+
+ if (peaks [1].peak > peaks [0].peak)
+ linear_smooth (mag, &peaks [1], &peaks [0]) ;
+ else
+ linear_smooth (mag, &peaks [0], &peaks [1]) ;
+ peaks [0] = peaks [1] ;
+ } ;
+ } ;
+
+} /* smooth_mag_spectrum */
+
+static void
+linear_smooth (double *mag, PEAK_DATA *larger, PEAK_DATA *smaller)
+{ int k ;
+
+ if (smaller->index < larger->index)
+ { for (k = smaller->index + 1 ; k < larger->index ; k++)
+ mag [k] = (mag [k] < mag [k - 1]) ? 0.999 * mag [k - 1] : mag [k] ;
+ }
+ else
+ { for (k = smaller->index - 1 ; k >= larger->index ; k--)
+ mag [k] = (mag [k] < mag [k + 1]) ? 0.999 * mag [k + 1] : mag [k] ;
+ } ;
+
+} /* linear_smooth */
+
+/*==============================================================================
+*/
+
+static int
+peak_compare (const void *vp1, const void *vp2)
+{ const PEAK_DATA *peak1, *peak2 ;
+
+ peak1 = (const PEAK_DATA*) vp1 ;
+ peak2 = (const PEAK_DATA*) vp2 ;
+
+ return (peak1->peak < peak2->peak) ? 
1 : -1 ; +} /* peak_compare */ + +static double +find_snr (const double *magnitude, int len, int expected_peaks) +{ PEAK_DATA peaks [MAX_PEAKS] ; + + int k, peak_count = 0 ; + double snr ; + + memset (peaks, 0, sizeof (peaks)) ; + + /* Find the MAX_PEAKS largest peaks. */ + for (k = 1 ; k < len - 1 ; k++) + { if (magnitude [k - 1] < magnitude [k] && magnitude [k] >= magnitude [k + 1]) + { if (peak_count < MAX_PEAKS) + { peaks [peak_count].peak = magnitude [k] ; + peaks [peak_count].index = k ; + peak_count ++ ; + qsort (peaks, peak_count, sizeof (PEAK_DATA), peak_compare) ; + } + else if (magnitude [k] > peaks [MAX_PEAKS - 1].peak) + { peaks [MAX_PEAKS - 1].peak = magnitude [k] ; + peaks [MAX_PEAKS - 1].index = k ; + qsort (peaks, MAX_PEAKS, sizeof (PEAK_DATA), peak_compare) ; + } ; + } ; + } ; + + if (peak_count < expected_peaks) + { printf ("\n%s : line %d : bad peak_count (%d), expected %d.\n\n", __FILE__, __LINE__, peak_count, expected_peaks) ; + return -1.0 ; + } ; + + /* Sort the peaks. */ + qsort (peaks, peak_count, sizeof (PEAK_DATA), peak_compare) ; + + snr = peaks [0].peak ; + for (k = 1 ; k < peak_count ; k++) + if (fabs (snr - peaks [k].peak) > 10.0) + return fabs (peaks [k].peak) ; + + return snr ; +} /* find_snr */ + +static void +log_mag_spectrum (double *input, int len, double *magnitude) +{ fftw_plan plan = NULL ; + + double maxval ; + int k ; + + if (input == NULL || magnitude == NULL) + return ; + + plan = fftw_plan_r2r_1d (len, input, magnitude, FFTW_R2HC, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT) ; + if (plan == NULL) + { printf ("%s : line %d : create plan failed.\n", __FILE__, __LINE__) ; + exit (1) ; + } ; + + fftw_execute (plan) ; + + fftw_destroy_plan (plan) ; + + /* (k < N/2 rounded up) */ + maxval = 0.0 ; + for (k = 1 ; k < len / 2 ; k++) + { magnitude [k] = sqrt (magnitude [k] * magnitude [k] + magnitude [len - k - 1] * magnitude [len - k - 1]) ; + maxval = (maxval < magnitude [k]) ? 
magnitude [k] : maxval ; + } ; + + memset (magnitude + len / 2, 0, len / 2 * sizeof (magnitude [0])) ; + + /* Don't care about DC component. Make it zero. */ + magnitude [0] = 0.0 ; + + /* log magnitude. */ + for (k = 0 ; k < len ; k++) + { magnitude [k] = magnitude [k] / maxval ; + magnitude [k] = (magnitude [k] < 1e-15) ? -200.0 : 20.0 * log10 (magnitude [k]) ; + } ; + + return ; +} /* log_mag_spectrum */ + +#else /* ! (HAVE_LIBFFTW && HAVE_LIBRFFTW) */ + +double +calculate_snr (float *data, int len, int expected_peaks) +{ double snr = 200.0 ; + + data = data ; + len = len ; + expected_peaks = expected_peaks ; + + return snr ; +} /* calculate_snr */ + +#endif + diff --git a/soxr/lsr-tests/callback_hang_test.c b/soxr/lsr-tests/callback_hang_test.c new file mode 100644 index 0000000..be89369 --- /dev/null +++ b/soxr/lsr-tests/callback_hang_test.c @@ -0,0 +1,131 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
+*/
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#if HAVE_ALARM && HAVE_SIGNAL && HAVE_SIGALRM
+
+#include <unistd.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define SHORT_BUFFER_LEN 512
+#define LONG_BUFFER_LEN (1 << 14)
+
+typedef struct
+{ double ratio ;
+ int count ;
+} SRC_PAIR ;
+
+static void callback_hang_test (int converter) ;
+
+static void alarm_handler (int number) ;
+static long input_callback (void *cb_data, float **data) ;
+
+
+int
+main (void)
+{
+ /* Set up SIGALRM handler. */
+ signal (SIGALRM, alarm_handler) ;
+
+ puts ("") ;
+ callback_hang_test (SRC_ZERO_ORDER_HOLD) ;
+ callback_hang_test (SRC_LINEAR) ;
+ callback_hang_test (SRC_SINC_FASTEST) ;
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+
+static void
+callback_hang_test (int converter)
+{ static float output [LONG_BUFFER_LEN] ;
+ static SRC_PAIR pairs [] =
+ {
+ { 1.2, 5 }, { 1.1, 1 }, { 1.0, 1 }, { 3.0, 1 }, { 2.0, 1 }, { 0.3, 1 },
+ { 1.2, 0 }, { 1.1, 10 }, { 1.0, 1 }
+ } ;
+
+
+ SRC_STATE *src_state ;
+
+ double src_ratio = 1.0 ;
+ int k, error ;
+
+ printf ("\tcallback_hang_test (%-28s) ....... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ /* Perform sample rate conversion. */
+ src_state = src_callback_new (input_callback, converter, 1, &error, NULL) ;
+ if (src_state == NULL)
+ { printf ("\n\nLine %d : src_callback_new () failed : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ for (k = 0 ; k < ARRAY_LEN (pairs) ; k++)
+ { alarm (1) ;
+ src_ratio = pairs [k].ratio ;
+ src_callback_read (src_state, src_ratio, pairs [k].count, output) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ alarm (0) ;
+ puts ("ok") ;
+
+ return ;
+} /* callback_hang_test */
+
+static void
+alarm_handler (int number)
+{
+ (void) number ;
+ printf ("\n\n Error : Hang inside src_callback_read() detected. 
Exiting!\n\n") ; + exit (1) ; +} /* alarm_handler */ + +static long +input_callback (void *cb_data, float **data) +{ + static float buffer [20] ; + + (void) cb_data ; + *data = buffer ; + + return ARRAY_LEN (buffer) ; +} /* input_callback */ + +#else + +int +main (void) +{ + puts ("\tCan't run this test on this platform.") ; + return 0 ; +} /* main */ + +#endif diff --git a/soxr/lsr-tests/callback_test.c b/soxr/lsr-tests/callback_test.c new file mode 100644 index 0000000..0854d64 --- /dev/null +++ b/soxr/lsr-tests/callback_test.c @@ -0,0 +1,243 @@ +/* +** Copyright (C) 2003-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 10000
+#define CB_READ_LEN 256
+
+static void callback_test (int converter, double ratio) ;
+static void end_of_stream_test (int converter) ;
+
+int
+main (void)
+{ static double src_ratios [] =
+ { 1.0, 0.099, 0.1, 0.33333333, 0.789, 1.0001, 1.9, 3.1, 9.9
+ } ;
+
+ int k ;
+
+ puts ("") ;
+
+ puts (" Zero Order Hold interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ callback_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ;
+
+ puts (" Linear interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ callback_test (SRC_LINEAR, src_ratios [k]) ;
+
+ puts (" Sinc interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ callback_test (SRC_SINC_FASTEST, src_ratios [k]) ;
+
+ puts ("") ;
+
+ puts (" End of stream test :") ;
+ end_of_stream_test (SRC_ZERO_ORDER_HOLD) ;
+ end_of_stream_test (SRC_LINEAR) ;
+ end_of_stream_test (SRC_SINC_FASTEST) ;
+
+ puts ("") ;
+ return 0 ;
+} /* main */
+
+/*=====================================================================================
+*/
+
+typedef struct
+{ int channels ;
+ long count, total ;
+ int end_of_data ;
+ float data [BUFFER_LEN] ;
+} TEST_CB_DATA ;
+
+static long
+test_callback_func (void *cb_data, float **data)
+{ TEST_CB_DATA *pcb_data ;
+
+ long frames ;
+
+ if ((pcb_data = cb_data) == NULL)
+ return 0 ;
+
+ if (data == NULL)
+ return 0 ;
+
+ if (pcb_data->total - pcb_data->count > CB_READ_LEN)
+ frames = CB_READ_LEN / pcb_data->channels ;
+ else
+ frames = (pcb_data->total - pcb_data->count) / pcb_data->channels ;
+
+ *data = pcb_data->data + pcb_data->count ;
+ pcb_data->count += frames ;
+
+ return frames ;
+} /* test_callback_func */
+
+
+static void
+callback_test (int converter, double src_ratio)
+{ static TEST_CB_DATA test_callback_data ;
+ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+
+ long read_count, read_total ;
+ int error ;
+
+ printf 
("\tcallback_test (SRC ratio = %6.4f) ........... ", src_ratio) ; + fflush (stdout) ; + + test_callback_data.channels = 2 ; + test_callback_data.count = 0 ; + test_callback_data.end_of_data = 0 ; + test_callback_data.total = ARRAY_LEN (test_callback_data.data) ; + + if ((src_state = src_callback_new (test_callback_func, converter, test_callback_data.channels, &error, &test_callback_data)) == NULL) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + read_total = 0 ; + do + { /* We will be throwing away output data, so just grab as much as possible. */ + read_count = ARRAY_LEN (output) / test_callback_data.channels ; + read_count = src_callback_read (src_state, src_ratio, read_count, output) ; + read_total += read_count ; + } + while (read_count > 0) ; + + if ((error = src_error (src_state)) != 0) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + src_state = src_delete (src_state) ; + + if (fabs (read_total / src_ratio - ARRAY_LEN (test_callback_data.data)) > 2.0) + { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ; + printf (" input len : %d\n", ARRAY_LEN (test_callback_data.data)) ; + printf (" output len : %ld (should be %g +/- 2)\n\n", read_total, + floor (0.5 + src_ratio * ARRAY_LEN (test_callback_data.data))) ; + exit (1) ; + } ; + + puts ("ok") ; + + return ; +} /* callback_test */ + +/*===================================================================================== +*/ + +static long +eos_callback_func (void *cb_data, float **data) +{ + TEST_CB_DATA *pcb_data ; + long frames ; + + if (data == NULL) + return 0 ; + + if ((pcb_data = cb_data) == NULL) + return 0 ; + + /* + ** Return immediately if there is no more data. + ** In this case, the output pointer 'data' will not be set and + ** valgrind should not warn about it. 
+ */
+ if (pcb_data->end_of_data)
+ return 0 ;
+
+ if (pcb_data->total - pcb_data->count > CB_READ_LEN)
+ frames = CB_READ_LEN / pcb_data->channels ;
+ else
+ frames = (pcb_data->total - pcb_data->count) / pcb_data->channels ;
+
+ *data = pcb_data->data + pcb_data->count ;
+ pcb_data->count += frames ;
+
+ /*
+ ** Set end_of_data so that the next call to the callback function will
+ ** return zero count without setting the 'data' pointer.
+ */
+ if (pcb_data->total < 2 * pcb_data->count)
+ pcb_data->end_of_data = 1 ;
+
+ return frames ;
+} /* eos_callback_data */
+
+
+static void
+end_of_stream_test (int converter)
+{ static TEST_CB_DATA test_callback_data ;
+ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+
+ double src_ratio = 0.3 ;
+ long read_count, read_total ;
+ int error ;
+
+ printf ("\t%-30s ........... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ test_callback_data.channels = 2 ;
+ test_callback_data.count = 0 ;
+ test_callback_data.end_of_data = 0 ;
+ test_callback_data.total = ARRAY_LEN (test_callback_data.data) ;
+
+ if ((src_state = src_callback_new (eos_callback_func, converter, test_callback_data.channels, &error, &test_callback_data)) == NULL)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ read_total = 0 ;
+ do
+ { /* We will be throwing away output data, so just grab as much as possible. */
+ read_count = ARRAY_LEN (output) / test_callback_data.channels ;
+ read_count = src_callback_read (src_state, src_ratio, read_count, output) ;
+ read_total += read_count ;
+ }
+ while (read_count > 0) ;
+
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (test_callback_data.end_of_data == 0)
+ { printf ("\n\nLine %d : test_callback_data.end_of_data should not be 0."
+ " This is a bug in the test.\n\n", __LINE__) ; + exit (1) ; + } ; + + puts ("ok") ; + return ; +} /* end_of_stream_test */ diff --git a/soxr/lsr-tests/cmake/Modules/FindFFTW.cmake b/soxr/lsr-tests/cmake/Modules/FindFFTW.cmake new file mode 100644 index 0000000..409268e --- /dev/null +++ b/soxr/lsr-tests/cmake/Modules/FindFFTW.cmake @@ -0,0 +1,23 @@ +# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. + +# - Find FFTW +# Find the native installation of this package: includes and libraries. +# +# FFTW_INCLUDES - where to find headers for this package. +# FFTW_LIBRARIES - List of libraries when using this package. +# FFTW_FOUND - True if this package can be found. + +if (FFTW_INCLUDES) + set (FFTW_FIND_QUIETLY TRUE) +endif (FFTW_INCLUDES) + +find_path (FFTW_INCLUDES fftw3.h) + +find_library (FFTW_LIBRARIES NAMES fftw3) + +include (FindPackageHandleStandardArgs) +find_package_handle_standard_args ( + FFTW DEFAULT_MSG FFTW_LIBRARIES FFTW_INCLUDES) + +mark_as_advanced (FFTW_LIBRARIES FFTW_INCLUDES) diff --git a/soxr/lsr-tests/cmake/Modules/Findsndfile.cmake b/soxr/lsr-tests/cmake/Modules/Findsndfile.cmake new file mode 100644 index 0000000..b2fd725 --- /dev/null +++ b/soxr/lsr-tests/cmake/Modules/Findsndfile.cmake @@ -0,0 +1,23 @@ +# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. + +# - Find SNDFILE +# Find the native installation of this package: includes and libraries. +# +# SNDFILE_INCLUDES - where to find headers for this package. +# SNDFILE_LIBRARIES - List of libraries when using this package. +# SNDFILE_FOUND - True if this package can be found. 
+ +if (SNDFILE_INCLUDES) + set (SNDFILE_FIND_QUIETLY TRUE) +endif (SNDFILE_INCLUDES) + +find_path (SNDFILE_INCLUDES sndfile.h) + +find_library (SNDFILE_LIBRARIES NAMES sndfile) + +include (FindPackageHandleStandardArgs) +find_package_handle_standard_args ( + SNDFILE DEFAULT_MSG SNDFILE_LIBRARIES SNDFILE_INCLUDES) + +mark_as_advanced (SNDFILE_LIBRARIES SNDFILE_INCLUDES) diff --git a/soxr/lsr-tests/config.h.in b/soxr/lsr-tests/config.h.in new file mode 100644 index 0000000..1095e00 --- /dev/null +++ b/soxr/lsr-tests/config.h.in @@ -0,0 +1,24 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#if !defined soxsrc_lsr_tests_config_included +#define soxsrc_lsr_tests_config_included + +#cmakedefine01 HAVE_ALARM +#cmakedefine01 HAVE_FFTW3 +#cmakedefine01 HAVE_LRINTF +#cmakedefine01 HAVE_LRINT +#cmakedefine01 HAVE_SIGNAL +#cmakedefine01 HAVE_SNDFILE +#cmakedefine01 HAVE_SYS_TIMES_H + +#if HAVE_SIGNAL + #include + #if defined SIGALRM + #define HAVE_SIGALRM 1 + #else + #define HAVE_SIGALRM 0 + #endif +#endif + +#endif diff --git a/soxr/lsr-tests/downsample_test.c b/soxr/lsr-tests/downsample_test.c new file mode 100644 index 0000000..87243e7 --- /dev/null +++ b/soxr/lsr-tests/downsample_test.c @@ -0,0 +1,61 @@ +/* +** Copyright (C) 2008-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. 
+** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +#include +#include +#include + +#include "util.h" + +static void +downsample_test (int converter) +{ static float in [1000], out [10] ; + SRC_DATA data ; + + printf (" downsample_test (%-28s) ....... ", src_get_name (converter)) ; + fflush (stdout) ; + + data.src_ratio = 1.0 / 255.0 ; + data.input_frames = ARRAY_LEN (in) ; + data.output_frames = ARRAY_LEN (out) ; + data.data_in = in ; + data.data_out = out ; + + if (src_simple (&data, converter, 1)) + { puts ("src_simple failed.") ; + exit (1) ; + } ; + + puts ("ok") ; +} /* downsample_test */ + +int +main (void) +{ + puts ("") ; + + downsample_test (SRC_ZERO_ORDER_HOLD) ; + downsample_test (SRC_LINEAR) ; + downsample_test (SRC_SINC_FASTEST) ; + downsample_test (SRC_SINC_MEDIUM_QUALITY) ; + downsample_test (SRC_SINC_BEST_QUALITY) ; + + puts ("") ; + + return 0 ; +} /* main */ diff --git a/soxr/lsr-tests/float_cast.h b/soxr/lsr-tests/float_cast.h new file mode 100644 index 0000000..77ad5b4 --- /dev/null +++ b/soxr/lsr-tests/float_cast.h @@ -0,0 +1,281 @@ +/* +** Copyright (C) 2001-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU Lesser General Public License as published by +** the Free Software Foundation; either version 2.1 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU Lesser General Public License for more details. 
+** +** You should have received a copy of the GNU Lesser General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +*/ + +/* Version 1.5 */ + +#ifndef FLOAT_CAST_HEADER +#define FLOAT_CAST_HEADER + +/*============================================================================ +** On Intel Pentium processors (especially PIII and probably P4), converting +** from float to int is very slow. To meet the C specs, the code produced by +** most C compilers targeting Pentium needs to change the FPU rounding mode +** before the float to int conversion is performed. +** +** Changing the FPU rounding mode causes the FPU pipeline to be flushed. It +** is this flushing of the pipeline which is so slow. +** +** Fortunately the ISO C99 specifications define the functions lrint, lrintf, +** llrint and llrintf which fix this problem as a side effect. +** +** On Unix-like systems, the configure process should have detected the +** presence of these functions. If they weren't found we have to replace them +** here with a standard C cast. +*/ + +/* +** The C99 prototypes for lrint and lrintf are as follows: +** +** long int lrintf (float x) ; +** long int lrint (double x) ; +*/ + +#include "config.h" + +/* +** The presence of the required functions are detected during the configure +** process and the values HAVE_LRINT and HAVE_LRINTF are set accordingly in +** the config.h file. +*/ + +#define HAVE_LRINT_REPLACEMENT 0 + +#if (HAVE_LRINT && HAVE_LRINTF) + + /* + ** These defines enable functionality introduced with the 1999 ISO C + ** standard. They must be defined before the inclusion of math.h to + ** engage them. If optimisation is enabled, these functions will be + ** inlined. With optimisation switched off, you have to link in the + ** maths library using -lm. 
+ */ + + #define _ISOC9X_SOURCE 1 + #define _ISOC99_SOURCE 1 + + #define __USE_ISOC9X 1 + #define __USE_ISOC99 1 + + #include + +#elif (defined (__CYGWIN__)) + + #include + + #undef HAVE_LRINT_REPLACEMENT + #define HAVE_LRINT_REPLACEMENT 1 + + #undef lrint + #undef lrintf + + #define lrint double2int + #define lrintf float2int + + /* + ** The native CYGWIN lrint and lrintf functions are buggy: + ** http://sourceware.org/ml/cygwin/2005-06/msg00153.html + ** http://sourceware.org/ml/cygwin/2005-09/msg00047.html + ** and slow. + ** These functions (pulled from the Public Domain MinGW math.h header) + ** replace the native versions. + */ + + static inline long double2int (double in) + { long retval ; + + __asm__ __volatile__ + ( "fistpl %0" + : "=m" (retval) + : "t" (in) + : "st" + ) ; + + return retval ; + } /* double2int */ + + static inline long float2int (float in) + { long retval ; + + __asm__ __volatile__ + ( "fistpl %0" + : "=m" (retval) + : "t" (in) + : "st" + ) ; + + return retval ; + } /* float2int */ + +#elif (defined (WIN64) || defined(_WIN64)) + + /* Win64 section should be places before Win32 one, because + ** most likely both WIN32 and WIN64 will be defined in 64-bit case. + */ + + #include + + /* Win64 doesn't seem to have these functions, nor inline assembly. + ** Therefore implement inline versions of these functions here. + */ + #include + #include + + __inline long int + lrint(double flt) + { + return _mm_cvtsd_si32(_mm_load_sd(&flt)); + } + + __inline long int + lrintf(float flt) + { + return _mm_cvtss_si32(_mm_load_ss(&flt)); + } + +#elif (defined (WIN32) || defined (_WIN32)) + + #undef HAVE_LRINT_REPLACEMENT + #define HAVE_LRINT_REPLACEMENT 1 + + #include + + /* + ** Win32 doesn't seem to have these functions. + ** Therefore implement inline versions of these functions here. 
+ */ + + __inline long int + lrint (double flt) + { int intgr ; + + _asm + { fld flt + fistp intgr + } ; + + return intgr ; + } + + __inline long int + lrintf (float flt) + { int intgr ; + + _asm + { fld flt + fistp intgr + } ; + + return intgr ; + } + +#elif (defined (__MWERKS__) && defined (macintosh)) + + /* This MacOS 9 solution was provided by Stephane Letz */ + + #undef HAVE_LRINT_REPLACEMENT + #define HAVE_LRINT_REPLACEMENT 1 + #include + + #undef lrint + #undef lrintf + + #define lrint double2int + #define lrintf float2int + + inline int + float2int (register float in) + { long res [2] ; + + asm + { fctiw in, in + stfd in, res + } + return res [1] ; + } /* float2int */ + + inline int + double2int (register double in) + { long res [2] ; + + asm + { fctiw in, in + stfd in, res + } + return res [1] ; + } /* double2int */ + +#elif (defined (__MACH__) && defined (__APPLE__)) + + /* For Apple MacOSX. */ + + #undef HAVE_LRINT_REPLACEMENT + #define HAVE_LRINT_REPLACEMENT 1 + #include + + #undef lrint + #undef lrintf + + #define lrint double2int + #define lrintf float2int + + inline static long + float2int (register float in) + { int res [2] ; + + __asm__ __volatile__ + ( "fctiw %1, %1\n\t" + "stfd %1, %0" + : "=m" (res) /* Output */ + : "f" (in) /* Input */ + : "memory" + ) ; + + return res [1] ; + } /* lrintf */ + + inline static long + double2int (register double in) + { int res [2] ; + + __asm__ __volatile__ + ( "fctiw %1, %1\n\t" + "stfd %1, %0" + : "=m" (res) /* Output */ + : "f" (in) /* Input */ + : "memory" + ) ; + + return res [1] ; + } /* lrint */ + +#else + #ifndef __sgi + #warning "Don't have the functions lrint() and lrintf()." + #warning "Replacing these functions with a standard C cast." 
+ #endif + + #include + + #define lrint(dbl) ((long) (dbl)) + #define lrintf(flt) ((long) (flt)) + +#endif + + +#endif /* FLOAT_CAST_HEADER */ + diff --git a/soxr/lsr-tests/float_short_test.c b/soxr/lsr-tests/float_short_test.c new file mode 100644 index 0000000..6664a3b --- /dev/null +++ b/soxr/lsr-tests/float_short_test.c @@ -0,0 +1,192 @@ +/* +** Copyright (C) 2003-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
+*/ + +#include +#include +#include + +#include + +#include "util.h" + +#define BUFFER_LEN 10000 + +static void float_to_short_test (void) ; +static void short_to_float_test (void) ; + +static void float_to_int_test (void) ; +static void int_to_float_test (void) ; + +int +main (void) +{ + puts ("") ; + + float_to_short_test () ; + short_to_float_test () ; + + float_to_int_test () ; + int_to_float_test () ; + + puts ("") ; + + return 0 ; +} /* main */ + +/*===================================================================================== +*/ + +static void +float_to_short_test (void) +{ + static float fpos [] = + { 0.95, 0.99, 1.0, 1.01, 1.1, 2.0, 11.1, 111.1, 2222.2, 33333.3 + } ; + static float fneg [] = + { -0.95, -0.99, -1.0, -1.01, -1.1, -2.0, -11.1, -111.1, -2222.2, -33333.3 + } ; + + static short out [MAX (ARRAY_LEN (fpos), ARRAY_LEN (fneg))] ; + + int k ; + + printf ("\tfloat_to_short_test ............................. ") ; + + src_float_to_short_array (fpos, out, ARRAY_LEN (fpos)) ; + + for (k = 0 ; k < ARRAY_LEN (fpos) ; k++) + if (out [k] < 30000) + { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ; + exit (1) ; + } ; + + src_float_to_short_array (fneg, out, ARRAY_LEN (fneg)) ; + + for (k = 0 ; k < ARRAY_LEN (fneg) ; k++) + if (out [k] > -30000) + { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ; + exit (1) ; + } ; + + puts ("ok") ; + + return ; +} /* float_to_short_test */ + +/*------------------------------------------------------------------------------------- +*/ + +static void +short_to_float_test (void) +{ + static short input [BUFFER_LEN] ; + static short output [BUFFER_LEN] ; + static float temp [BUFFER_LEN] ; + + int k ; + + printf ("\tshort_to_float_test ............................. 
") ; + + for (k = 0 ; k < ARRAY_LEN (input) ; k++) + input [k] = (k * 0x8000) / ARRAY_LEN (input) ; + + src_short_to_float_array (input, temp, ARRAY_LEN (temp)) ; + src_float_to_short_array (temp, output, ARRAY_LEN (output)) ; + + for (k = 0 ; k < ARRAY_LEN (input) ; k++) + if (ABS (input [k] - output [k]) > 0) + { printf ("\n\n\tLine %d : index %d %d -> %d\n", __LINE__, k, input [k], output [k]) ; + exit (1) ; + } ; + + puts ("ok") ; + + return ; +} /* short_to_float_test */ + +/*===================================================================================== +*/ + +static void +float_to_int_test (void) +{ + static float fpos [] = + { 0.95, 0.99, 1.0, 1.01, 1.1, 2.0, 11.1, 111.1, 2222.2, 33333.3 + } ; + static float fneg [] = + { -0.95, -0.99, -1.0, -1.01, -1.1, -2.0, -11.1, -111.1, -2222.2, -33333.3 + } ; + + static int out [MAX (ARRAY_LEN (fpos), ARRAY_LEN (fneg))] ; + + int k ; + + printf ("\tfloat_to_int_test ............................... ") ; + + src_float_to_int_array (fpos, out, ARRAY_LEN (fpos)) ; + + for (k = 0 ; k < ARRAY_LEN (fpos) ; k++) + if (out [k] < 30000 * 0x10000) + { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ; + exit (1) ; + } ; + + src_float_to_int_array (fneg, out, ARRAY_LEN (fneg)) ; + + for (k = 0 ; k < ARRAY_LEN (fneg) ; k++) + if (out [k] > -30000 * 0x1000) + { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ; + exit (1) ; + } ; + + puts ("ok") ; + + return ; +} /* float_to_int_test */ + +/*------------------------------------------------------------------------------------- +*/ + +static void +int_to_float_test (void) +{ + static int input [BUFFER_LEN] ; + static int output [BUFFER_LEN] ; + static float temp [BUFFER_LEN] ; + + int k ; + + printf ("\tint_to_float_test ............................... 
") ; + + for (k = 0 ; k < ARRAY_LEN (input) ; k++) + input [k] = (k * 0x80000000) / ARRAY_LEN (input) ; + + src_int_to_float_array (input, temp, ARRAY_LEN (temp)) ; + src_float_to_int_array (temp, output, ARRAY_LEN (output)) ; + + for (k = 0 ; k < ARRAY_LEN (input) ; k++) + if (ABS (input [k] - output [k]) > 0) + { printf ("\n\n\tLine %d : index %d %d -> %d\n", __LINE__, k, input [k], output [k]) ; + exit (1) ; + } ; + + puts ("ok") ; + + return ; +} /* int_to_float_test */ + diff --git a/soxr/lsr-tests/misc_test.c b/soxr/lsr-tests/misc_test.c new file mode 100644 index 0000000..4baa334 --- /dev/null +++ b/soxr/lsr-tests/misc_test.c @@ -0,0 +1,175 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +#include +#include +#include + +#include + +#include "util.h" + +static void name_test (void) ; +static void error_test (void) ; +static void src_ratio_test (void) ; +static void zero_input_test (int converter) ; + +int +main (void) +{ + puts ("") ; + + printf (" version : %s\n\n", src_get_version ()) ; + + /* Current max converter is SRC_LINEAR. 
*/ + name_test () ; + + error_test () ; + + src_ratio_test () ; + + zero_input_test (SRC_ZERO_ORDER_HOLD) ; + zero_input_test (SRC_LINEAR) ; + zero_input_test (SRC_SINC_FASTEST) ; + + puts ("") ; + return 0 ; +} /* main */ + +static void +name_test (void) +{ const char *name ; + int k = 0 ; + + puts (" name_test :") ; + + while (1) + { name = src_get_name (k) ; + if (name == NULL) + break ; + printf ("\tName %d : %s\n", k, name) ; + printf ("\tDesc %d : %s\n", k, src_get_description (k)) ; + k ++ ; + } ; + + puts ("") ; + + return ; +} /* name_test */ + +/*------------------------------------------------------------------------------ +*/ + +typedef struct +{ double ratio ; + int should_pass ; +} RATIO_TEST ; + +static RATIO_TEST ratio_test [] = +{ { 1.0 / 256.1, 0 }, + { 1.0 / 256.0, 1 }, + { 1.0, 1 }, + { 256.0, 1 }, + { 256.1, 0 }, + { -1.0, 0 } +} ; + +static void +src_ratio_test (void) +{ int k ; + + puts (" src_ratio_test (SRC ratio must be in range [1/256, 256]):" ) ; + + + for (k = 0 ; k < ARRAY_LEN (ratio_test) ; k++) + { if (ratio_test [k].should_pass && src_is_valid_ratio (ratio_test [k].ratio) == 0) + { printf ("\n\nLine %d : SRC ratio %f should have passed.\n\n", __LINE__, ratio_test [k].ratio) ; + exit (1) ; + } ; + if (! ratio_test [k].should_pass && src_is_valid_ratio (ratio_test [k].ratio) != 0) + { printf ("\n\nLine %d : SRC ratio %f should not have passed.\n\n", __LINE__, ratio_test [k].ratio) ; + exit (1) ; + } ; + printf ("\t SRC ratio (%9.5f) : %s ................... ok\n", ratio_test [k].ratio, + (ratio_test [k].should_pass ? 
"pass" : "fail")) ; + } ; + + puts ("") ; + + return ; +} /* src_ratio_test */ + +static void +error_test (void) +{ const char *errorstr ; + int k, errors = 0 ; + + puts (" error_test :") ; + + for (k = 0 ; 1 ; k++) + { errorstr = src_strerror (k) ; + printf ("\t%-2d : %s\n", k, errorstr) ; + if (errorstr == NULL) + { errors ++ ; + continue ; + } ; + if (strstr (errorstr, "Placeholder.") == errorstr) + break ; + } ; + + if (errors != 0) + { printf ("\n\nLine %d : Missing error numbers above.\n\n", __LINE__) ; + exit (1) ; + } ; + + puts ("") ; + + return ; +} /* error_test */ + +static void +zero_input_test (int converter) +{ SRC_DATA data ; + SRC_STATE *state ; + float out [100] ; + int error ; + + printf (" %s (%-26s) ........ ", __func__, src_get_name (converter)) ; + fflush (stdout) ; + + if ((state = src_new (converter, 1, &error)) == NULL) + { printf ("\n\nLine %d : src_new failed : %s.\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + data.data_in = (float *) 0xdeadbeef ; + data.input_frames = 0 ; + data.data_out = out ; + data.output_frames = ARRAY_LEN (out) ; + data.end_of_input = 0 ; + data.src_ratio = 1.0 ; + + if ((error = src_process (state, &data))) + { printf ("\n\nLine %d : src_new failed : %s.\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + state = src_delete (state) ; + + puts ("ok") ; +} /* zero_input_test */ diff --git a/soxr/lsr-tests/multi_channel_test.c b/soxr/lsr-tests/multi_channel_test.c new file mode 100644 index 0000000..1ad9ced --- /dev/null +++ b/soxr/lsr-tests/multi_channel_test.c @@ -0,0 +1,364 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. 
+** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +#include "config.h" + +#include +#include +#include +#include +#include + +#include + +#include "util.h" +#define BUFFER_LEN 50000 +#define BLOCK_LEN (12) + +#define MAX_CHANNELS 10 + +static void simple_test (int converter, int channel_count, double target_snr) ; +static void process_test (int converter, int channel_count, double target_snr) ; +static void callback_test (int converter, int channel_count, double target_snr) ; + +int +main (void) +{ double target ; + int k ; + + puts ("\n Zero Order Hold interpolator :") ; + target = 38.0 ; + for (k = 1 ; k <= 3 ; k++) + { simple_test (SRC_ZERO_ORDER_HOLD, k, target) ; + process_test (SRC_ZERO_ORDER_HOLD, k, target) ; + callback_test (SRC_ZERO_ORDER_HOLD, k, target) ; + } ; + + puts ("\n Linear interpolator :") ; + target = 79.0 ; + for (k = 1 ; k <= 3 ; k++) + { simple_test (SRC_LINEAR, k, target) ; + process_test (SRC_LINEAR, k, target) ; + callback_test (SRC_LINEAR, k, target) ; + } ; + + puts ("\n Sinc interpolator :") ; + target = 100.0 ; + for (k = 1 ; k <= MAX_CHANNELS ; k++) + { simple_test (SRC_SINC_FASTEST, k, target) ; + process_test (SRC_SINC_FASTEST, k, target) ; + callback_test (SRC_SINC_FASTEST, k, target) ; + } ; + + puts ("") ; + + return 0 ; +} /* main */ + +/*============================================================================== +*/ + +static float input_serial [BUFFER_LEN * MAX_CHANNELS] ; +static float input_interleaved [BUFFER_LEN * MAX_CHANNELS] ; +static float output_interleaved [BUFFER_LEN * MAX_CHANNELS] ; +static 
float output_serial [BUFFER_LEN * MAX_CHANNELS] ; + +static void +simple_test (int converter, int channel_count, double target_snr) +{ SRC_DATA src_data ; + + double freq, snr ; + int ch, error, frames ; + + printf ("\t%-22s (%2d channel%c) ............ ", "simple_test", channel_count, channel_count > 1 ? 's' : ' ') ; + fflush (stdout) ; + + assert (channel_count <= MAX_CHANNELS) ; + + memset (input_serial, 0, sizeof (input_serial)) ; + memset (input_interleaved, 0, sizeof (input_interleaved)) ; + memset (output_interleaved, 0, sizeof (output_interleaved)) ; + memset (output_serial, 0, sizeof (output_serial)) ; + + frames = BUFFER_LEN ; + + /* Calculate channel_count separate windowed sine waves. */ + for (ch = 0 ; ch < channel_count ; ch++) + { freq = (200.0 + 33.333333333 * ch) / 44100.0 ; + gen_windowed_sines (1, &freq, 1.0, input_serial + ch * frames, frames) ; + } ; + + /* Interleave the data in preparation for SRC. */ + interleave_data (input_serial, input_interleaved, frames, channel_count) ; + + /* Choose a converstion ratio <= 1.0. */ + src_data.src_ratio = 0.95 ; + + src_data.data_in = input_interleaved ; + src_data.input_frames = frames ; + + src_data.data_out = output_interleaved ; + src_data.output_frames = frames ; + + if ((error = src_simple (&src_data, converter, channel_count))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + if (fabs (src_data.output_frames_gen - src_data.src_ratio * src_data.input_frames) > 2) + { printf ("\n\nLine %d : bad output data length %ld should be %d.\n", __LINE__, + src_data.output_frames_gen, (int) floor (src_data.src_ratio * src_data.input_frames)) ; + printf ("\tsrc_ratio : %.4f\n", src_data.src_ratio) ; + printf ("\tinput_len : %ld\n", src_data.input_frames) ; + printf ("\toutput_len : %ld\n\n", src_data.output_frames_gen) ; + exit (1) ; + } ; + + /* De-interleave data so SNR can be calculated for each channel. 
*/ + deinterleave_data (output_interleaved, output_serial, frames, channel_count) ; + + for (ch = 0 ; ch < channel_count ; ch++) + { snr = calculate_snr (output_serial + ch * frames, frames, 1) ; + if (snr < target_snr) + { printf ("\n\nLine %d: channel %d snr %f should be %f\n", __LINE__, ch, snr, target_snr) ; + save_oct_float ("output.dat", input_serial, channel_count * frames, output_serial, channel_count * frames) ; + exit (1) ; + } ; + } ; + + puts ("ok") ; + + return ; +} /* simple_test */ + +/*============================================================================== +*/ + +static void +process_test (int converter, int channel_count, double target_snr) +{ SRC_STATE *src_state ; + SRC_DATA src_data ; + + double freq, snr ; + int ch, error, frames, current_in, current_out ; + + printf ("\t%-22s (%2d channel%c) ............ ", "process_test", channel_count, channel_count > 1 ? 's' : ' ') ; + fflush (stdout) ; + + assert (channel_count <= MAX_CHANNELS) ; + + memset (input_serial, 0, sizeof (input_serial)) ; + memset (input_interleaved, 0, sizeof (input_interleaved)) ; + memset (output_interleaved, 0, sizeof (output_interleaved)) ; + memset (output_serial, 0, sizeof (output_serial)) ; + + frames = BUFFER_LEN ; + + /* Calculate channel_count separate windowed sine waves. */ + for (ch = 0 ; ch < channel_count ; ch++) + { freq = (400.0 + 11.333333333 * ch) / 44100.0 ; + gen_windowed_sines (1, &freq, 1.0, input_serial + ch * frames, frames) ; + } ; + + /* Interleave the data in preparation for SRC. */ + interleave_data (input_serial, input_interleaved, frames, channel_count) ; + + /* Perform sample rate conversion. */ + if ((src_state = src_new (converter, channel_count, &error)) == NULL) + { printf ("\n\nLine %d : src_new() failed : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + src_data.end_of_input = 0 ; /* Set this later. */ + + /* Choose a converstion ratio < 1.0. 
*/ + src_data.src_ratio = 0.95 ; + + src_data.data_in = input_interleaved ; + src_data.data_out = output_interleaved ; + + current_in = current_out = 0 ; + + while (1) + { src_data.input_frames = MAX (MIN (BLOCK_LEN, frames - current_in), 0) ; + src_data.output_frames = MAX (MIN (BLOCK_LEN, frames - current_out), 0) ; + + if ((error = src_process (src_state, &src_data))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + if (src_data.end_of_input && src_data.output_frames_gen == 0) + break ; + + current_in += src_data.input_frames_used ; + current_out += src_data.output_frames_gen ; + + src_data.data_in += src_data.input_frames_used * channel_count ; + src_data.data_out += src_data.output_frames_gen * channel_count ; + + src_data.end_of_input = (current_in >= frames) ? 1 : 0 ; + } ; + + src_state = src_delete (src_state) ; + + if (fabs (current_out - src_data.src_ratio * current_in) > 2) + { printf ("\n\nLine %d : bad output data length %d should be %d.\n", __LINE__, + current_out, (int) floor (src_data.src_ratio * current_in)) ; + printf ("\tsrc_ratio : %.4f\n", src_data.src_ratio) ; + printf ("\tinput_len : %d\n", frames) ; + printf ("\toutput_len : %d\n\n", current_out) ; + exit (1) ; + } ; + + /* De-interleave data so SNR can be calculated for each channel. 
*/ + deinterleave_data (output_interleaved, output_serial, frames, channel_count) ; + + for (ch = 0 ; ch < channel_count ; ch++) + { snr = calculate_snr (output_serial + ch * frames, frames, 1) ; + if (snr < target_snr) + { printf ("\n\nLine %d: channel %d snr %f should be %f\n", __LINE__, ch, snr, target_snr) ; + save_oct_float ("output.dat", input_serial, channel_count * frames, output_serial, channel_count * frames) ; + exit (1) ; + } ; + } ; + + puts ("ok") ; + + return ; +} /* process_test */ + +/*============================================================================== +*/ + +typedef struct +{ int channels ; + long total_frames ; + long current_frame ; + float *data ; +} TEST_CB_DATA ; + +static long +test_callback_func (void *cb_data, float **data) +{ TEST_CB_DATA *pcb_data ; + + long frames ; + + if ((pcb_data = cb_data) == NULL) + return 0 ; + + if (data == NULL) + return 0 ; + + *data = pcb_data->data + (pcb_data->current_frame * pcb_data->channels) ; + + if (pcb_data->total_frames - pcb_data->current_frame < BLOCK_LEN) + frames = pcb_data->total_frames - pcb_data->current_frame ; + else + frames = BLOCK_LEN ; + + pcb_data->current_frame += frames ; + + return frames ; +} /* test_callback_func */ + +static void +callback_test (int converter, int channel_count, double target_snr) +{ TEST_CB_DATA test_callback_data ; + SRC_STATE *src_state = NULL ; + + double freq, snr, src_ratio ; + int ch, error, frames, read_total, read_count ; + + printf ("\t%-22s (%2d channel%c) ............ ", "callback_test", channel_count, channel_count > 1 ? 
's' : ' ') ; + fflush (stdout) ; + + assert (channel_count <= MAX_CHANNELS) ; + + memset (input_serial, 0, sizeof (input_serial)) ; + memset (input_interleaved, 0, sizeof (input_interleaved)) ; + memset (output_interleaved, 0, sizeof (output_interleaved)) ; + memset (output_serial, 0, sizeof (output_serial)) ; + memset (&test_callback_data, 0, sizeof (test_callback_data)) ; + + frames = BUFFER_LEN ; + + /* Calculate channel_count separate windowed sine waves. */ + for (ch = 0 ; ch < channel_count ; ch++) + { freq = (200.0 + 33.333333333 * ch) / 44100.0 ; + gen_windowed_sines (1, &freq, 1.0, input_serial + ch * frames, frames) ; + } ; + + /* Interleave the data in preparation for SRC. */ + interleave_data (input_serial, input_interleaved, frames, channel_count) ; + + /* Perform sample rate conversion. */ + src_ratio = 0.95 ; + test_callback_data.channels = channel_count ; + test_callback_data.total_frames = frames ; + test_callback_data.current_frame = 0 ; + test_callback_data.data = input_interleaved ; + + if ((src_state = src_callback_new (test_callback_func, converter, channel_count, &error, &test_callback_data)) == NULL) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + read_total = 0 ; + while (read_total < frames) + { read_count = src_callback_read (src_state, src_ratio, frames - read_total, output_interleaved + read_total * channel_count) ; + + if (read_count <= 0) + break ; + + read_total += read_count ; + } ; + + if ((error = src_error (src_state)) != 0) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + src_state = src_delete (src_state) ; + + if (fabs (read_total - src_ratio * frames) > 2) + { printf ("\n\nLine %d : bad output data length %d should be %d.\n", __LINE__, + read_total, (int) floor (src_ratio * frames)) ; + printf ("\tsrc_ratio : %.4f\n", src_ratio) ; + printf ("\tinput_len : %d\n", frames) ; + printf ("\toutput_len : %d\n\n", read_total) ; + exit (1) ; + } ; 
+ + /* De-interleave data so SNR can be calculated for each channel. */ + deinterleave_data (output_interleaved, output_serial, frames, channel_count) ; + + for (ch = 0 ; ch < channel_count ; ch++) + { snr = calculate_snr (output_serial + ch * frames, frames, 1) ; + if (snr < target_snr) + { printf ("\n\nLine %d: channel %d snr %f should be %f\n", __LINE__, ch, snr, target_snr) ; + save_oct_float ("output.dat", input_serial, channel_count * frames, output_serial, channel_count * frames) ; + exit (1) ; + } ; + } ; + + puts ("ok") ; + + return ; +} /* callback_test */ + diff --git a/soxr/lsr-tests/multichan_throughput_test.c b/soxr/lsr-tests/multichan_throughput_test.c new file mode 100644 index 0000000..523139e --- /dev/null +++ b/soxr/lsr-tests/multichan_throughput_test.c @@ -0,0 +1,216 @@ +/* +** Copyright (C) 2008-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
+*/ + +#include +#include +#include +#include +#include + +#include + +#include "config.h" + +#include "util.h" +#include "float_cast.h" + +#define BUFFER_LEN (1<<17) + +static float input [BUFFER_LEN] ; +static float output [BUFFER_LEN] ; + +static long +throughput_test (int converter, int channels, long best_throughput) +{ SRC_DATA src_data ; + clock_t start_time, clock_time ; + double duration ; + long total_frames = 0, throughput ; + int error ; + + printf (" %-30s %2d ", src_get_name (converter), channels) ; + fflush (stdout) ; + + src_data.data_in = input ; + src_data.input_frames = ARRAY_LEN (input) / channels ; + + src_data.data_out = output ; + src_data.output_frames = ARRAY_LEN (output) / channels ; + + src_data.src_ratio = 0.99 ; + + sleep (2) ; + + start_time = clock () ; + + do + { + if ((error = src_simple (&src_data, converter, channels)) != 0) + { puts (src_strerror (error)) ; + exit (1) ; + } ; + + total_frames += src_data.output_frames_gen ; + + clock_time = clock () - start_time ; + duration = (1.0 * clock_time) / CLOCKS_PER_SEC ; + } + while (duration < 5.0) ; + + if (src_data.input_frames_used != src_data.input_frames) + { printf ("\n\nLine %d : input frames used %ld should be %ld\n", __LINE__, src_data.input_frames_used, src_data.input_frames) ; + exit (1) ; + } ; + + if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2) + { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ; + printf (" input len : %d\n", ARRAY_LEN (input) / channels) ; + printf (" output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen, + floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ; + exit (1) ; + } ; + + throughput = lrint (floor (total_frames / duration)) ; + + if (best_throughput == 0) + { best_throughput = MAX (throughput, best_throughput) ; + printf ("%5.2f %10ld\n", duration, throughput) ; + } + else + { best_throughput = MAX (throughput, best_throughput) ; + printf ("%5.2f 
%10ld %10ld\n", duration, throughput, best_throughput) ; + } + + return best_throughput ; +} /* throughput_test */ + +static void +single_run (void) +{ const int max_channels = 10 ; + int k ; + + printf ("\n CPU name : %s\n", get_cpu_name ()) ; + + puts ( + "\n" + " Converter Channels Duration Throughput\n" + " ---------------------------------------------------------------------" + ) ; + + for (k = 1 ; k <= max_channels / 2 ; k++) + throughput_test (SRC_SINC_FASTEST, k, 0) ; + + puts ("") ; + for (k = 1 ; k <= max_channels / 2 ; k++) + throughput_test (SRC_SINC_MEDIUM_QUALITY, k, 0) ; + + puts ("") ; + for (k = 1 ; k <= max_channels ; k++) + throughput_test (SRC_SINC_BEST_QUALITY, k, 0) ; + + puts ("") ; + return ; +} /* single_run */ + +static void +multi_run (int run_count) +{ int k, ch ; + + printf ("\n CPU name : %s\n", get_cpu_name ()) ; + + puts ( + "\n" + " Converter Channels Duration Throughput Best Throughput\n" + " ----------------------------------------------------------------------------------------" + ) ; + + for (ch = 1 ; ch <= 5 ; ch++) + { long sinc_fastest = 0, sinc_medium = 0, sinc_best = 0 ; + + for (k = 0 ; k < run_count ; k++) + { sinc_fastest = throughput_test (SRC_SINC_FASTEST, ch, sinc_fastest) ; + sinc_medium = throughput_test (SRC_SINC_MEDIUM_QUALITY, ch, sinc_medium) ; + sinc_best = throughput_test (SRC_SINC_BEST_QUALITY, ch, sinc_best) ; + + puts ("") ; + + /* Let the CPU cool down. We might be running on a laptop. 
*/ + sleep (10) ; + } ; + + puts ( + "\n" + " Converter Best Throughput\n" + " ------------------------------------------------" + ) ; + + printf (" %-30s %10ld\n", src_get_name (SRC_SINC_FASTEST), sinc_fastest) ; + printf (" %-30s %10ld\n", src_get_name (SRC_SINC_MEDIUM_QUALITY), sinc_medium) ; + printf (" %-30s %10ld\n", src_get_name (SRC_SINC_BEST_QUALITY), sinc_best) ; + } ; + + puts ("") ; +} /* multi_run */ + +static void +usage_exit (const char * argv0) +{ const char * cptr ; + + if ((cptr = strrchr (argv0, '/')) != NULL) + argv0 = cptr ; + + printf ( + "Usage :\n" + " %s - Single run of the throughput test.\n" + " %s --best-of N - Do N runs of test a print bext result.\n" + "\n", + argv0, argv0) ; + + exit (0) ; +} /* usage_exit */ + +int +main (int argc, char ** argv) +{ double freq ; + + memset (input, 0, sizeof (input)) ; + freq = 0.01 ; + gen_windowed_sines (1, &freq, 1.0, input, BUFFER_LEN) ; + + if (argc == 1) + single_run () ; + else if (argc == 3 && strcmp (argv [1], "--best-of") == 0) + { int run_count = atoi (argv [2]) ; + + if (run_count < 1 || run_count > 20) + { printf ("Please be sensible. Run count should be in range (1, 10].\n") ; + exit (1) ; + } ; + + multi_run (run_count) ; + } + else + usage_exit (argv [0]) ; + + puts ( + " Duration is in seconds.\n" + " Throughput is in frames/sec (more is better).\n" + ) ; + + return 0 ; +} /* main */ + diff --git a/soxr/lsr-tests/reset_test.c b/soxr/lsr-tests/reset_test.c new file mode 100644 index 0000000..40485c2 --- /dev/null +++ b/soxr/lsr-tests/reset_test.c @@ -0,0 +1,238 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. 
+** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +#include +#include + +#include + +#include "util.h" + +#define BUFFER_LEN 2048 +#define CB_READ_LEN 256 + +static void process_reset_test (int converter) ; +static void callback_reset_test (int converter) ; + +static float data_one [BUFFER_LEN] ; +static float data_zero [BUFFER_LEN] ; + +int +main (void) +{ + puts ("") ; + + process_reset_test (SRC_ZERO_ORDER_HOLD) ; + process_reset_test (SRC_LINEAR) ; + process_reset_test (SRC_SINC_FASTEST) ; + + callback_reset_test (SRC_ZERO_ORDER_HOLD) ; + callback_reset_test (SRC_LINEAR) ; + callback_reset_test (SRC_SINC_FASTEST) ; + + puts ("") ; + + return 0 ; +} /* main */ + +static void +process_reset_test (int converter) +{ static float output [BUFFER_LEN] ; + + SRC_STATE *src_state ; + SRC_DATA src_data ; + int k, error ; + + printf ("\tprocess_reset_test (%-28s) ....... ", src_get_name (converter)) ; + fflush (stdout) ; + + for (k = 0 ; k < BUFFER_LEN ; k++) + { data_one [k] = 1.0 ; + data_zero [k] = 0.0 ; + } ; + + /* Get a converter. */ + if ((src_state = src_new (converter, 1, &error)) == NULL) + { printf ("\n\nLine %d : src_new() failed : %s.\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + /* Process a bunch of 1.0 valued samples. 
*/ + src_data.data_in = data_one ; + src_data.data_out = output ; + src_data.input_frames = BUFFER_LEN ; + src_data.output_frames = BUFFER_LEN ; + src_data.src_ratio = 0.9 ; + src_data.end_of_input = 1 ; + + if ((error = src_process (src_state, &src_data)) != 0) + { printf ("\n\nLine %d : src_simple () returned error : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + /* Reset the state of the converter.*/ + src_reset (src_state) ; + + /* Now process some zero data. */ + src_data.data_in = data_zero ; + src_data.data_out = output ; + src_data.input_frames = BUFFER_LEN ; + src_data.output_frames = BUFFER_LEN ; + src_data.src_ratio = 0.9 ; + src_data.end_of_input = 1 ; + + if ((error = src_process (src_state, &src_data)) != 0) + { printf ("\n\nLine %d : src_simple () returned error : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + /* Finally make sure that the output data is zero ie reset was sucessful. */ + for (k = 0 ; k < BUFFER_LEN / 2 ; k++) + if (output [k] != 0.0) + { printf ("\n\nLine %d : output [%d] should be 0.0, is %f.\n", __LINE__, k, output [k]) ; + exit (1) ; + } ; + + /* Make sure that this function has been exported. */ + src_set_ratio (src_state, 1.0) ; + + /* Delete converter. 
*/ + src_state = src_delete (src_state) ; + + puts ("ok") ; +} /* process_reset_test */ + +/*============================================================================== +*/ + +typedef struct +{ int channels ; + long count, total ; + float *data ; +} TEST_CB_DATA ; + +static long +test_callback_func (void *cb_data, float **data) +{ TEST_CB_DATA *pcb_data ; + + long frames ; + + if ((pcb_data = cb_data) == NULL) + return 0 ; + + if (data == NULL) + return 0 ; + + if (pcb_data->total - pcb_data->count > 0) + frames = pcb_data->total - pcb_data->count ; + else + frames = 0 ; + + *data = pcb_data->data + pcb_data->count ; + pcb_data->count += frames ; + + return frames ; +} /* test_callback_func */ + +static void +callback_reset_test (int converter) +{ static TEST_CB_DATA test_callback_data ; + + static float output [BUFFER_LEN] ; + + SRC_STATE *src_state ; + + double src_ratio = 1.1 ; + long read_count, read_total ; + int k, error ; + + printf ("\tcallback_reset_test (%-28s) ....... ", src_get_name (converter)) ; + fflush (stdout) ; + + for (k = 0 ; k < ARRAY_LEN (data_one) ; k++) + { data_one [k] = 1.0 ; + data_zero [k] = 0.0 ; + } ; + + if ((src_state = src_callback_new (test_callback_func, converter, 1, &error, &test_callback_data)) == NULL) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + /* Process a bunch of 1.0 valued samples. */ + test_callback_data.channels = 1 ; + test_callback_data.count = 0 ; + test_callback_data.total = ARRAY_LEN (data_one) ; + test_callback_data.data = data_one ; + + read_total = 0 ; + do + { read_count = (ARRAY_LEN (output) - read_total > CB_READ_LEN) ? CB_READ_LEN : ARRAY_LEN (output) - read_total ; + read_count = src_callback_read (src_state, src_ratio, read_count, output + read_total) ; + read_total += read_count ; + } + while (read_count > 0) ; + + /* Check for errors. 
*/ + if ((error = src_error (src_state)) != 0) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + /* Reset the state of the converter.*/ + src_reset (src_state) ; + + /* Process a bunch of 0.0 valued samples. */ + test_callback_data.channels = 1 ; + test_callback_data.count = 0 ; + test_callback_data.total = ARRAY_LEN (data_zero) ; + test_callback_data.data = data_zero ; + + /* Now process some zero data. */ + read_total = 0 ; + do + { read_count = (ARRAY_LEN (output) - read_total > CB_READ_LEN) ? CB_READ_LEN : ARRAY_LEN (output) - read_total ; + read_count = src_callback_read (src_state, src_ratio, read_count, output + read_total) ; + read_total += read_count ; + } + while (read_count > 0) ; + + /* Check for errors. */ + if ((error = src_error (src_state)) != 0) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + /* Finally make sure that the output data is zero ie reset was sucessful. */ + for (k = 0 ; k < BUFFER_LEN / 2 ; k++) + if (output [k] != 0.0) + { printf ("\n\nLine %d : output [%d] should be 0.0, is %f.\n\n", __LINE__, k, output [k]) ; + save_oct_float ("output.dat", data_one, ARRAY_LEN (data_one), output, ARRAY_LEN (output)) ; + exit (1) ; + } ; + + /* Make sure that this function has been exported. */ + src_set_ratio (src_state, 1.0) ; + + /* Delete converter. */ + src_state = src_delete (src_state) ; + + puts ("ok") ; +} /* callback_reset_test */ + + diff --git a/soxr/lsr-tests/simple_test.c b/soxr/lsr-tests/simple_test.c new file mode 100644 index 0000000..91dcde3 --- /dev/null +++ b/soxr/lsr-tests/simple_test.c @@ -0,0 +1,117 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. 
+** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +#include +#include +#include +#include + +#include + +#include "util.h" + +#define BUFFER_LEN 2048 + +static void simple_test (int converter, double ratio) ; + +int +main (void) +{ static double src_ratios [] = + { 1.0001, 0.099, 0.1, 0.33333333, 0.789, 1.9, 3.1, 9.9 + } ; + + int k ; + + puts ("") ; + + puts (" Zero Order Hold interpolator :") ; + for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++) + simple_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ; + + puts (" Linear interpolator :") ; + for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++) + simple_test (SRC_LINEAR, src_ratios [k]) ; + + puts (" Sinc interpolator :") ; + for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++) + simple_test (SRC_SINC_FASTEST, src_ratios [k]) ; + + puts ("") ; + + return 0 ; +} /* main */ + +static void +simple_test (int converter, double src_ratio) +{ static float input [BUFFER_LEN], output [BUFFER_LEN] ; + + SRC_DATA src_data ; + + int input_len, output_len, error, terminate ; + + printf ("\tsimple_test (SRC ratio = %6.4f) ........... ", src_ratio) ; + fflush (stdout) ; + + /* Calculate maximun input and output lengths. */ + if (src_ratio >= 1.0) + { output_len = BUFFER_LEN ; + input_len = (int) floor (BUFFER_LEN / src_ratio) ; + } + else + { input_len = BUFFER_LEN ; + output_len = (int) floor (BUFFER_LEN * src_ratio) ; + } ; + + /* Reduce input_len by 10 so output is longer than necessary. 
*/ + input_len -= 10 ; + + if (output_len > BUFFER_LEN) + { printf ("\n\nLine %d : output_len > BUFFER_LEN\n\n", __LINE__) ; + exit (1) ; + } ; + + memset (&src_data, 0, sizeof (src_data)) ; + + src_data.data_in = input ; + src_data.input_frames = input_len ; + + src_data.src_ratio = src_ratio ; + + src_data.data_out = output ; + src_data.output_frames = BUFFER_LEN ; + + if ((error = src_simple (&src_data, converter, 1))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + terminate = (int) ceil ((src_ratio >= 1.0) ? src_ratio : 1.0 / src_ratio) ; + + if (fabs (src_data.output_frames_gen - src_ratio * input_len) > 2 * terminate) + { printf ("\n\nLine %d : bad output data length %ld should be %d.\n", __LINE__, + src_data.output_frames_gen, (int) floor (src_ratio * input_len)) ; + printf ("\tsrc_ratio : %.4f\n", src_ratio) ; + printf ("\tinput_len : %d\n\toutput_len : %d\n\n", input_len, output_len) ; + exit (1) ; + } ; + + puts ("ok") ; + + return ; +} /* simple_test */ + diff --git a/soxr/lsr-tests/sndfile-resample.c b/soxr/lsr-tests/sndfile-resample.c new file mode 100644 index 0000000..63d179c --- /dev/null +++ b/soxr/lsr-tests/sndfile-resample.c @@ -0,0 +1,332 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. 
+** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +#include "config.h" + +#include +#include +#include +#include +#include + +#if (HAVE_SNDFILE) + +#include +#include + +#define DEFAULT_CONVERTER SRC_SINC_MEDIUM_QUALITY + +#define BUFFER_LEN 4096 /*-(1<<16)-*/ + +static void usage_exit (const char *progname) ; +static sf_count_t sample_rate_convert (SNDFILE *infile, SNDFILE *outfile, int converter, double src_ratio, int channels, double * gain) ; +static double apply_gain (float * data, long frames, int channels, double max, double gain) ; + +int +main (int argc, char *argv []) +{ SNDFILE *infile, *outfile = NULL ; + SF_INFO sfinfo ; + + sf_count_t count ; + double src_ratio = -1.0, gain = 1.0 ; + int new_sample_rate = -1, k, converter, max_speed = SF_FALSE ; + + if (argc == 2 && strcmp (argv [1], "--version") == 0) + { char buffer [64], *cptr ; + + if ((cptr = strrchr (argv [0], '/')) != NULL) + argv [0] = cptr + 1 ; + if ((cptr = strrchr (argv [0], '\\')) != NULL) + argv [0] = cptr + 1 ; + + sf_command (NULL, SFC_GET_LIB_VERSION, buffer, sizeof (buffer)) ; + + printf ("%s (%s,%s)\n", argv [0], src_get_version (), buffer) ; + exit (0) ; + } ; + + if (argc != 5 && argc != 7 && argc != 8) + usage_exit (argv [0]) ; + + /* Set default converter. 
*/ + converter = DEFAULT_CONVERTER ; + + for (k = 1 ; k < argc - 2 ; k++) + { if (strcmp (argv [k], "--max-speed") == 0) + max_speed = SF_TRUE ; + else if (strcmp (argv [k], "-to") == 0) + { k ++ ; + new_sample_rate = atoi (argv [k]) ; + } + else if (strcmp (argv [k], "-by") == 0) + { k ++ ; + src_ratio = atof (argv [k]) ; + } + else if (strcmp (argv [k], "-c") == 0) + { k ++ ; + converter = atoi (argv [k]) ; + } + else + usage_exit (argv [0]) ; + } ; + + if (new_sample_rate <= 0 && src_ratio <= 0.0) + usage_exit (argv [0]) ; + + if (src_get_name (converter) == NULL) + { printf ("Error : bad converter number.\n") ; + usage_exit (argv [0]) ; + } ; + + if (strcmp (argv [argc - 2], argv [argc - 1]) == 0) + { printf ("Error : input and output file names are the same.\n") ; + exit (1) ; + } ; + + if ((infile = sf_open (argv [argc - 2], SFM_READ, &sfinfo)) == NULL) + { printf ("Error : Not able to open input file '%s'\n", argv [argc - 2]) ; + exit (1) ; + } ; + + printf ("Input File : %s\n", argv [argc - 2]) ; + printf ("Sample Rate : %d\n", sfinfo.samplerate) ; + printf ("Input Frames : %ld\n\n", (long) sfinfo.frames) ; + + if (new_sample_rate > 0) + { src_ratio = (1.0 * new_sample_rate) / sfinfo.samplerate ; + sfinfo.samplerate = new_sample_rate ; + } + else if (src_is_valid_ratio (src_ratio)) + sfinfo.samplerate = (int) floor (sfinfo.samplerate * src_ratio) ; + else + { printf ("Not able to determine new sample rate. Exiting.\n") ; + sf_close (infile) ; + exit (1) ; + } ; + + if (fabs (src_ratio - 1.0) < 1e-20) + { printf ("Target samplerate and input samplerate are the same. Exiting.\n") ; + sf_close (infile) ; + exit (0) ; + } ; + + printf ("SRC Ratio : %f\n", src_ratio) ; + printf ("Converter : %s\n\n", src_get_name (converter)) ; + + if (src_is_valid_ratio (src_ratio) == 0) + { printf ("Error : Sample rate change out of valid range.\n") ; + sf_close (infile) ; + exit (1) ; + } ; + + /* Delete the output file length to zero if already exists. 
*/ + remove (argv [argc - 1]) ; + + printf ("Output file : %s\n", argv [argc - 1]) ; + printf ("Sample Rate : %d\n", sfinfo.samplerate) ; + + do + { sf_close (outfile) ; + + if ((outfile = sf_open (argv [argc - 1], SFM_WRITE, &sfinfo)) == NULL) + { printf ("Error : Not able to open output file '%s'\n", argv [argc - 1]) ; + sf_close (infile) ; + exit (1) ; + } ; + + if (max_speed) + { /* This is mainly for the comparison program tests/src-evaluate.c */ + sf_command (outfile, SFC_SET_ADD_PEAK_CHUNK, NULL, SF_FALSE) ; + } + else + { /* Update the file header after every write. */ + sf_command (outfile, SFC_SET_UPDATE_HEADER_AUTO, NULL, SF_TRUE) ; + } ; + + sf_command (outfile, SFC_SET_CLIPPING, NULL, SF_TRUE) ; + + count = sample_rate_convert (infile, outfile, converter, src_ratio, sfinfo.channels, &gain) ; + } + while (count < 0) ; + + printf ("Output Frames : %ld\n\n", (long) count) ; + + sf_close (infile) ; + sf_close (outfile) ; + + return 0 ; +} /* main */ + +/*============================================================================== +*/ + +static sf_count_t +sample_rate_convert (SNDFILE *infile, SNDFILE *outfile, int converter, double src_ratio, int channels, double * gain) +{ static float input [BUFFER_LEN] ; + static float output [BUFFER_LEN] ; + + SRC_STATE *src_state ; + SRC_DATA src_data ; + int error ; + double max = 0.0 ; + sf_count_t output_count = 0 ; + + sf_seek (infile, 0, SEEK_SET) ; + sf_seek (outfile, 0, SEEK_SET) ; + + /* Initialize the sample rate converter. */ + if ((src_state = src_new (converter, channels, &error)) == NULL) + { printf ("\n\nError : src_new() failed : %s.\n\n", src_strerror (error)) ; + exit (1) ; + } ; + + src_data.end_of_input = 0 ; /* Set this later. */ + + /* Start with zero to force load in while loop. 
*/ + src_data.input_frames = 0 ; + src_data.data_in = input ; + + src_data.src_ratio = src_ratio ; + + src_data.data_out = output ; + src_data.output_frames = BUFFER_LEN /channels ; + + while (1) + { + /* If the input buffer is empty, refill it. */ + if (src_data.input_frames == 0) + { src_data.input_frames = sf_readf_float (infile, input, BUFFER_LEN / channels) ; + src_data.data_in = input ; + + /* The last read will not be a full buffer, so snd_of_input. */ + if (src_data.input_frames < BUFFER_LEN / channels) + src_data.end_of_input = SF_TRUE ; + } ; + + if ((error = src_process (src_state, &src_data))) + { printf ("\nError : %s\n", src_strerror (error)) ; + exit (1) ; + } ; + + /* Terminate if done. */ + if (src_data.end_of_input && src_data.output_frames_gen == 0) + break ; + + max = apply_gain (src_data.data_out, src_data.output_frames_gen, channels, max, *gain) ; + + /* Write output. */ + sf_writef_float (outfile, output, src_data.output_frames_gen) ; + output_count += src_data.output_frames_gen ; + + src_data.data_in += src_data.input_frames_used * channels ; + src_data.input_frames -= src_data.input_frames_used ; + } ; + + src_state = src_delete (src_state) ; + + if (max > 1.0) + { *gain = 1.0 / max ; + printf ("\nOutput has clipped. 
Restarting conversion to prevent clipping.\n\n") ; + return -1 ; + } ; + + return output_count ; +} /* sample_rate_convert */ + +static double +apply_gain (float * data, long frames, int channels, double max, double gain) +{ + long k ; + + for (k = 0 ; k < frames * channels ; k++) + { data [k] *= gain ; + + if (fabs (data [k]) > max) + max = fabs (data [k]) ; + } ; + + return max ; +} /* apply_gain */ + +static void +usage_exit (const char *progname) +{ char lsf_ver [128] ; + const char *cptr ; + int k ; + + if ((cptr = strrchr (progname, '/')) != NULL) + progname = cptr + 1 ; + + if ((cptr = strrchr (progname, '\\')) != NULL) + progname = cptr + 1 ; + + + sf_command (NULL, SFC_GET_LIB_VERSION, lsf_ver, sizeof (lsf_ver)) ; + + printf ("\n" + " A Sample Rate Converter using libsndfile for file I/O and Secret \n" + " Rabbit Code (aka libsamplerate) for performing the conversion.\n" + " It works on any file format supported by libsndfile with any \n" + " number of channels (limited only by host memory).\n" + "\n" + " %s\n" + " %s\n" + "\n" + " Usage : \n" + " %s -to [-c ] \n" + " %s -by [-c ] \n" + "\n", src_get_version (), lsf_ver, progname, progname) ; + + puts ( + " The optional -c argument allows the converter type to be chosen from\n" + " the following list :" + "\n" + ) ; + + for (k = 0 ; (cptr = src_get_name (k)) != NULL ; k++) + printf (" %d : %s%s\n", k, cptr, k == DEFAULT_CONVERTER ? " (default)" : "") ; + + puts ("") ; + + exit (1) ; +} /* usage_exit */ + +/*============================================================================== +*/ + +#else /* (HAVE_SNFILE == 0) */ + +/* Alternative main function when libsndfile is not available. 
*/ + +int +main (void) +{ puts ( + "\n" + "****************************************************************\n" + " This example program was compiled without libsndfile \n" + " (http://www.mega-nerd.com/libsndfile/).\n" + " It is therefore completely broken and non-functional.\n" + "****************************************************************\n" + "\n" + ) ; + + return 0 ; +} /* main */ + +#endif + diff --git a/soxr/lsr-tests/snr_bw_test.c b/soxr/lsr-tests/snr_bw_test.c new file mode 100644 index 0000000..55130b4 --- /dev/null +++ b/soxr/lsr-tests/snr_bw_test.c @@ -0,0 +1,401 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
+*/ + +#include "config.h" + +#include +#include +#include +#include +#include + +#if (HAVE_FFTW3) + +#include + +#include "util.h" + +#define BUFFER_LEN 50000 +#define MAX_FREQS 4 +#define MAX_RATIOS 6 +#define MAX_SPEC_LEN (1<<15) + +#ifndef M_PI +#define M_PI 3.14159265358979323846264338 +#endif + +enum +{ BOOLEAN_FALSE = 0, + BOOLEAN_TRUE = 1 +} ; + +typedef struct +{ int freq_count ; + double freqs [MAX_FREQS] ; + + double src_ratio ; + int pass_band_peaks ; + + double snr ; + double peak_value ; +} SINGLE_TEST ; + +typedef struct +{ int converter ; + int tests ; + int do_bandwidth_test ; + SINGLE_TEST test_data [10] ; +} CONVERTER_TEST ; + +static double snr_test (SINGLE_TEST *snr_test_data, int number, int converter, int verbose) ; +static double find_peak (float *output, int output_len) ; +static double bandwidth_test (int converter, int verbose) ; + +int +main (int argc, char *argv []) +{ CONVERTER_TEST snr_test_data [] = + { + { SRC_ZERO_ORDER_HOLD, + 8, + BOOLEAN_FALSE, + { { 1, { 0.01111111111 }, 3.0, 1, 28.0, 1.0 }, + { 1, { 0.01111111111 }, 0.6, 1, 36.0, 1.0 }, + { 1, { 0.01111111111 }, 0.3, 1, 36.0, 1.0 }, + { 1, { 0.01111111111 }, 1.0, 1, 150.0, 1.0 }, + { 1, { 0.01111111111 }, 1.001, 1, 38.0, 1.0 }, + { 2, { 0.011111, 0.324 }, 1.9999, 2, 14.0, .96 }, + { 2, { 0.012345, 0.457 }, 0.456789, 1, 12.0, .96 }, + { 1, { 0.3511111111 }, 1.33, 1, 10.0, 1.0 } + } + }, + + { SRC_LINEAR, + 8, + BOOLEAN_FALSE, + { { 1, { 0.01111111111 }, 3.0, 1, 73.0, 1.0 }, + { 1, { 0.01111111111 }, 0.6, 1, 73.0, 1.0 }, + { 1, { 0.01111111111 }, 0.3, 1, 73.0, 1.0 }, + { 1, { 0.01111111111 }, 1.0, 1, 150.0, 1.0 }, + { 1, { 0.01111111111 }, 1.001, 1, 77.0, 1.0 }, + { 2, { 0.011111, 0.324 }, 1.9999, 2, 16.0, 0.96 }, + { 2, { 0.012345, 0.457 }, 0.456789, 1, 26.0, 0.96 }, + { 1, { 0.3511111111 }, 1.33, 1, 14.4, 0.99 } + } + }, + + { SRC_SINC_FASTEST, + 9, + BOOLEAN_TRUE, + { { 1, { 0.01111111111 }, 3.0, 1, 100.0, 1.0 }, + { 1, { 0.01111111111 }, 0.6, 1, 99.0, 1.0 }, + { 1, { 
0.01111111111 }, 0.3, 1, 100.0, 1.0 }, + { 1, { 0.01111111111 }, 1.0, 1, 150.0, 1.0 }, + { 1, { 0.01111111111 }, 1.001, 1, 100.0, 1.0 }, + { 2, { 0.011111, 0.324 }, 1.9999, 2, 97.0, 1.0 }, + { 2, { 0.012345, 0.457 }, 0.456789, 1, 100.0, 0.5 }, + { 2, { 0.011111, 0.45 }, 0.6, 1, 97.0, 0.5 }, + { 1, { 0.3511111111 }, 1.33, 1, 97.0, 1.0 } + } + }, + + { SRC_SINC_MEDIUM_QUALITY, + 9, + BOOLEAN_TRUE, + { { 1, { 0.01111111111 }, 3.0, 1, 130.0, 1.0 }, + { 1, { 0.01111111111 }, 0.6, 1, 132.0, 1.0 }, + { 1, { 0.01111111111 }, 0.3, 1, 135.0, 1.0 }, + { 1, { 0.01111111111 }, 1.0, 1, 155.0, 1.0 }, + { 1, { 0.01111111111 }, 1.001, 1, 133.0, 1.0 }, + { 2, { 0.011111, 0.324 }, 1.9999, 2, 127.0, 1.0 }, + { 2, { 0.012345, 0.457 }, 0.456789, 1, 124.0, 0.5 }, + { 2, { 0.011111, 0.45 }, 0.6, 1, 126.0, 0.5 }, + { 1, { 0.43111111111 }, 1.33, 1, 121.0, 1.0 } + } + }, + + { SRC_SINC_BEST_QUALITY, + 9, + BOOLEAN_TRUE, + { { 1, { 0.01111111111 }, 3.0, 1, 147.0, 1.0 }, + { 1, { 0.01111111111 }, 0.6, 1, 147.0, 1.0 }, + { 1, { 0.01111111111 }, 0.3, 1, 147.0, 1.0 }, + { 1, { 0.01111111111 }, 1.0, 1, 155.0, 1.0 }, + { 1, { 0.01111111111 }, 1.001, 1, 146.0, 1.0 }, + { 2, { 0.011111, 0.324 }, 1.9999, 2, 147.0, 1.0 }, + { 2, { 0.012345, 0.457 }, 0.456789, 1, 148.0, 0.5 }, + { 2, { 0.011111, 0.45 }, 0.6, 1, 145.0, 0.5 }, + { 1, { 0.43111111111 }, 1.33, 1, 145.0, 1.0 } + } + }, + } ; /* snr_test_data */ + + double best_snr, snr, freq3dB ; + int j, k, converter, verbose = 0 ; + + if (argc == 2 && strcmp (argv [1], "--verbose") == 0) + verbose = 1 ; + + puts ("") ; + + for (j = 0 ; j < ARRAY_LEN (snr_test_data) ; j++) + { best_snr = 5000.0 ; + + converter = snr_test_data [j].converter ; + + printf (" Converter %d : %s\n", converter, src_get_name (converter)) ; + printf (" %s\n", src_get_description (converter)) ; + + for (k = 0 ; k < snr_test_data [j].tests ; k++) + { snr = snr_test (&(snr_test_data [j].test_data [k]), k, converter, verbose) ; + if (best_snr > snr) + best_snr = snr ; + } ; + + printf 
(" Worst case Signal-to-Noise Ratio : %.2f dB.\n", best_snr) ; + + if (snr_test_data [j].do_bandwidth_test == BOOLEAN_FALSE) + { puts (" Bandwith test not performed on this converter.\n") ; + continue ; + } + + freq3dB = bandwidth_test (converter, verbose) ; + + printf (" Measured -3dB rolloff point : %5.2f %%.\n\n", freq3dB) ; + } ; + + return 0 ; +} /* main */ + +/*============================================================================== +*/ + +static double +snr_test (SINGLE_TEST *test_data, int number, int converter, int verbose) +{ static float data [BUFFER_LEN + 1] ; + static float output [MAX_SPEC_LEN] ; + + SRC_STATE *src_state ; + SRC_DATA src_data ; + + double output_peak, snr ; + int k, output_len, input_len, error ; + + if (verbose != 0) + { printf ("\tSignal-to-Noise Ratio Test %d.\n" + "\t=====================================\n", number) ; + printf ("\tFrequencies : [ ") ; + for (k = 0 ; k < test_data->freq_count ; k++) + printf ("%6.4f ", test_data->freqs [k]) ; + + printf ("]\n\tSRC Ratio : %8.4f\n", test_data->src_ratio) ; + } + else + { printf ("\tSignal-to-Noise Ratio Test %d : ", number) ; + fflush (stdout) ; + } ; + + /* Set up the output array. */ + if (test_data->src_ratio >= 1.0) + { output_len = MAX_SPEC_LEN ; + input_len = (int) ceil (MAX_SPEC_LEN / test_data->src_ratio) ; + if (input_len > BUFFER_LEN) + input_len = BUFFER_LEN ; + } + else + { input_len = BUFFER_LEN ; + output_len = (int) ceil (BUFFER_LEN * test_data->src_ratio) ; + output_len &= ((-1) << 4) ; + if (output_len > MAX_SPEC_LEN) + output_len = MAX_SPEC_LEN ; + input_len = (int) ceil (output_len / test_data->src_ratio) ; + } ; + + memset (output, 0, sizeof (output)) ; + + /* Generate input data array. */ + gen_windowed_sines (test_data->freq_count, test_data->freqs, 1.0, data, input_len) ; + + /* Perform sample rate conversion. 
*/ + if ((src_state = src_new (converter, 1, &error)) == NULL) + { printf ("\n\nLine %d : src_new() failed : %s.\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + src_data.end_of_input = 1 ; /* Only one buffer worth of input. */ + + src_data.data_in = data ; + src_data.input_frames = input_len ; + + src_data.src_ratio = test_data->src_ratio ; + + src_data.data_out = output ; + src_data.output_frames = output_len ; + + if ((error = src_process (src_state, &src_data))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + src_state = src_delete (src_state) ; + + if (verbose != 0) + printf ("\tOutput Len : %ld\n", src_data.output_frames_gen) ; + + if (abs (src_data.output_frames_gen - output_len) > 4) + { printf ("\n\nLine %d : output data length should be %d.\n\n", __LINE__, output_len) ; + exit (1) ; + } ; + + /* Check output peak. */ + output_peak = find_peak (output, src_data.output_frames_gen) ; + + if (verbose != 0) + printf ("\tOutput Peak : %6.4f\n", output_peak) ; + + if (fabs (output_peak - test_data->peak_value) > 0.01) + { printf ("\n\nLine %d : output peak (%6.4f) should be %6.4f\n\n", __LINE__, output_peak, test_data->peak_value) ; + save_oct_float ("snr_test.dat", data, BUFFER_LEN, output, output_len) ; + exit (1) ; + } ; + + /* Calculate signal-to-noise ratio. */ + snr = calculate_snr (output, src_data.output_frames_gen, test_data->pass_band_peaks) ; + + if (snr < 0.0) + { /* An error occurred. 
*/ + save_oct_float ("snr_test.dat", data, BUFFER_LEN, output, src_data.output_frames_gen) ; + exit (1) ; + } ; + + if (verbose != 0) + printf ("\tSNR Ratio : %.2f dB\n", snr) ; + + if (snr < test_data->snr) + { printf ("\n\nLine %d : SNR (%5.2f) should be > %6.2f dB\n\n", __LINE__, snr, test_data->snr) ; + exit (1) ; + } ; + + if (verbose != 0) + puts ("\t-------------------------------------\n\tPass\n") ; + else + puts ("Pass") ; + + return snr ; +} /* snr_test */ + +static double +find_peak (float *data, int len) +{ double peak = 0.0 ; + int k = 0 ; + + for (k = 0 ; k < len ; k++) + if (fabs (data [k]) > peak) + peak = fabs (data [k]) ; + + return peak ; +} /* find_peak */ + + +static double +find_attenuation (double freq, int converter, int verbose) +{ static float input [BUFFER_LEN] ; + static float output [2 * BUFFER_LEN] ; + + SRC_DATA src_data ; + double output_peak ; + int error ; + + gen_windowed_sines (1, &freq, 1.0, input, BUFFER_LEN) ; + + src_data.end_of_input = 1 ; /* Only one buffer worth of input. 
*/ + + src_data.data_in = input ; + src_data.input_frames = BUFFER_LEN ; + + src_data.src_ratio = 1.999 ; + + src_data.data_out = output ; + src_data.output_frames = ARRAY_LEN (output) ; + + if ((error = src_simple (&src_data, converter, 1))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + output_peak = find_peak (output, ARRAY_LEN (output)) ; + + if (verbose) + printf ("\tFreq : %6f InPeak : %6f OutPeak : %6f Atten : %6.2f dB\n", + freq, 1.0, output_peak, 20.0 * log10 (1.0 / output_peak)) ; + + return 20.0 * log10 (1.0 / output_peak) ; +} /* find_attenuation */ + +static double +bandwidth_test (int converter, int verbose) +{ double f1, f2, a1, a2 ; + double freq, atten ; + + f1 = 0.35 ; + a1 = find_attenuation (f1, converter, verbose) ; + + f2 = 0.495 ; + a2 = find_attenuation (f2, converter, verbose) ; + + if (a1 > 3.0 || a2 < 3.0) + { printf ("\n\nLine %d : cannot bracket 3dB point.\n\n", __LINE__) ; + exit (1) ; + } ; + + while (a2 - a1 > 1.0) + { freq = f1 + 0.5 * (f2 - f1) ; + atten = find_attenuation (freq, converter, verbose) ; + + if (atten < 3.0) + { f1 = freq ; + a1 = atten ; + } + else + { f2 = freq ; + a2 = atten ; + } ; + } ; + + freq = f1 + (3.0 - a1) * (f2 - f1) / (a2 - a1) ; + + return 200.0 * freq ; +} /* bandwidth_test */ + +#else /* (HAVE_FFTW3) == 0 */ + +/* Alternative main function when librfftw is not available. 
*/ + +int +main (void) +{ puts ("\n" + "****************************************************************\n" + " This test cannot be run without FFTW (http://www.fftw.org/).\n" + " Both the real and the complex versions of the library are\n" + " required.") ; + puts ("****************************************************************\n") ; + + return 0 ; +} /* main */ + +#endif + diff --git a/soxr/lsr-tests/termination_test.c b/soxr/lsr-tests/termination_test.c new file mode 100644 index 0000000..6bb0fc0 --- /dev/null +++ b/soxr/lsr-tests/termination_test.c @@ -0,0 +1,339 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
+*/ + +#include +#include +#include + +#include + +#include "util.h" + +#define SHORT_BUFFER_LEN 2048 +#define LONG_BUFFER_LEN ((1 << 16) - 20) + +static void simple_test (int converter) ; +static void stream_test (int converter, double ratio) ; +static void init_term_test (int converter, double ratio) ; + +static int next_block_length (int reset) ; + +int +main (void) +{ static double src_ratios [] = + { 0.999900, 1.000100, 0.789012, 1.200000, 0.333333, 3.100000, + 0.125000, 8.000000, 0.099900, 9.990000, 0.100000, 10.00000 + } ; + + int k ; + + puts ("\n Zero Order Hold interpolator:") ; + + for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++) + init_term_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ; + puts ("") ; + for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++) + stream_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ; + + + puts ("\n Linear interpolator:") ; + for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++) + init_term_test (SRC_LINEAR, src_ratios [k]) ; + puts ("") ; + for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++) + stream_test (SRC_LINEAR, src_ratios [k]) ; + + + puts ("\n Sinc interpolator:") ; + for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++) + init_term_test (SRC_SINC_FASTEST, src_ratios [k]) ; + puts ("") ; + for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++) + stream_test (SRC_SINC_FASTEST, src_ratios [k]) ; + + puts ("") ; + + simple_test (SRC_SINC_FASTEST) ; + + return 0 ; +} /* main */ + +static void +simple_test (int converter) +{ + int ilen = 199030, olen = 1000, error ; + + { + float in [ilen] ; + float out [olen] ; + double ratio = (1.0 * olen) / ilen ; + SRC_DATA src_data = + { in, out, + ilen, olen, + 0, 0, 0, + ratio + } ; + + error = src_simple (&src_data, converter, 1) ; + if (error) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + } ; + + return ; +} /* simple_test */ + +static void +init_term_test (int converter, double src_ratio) +{ static float input [SHORT_BUFFER_LEN], output [SHORT_BUFFER_LEN] ; + + SRC_DATA 
src_data ; + + int k, input_len, output_len, error, terminate ; + + printf ("\tinit_term_test (SRC ratio = %7.4f) .......... ", src_ratio) ; + fflush (stdout) ; + + /* Calculate maximun input and output lengths. */ + if (src_ratio >= 1.0) + { output_len = SHORT_BUFFER_LEN ; + input_len = (int) floor (SHORT_BUFFER_LEN / src_ratio) ; + } + else + { input_len = SHORT_BUFFER_LEN ; + output_len = (int) floor (SHORT_BUFFER_LEN * src_ratio) ; + } ; + + /* Reduce input_len by 10 so output is longer than necessary. */ + input_len -= 10 ; + + for (k = 0 ; k < ARRAY_LEN (input) ; k++) + input [k] = 1.0 ; + + if (output_len > SHORT_BUFFER_LEN) + { printf ("\n\nLine %d : output_len > SHORT_BUFFER_LEN\n\n", __LINE__) ; + exit (1) ; + } ; + + src_data.data_in = input ; + src_data.input_frames = input_len ; + + src_data.src_ratio = src_ratio ; + + src_data.data_out = output ; + src_data.output_frames = SHORT_BUFFER_LEN ; + + if ((error = src_simple (&src_data, converter, 1))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + terminate = (int) ceil ((src_ratio >= 1.0) ? 
1 : 1.0 / src_ratio) ; + + if (fabs (src_ratio * input_len - src_data.output_frames_gen) > terminate) + { printf ("\n\nLine %d : Bad output frame count.\n\n", __LINE__) ; + printf ("\tterminate : %d\n", terminate) ; + printf ("\tsrc_ratio : %.4f\n", src_ratio) ; + printf ("\tinput_len : %d\n" + "\tinput_len * src_ratio : %f\n", input_len, input_len * src_ratio) ; + printf ("\toutput_frames_gen : %ld\n\n", src_data.output_frames_gen) ; + exit (1) ; + } ; + + if (abs (src_data.input_frames_used - input_len) > 1) + { printf ("\n\nLine %d : input_frames_used should be %d, is %ld.\n\n", + __LINE__, input_len, src_data.input_frames_used) ; + printf ("\tsrc_ratio : %.4f\n", src_ratio) ; + printf ("\tinput_len : %d\n\tinput_used : %ld\n\n", input_len, src_data.input_frames_used) ; + exit (1) ; + } ; + + if (fabs (output [0]) < 0.1) + { printf ("\n\nLine %d : First output sample is bad.\n\n", __LINE__) ; + printf ("\toutput [0] == %f\n\n", output [0]) ; + exit (1) ; + } + + puts ("ok") ; + + return ; +} /* init_term_test */ + +static void +stream_test (int converter, double src_ratio) +{ static float input [LONG_BUFFER_LEN], output [LONG_BUFFER_LEN] ; + + SRC_STATE *src_state ; + SRC_DATA src_data ; + + int input_len, output_len, current_in, current_out ; + int k, error, terminate ; + + printf ("\tstream_test (SRC ratio = %7.4f) .......... ", src_ratio) ; + fflush (stdout) ; + +/* Erik */ +for (k = 0 ; k < LONG_BUFFER_LEN ; k++) input [k] = k * 1.0 ; + + /* Calculate maximun input and output lengths. */ + if (src_ratio >= 1.0) + { output_len = LONG_BUFFER_LEN ; + input_len = (int) floor (LONG_BUFFER_LEN / src_ratio) ; + } + else + { input_len = LONG_BUFFER_LEN ; + output_len = (int) floor (LONG_BUFFER_LEN * src_ratio) ; + } ; + + /* Reduce input_len by 10 so output is longer than necessary. 
*/ + input_len -= 20 ; + + if (output_len > LONG_BUFFER_LEN) + { printf ("\n\nLine %d : output_len > LONG_BUFFER_LEN\n\n", __LINE__) ; + exit (1) ; + } ; + + current_in = current_out = 0 ; + + /* Perform sample rate conversion. */ + if ((src_state = src_new (converter, 1, &error)) == NULL) + { printf ("\n\nLine %d : src_new() failed : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + src_data.end_of_input = 0 ; /* Set this later. */ + + src_data.data_in = input ; + + src_data.src_ratio = src_ratio ; + + src_data.data_out = output ; + src_data.output_frames = ARRAY_LEN (output) / 10 ; + + terminate = 1 + (int) ceil ((src_ratio >= 1.0) ? src_ratio : 1.0 / src_ratio) ; + + while (1) + { + src_data.input_frames = next_block_length (0) ; + src_data.input_frames = MIN (src_data.input_frames, input_len - current_in) ; + + src_data.output_frames = ARRAY_LEN (output) - current_out ; + /*-Erik MIN (src_data.output_frames, output_len - current_out) ;-*/ + + src_data.end_of_input = (current_in >= input_len) ? 
1 : 0 ; + + if ((error = src_process (src_state, &src_data))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + printf (" src_data.input_frames : %ld\n", src_data.input_frames) ; + printf (" src_data.output_frames : %ld\n\n", src_data.output_frames) ; + exit (1) ; + } ; + + if (src_data.end_of_input && src_data.output_frames_gen == 0) + break ; + + if (src_data.input_frames_used > src_data.input_frames) + { printf ("\n\nLine %d : input_frames_used > input_frames\n\n", __LINE__) ; + printf (" src_data.input_frames : %ld\n", src_data.input_frames) ; + printf (" src_data.input_frames_used : %ld\n", src_data.input_frames_used) ; + printf (" src_data.output_frames : %ld\n", src_data.output_frames) ; + printf (" src_data.output_frames_gen : %ld\n\n", src_data.output_frames_gen) ; + exit (1) ; + } ; + + if (src_data.input_frames_used < 0) + { printf ("\n\nLine %d : input_frames_used (%ld) < 0\n\n", __LINE__, src_data.input_frames_used) ; + exit (1) ; + } ; + + if (src_data.output_frames_gen < 0) + { printf ("\n\nLine %d : output_frames_gen (%ld) < 0\n\n", __LINE__, src_data.output_frames_gen) ; + exit (1) ; + } ; + + current_in += src_data.input_frames_used ; + current_out += src_data.output_frames_gen ; + + if (current_in > input_len + terminate) + { printf ("\n\nLine %d : current_in (%d) > input_len (%d + %d)\n\n", __LINE__, current_in, input_len, terminate) ; + exit (1) ; + } ; + + if (current_out > output_len) + { printf ("\n\nLine %d : current_out (%d) > output_len (%d)\n\n", __LINE__, current_out, output_len) ; + exit (1) ; + } ; + + if (src_data.input_frames_used > input_len) + { printf ("\n\nLine %d : input_frames_used (%ld) > %d\n\n", __LINE__, src_data.input_frames_used, input_len) ; + exit (1) ; + } ; + + if (src_data.output_frames_gen > output_len) + { printf ("\n\nLine %d : output_frames_gen (%ld) > %d\n\n", __LINE__, src_data.output_frames_gen, output_len) ; + exit (1) ; + } ; + + if (src_data.data_in == NULL && 
src_data.output_frames_gen == 0) + break ; + + + src_data.data_in += src_data.input_frames_used ; + src_data.data_out += src_data.output_frames_gen ; + } ; + + src_state = src_delete (src_state) ; + + if (fabs (current_out - src_ratio * input_len) > terminate) + { printf ("\n\nLine %d : bad output data length %d should be %2.1f +/- %d.\n", __LINE__, + current_out, src_ratio * input_len, terminate) ; + printf ("\tsrc_ratio : %.4f\n", src_ratio) ; + printf ("\tinput_len : %d\n\tinput_used : %d\n", input_len, current_in) ; + printf ("\toutput_len : %d\n\toutput_gen : %d\n\n", output_len, current_out) ; + exit (1) ; + } ; + + if (current_in != input_len) + { printf ("\n\nLine %d : unused input.\n", __LINE__) ; + printf ("\tinput_len : %d\n", input_len) ; + printf ("\tinput_frames_used : %d\n\n", current_in) ; + exit (1) ; + } ; + + puts ("ok") ; + + return ; +} /* stream_test */ + +static int +next_block_length (int reset) +{ static int block_lengths [] = /* Should be an odd length. */ + { /*-2, 500, 5, 400, 10, 300, 20, 200, 50, 100, 70 -*/ + 5, 400, 10, 300, 20, 200, 50, 100, 70 + } ; + static int block_len_index = 0 ; + + if (reset) + block_len_index = 0 ; + else + block_len_index = (block_len_index + 1) % ARRAY_LEN (block_lengths) ; + + return block_lengths [block_len_index] ; +} /* next_block_length */ + diff --git a/soxr/lsr-tests/throughput_test.c b/soxr/lsr-tests/throughput_test.c new file mode 100644 index 0000000..28b6fe5 --- /dev/null +++ b/soxr/lsr-tests/throughput_test.c @@ -0,0 +1,212 @@ +/* +** Copyright (C) 2004-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. 
+** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +#include +#include +#include +#include +#include + +#include + +#include "config.h" + +#include "util.h" +#include "float_cast.h" + +#define BUFFER_LEN (1<<16) + +static float input [BUFFER_LEN] ; +static float output [BUFFER_LEN] ; + +static long +throughput_test (int converter, long best_throughput) +{ SRC_DATA src_data ; + clock_t start_time, clock_time ; + double duration ; + long total_frames = 0, throughput ; + int error ; + + printf (" %-30s ", src_get_name (converter)) ; + fflush (stdout) ; + + src_data.data_in = input ; + src_data.input_frames = ARRAY_LEN (input) ; + + src_data.data_out = output ; + src_data.output_frames = ARRAY_LEN (output) ; + + src_data.src_ratio = 0.99 ; + + sleep (2) ; + + start_time = clock () ; + + do + { + if ((error = src_simple (&src_data, converter, 1)) != 0) + { puts (src_strerror (error)) ; + exit (1) ; + } ; + + total_frames += src_data.output_frames_gen ; + + clock_time = clock () - start_time ; + duration = (1.0 * clock_time) / CLOCKS_PER_SEC ; + } + while (duration < 3.0) ; + + if (src_data.input_frames_used != ARRAY_LEN (input)) + { printf ("\n\nLine %d : input frames used %ld should be %d\n", __LINE__, src_data.input_frames_used, ARRAY_LEN (input)) ; + exit (1) ; + } ; + + if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2) + { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ; + printf (" input len : %d\n", ARRAY_LEN (input)) ; + printf (" output len : %ld (should be %g +/- 2)\n\n", 
src_data.output_frames_gen, + floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ; + exit (1) ; + } ; + + throughput = lrint (floor (total_frames / duration)) ; + + if (best_throughput == 0) + { best_throughput = MAX (throughput, best_throughput) ; + printf ("%5.2f %10ld\n", duration, throughput) ; + } + else + { best_throughput = MAX (throughput, best_throughput) ; + printf ("%5.2f %10ld %10ld\n", duration, throughput, best_throughput) ; + } + + + return best_throughput ; +} /* throughput_test */ + +static void +single_run (void) +{ + + printf ("\n CPU name : %s\n", get_cpu_name ()) ; + + puts ( + "\n" + " Converter Duration Throughput\n" + " -----------------------------------------------------------" + ) ; + + throughput_test (SRC_ZERO_ORDER_HOLD, 0) ; + throughput_test (SRC_LINEAR, 0) ; + throughput_test (SRC_SINC_FASTEST, 0) ; + throughput_test (SRC_SINC_MEDIUM_QUALITY, 0) ; + throughput_test (SRC_SINC_BEST_QUALITY, 0) ; + + puts ("") ; + return ; +} /* single_run */ + +static void +multi_run (int run_count) +{ long zero_order_hold = 0, linear = 0 ; + long sinc_fastest = 0, sinc_medium = 0, sinc_best = 0 ; + int k ; + + puts ( + "\n" + " Converter Duration Throughput Best Throughput\n" + " --------------------------------------------------------------------------------" + ) ; + + for (k = 0 ; k < run_count ; k++) + { zero_order_hold = throughput_test (SRC_ZERO_ORDER_HOLD, zero_order_hold) ; + linear = throughput_test (SRC_LINEAR, linear) ; + sinc_fastest = throughput_test (SRC_SINC_FASTEST, sinc_fastest) ; + sinc_medium = throughput_test (SRC_SINC_MEDIUM_QUALITY, sinc_medium) ; + sinc_best = throughput_test (SRC_SINC_BEST_QUALITY, sinc_best) ; + + puts ("") ; + + /* Let the CPU cool down. We might be running on a laptop. 
*/ + sleep (10) ; + } ; + + printf ("\n CPU name : %s\n", get_cpu_name ()) ; + + puts ( + "\n" + " Converter Best Throughput\n" + " ------------------------------------------------" + ) ; + printf (" %-30s %10ld\n", src_get_name (SRC_ZERO_ORDER_HOLD), zero_order_hold) ; + printf (" %-30s %10ld\n", src_get_name (SRC_LINEAR), linear) ; + printf (" %-30s %10ld\n", src_get_name (SRC_SINC_FASTEST), sinc_fastest) ; + printf (" %-30s %10ld\n", src_get_name (SRC_SINC_MEDIUM_QUALITY), sinc_medium) ; + printf (" %-30s %10ld\n", src_get_name (SRC_SINC_BEST_QUALITY), sinc_best) ; + + puts ("") ; +} /* multi_run */ + +static void +usage_exit (const char * argv0) +{ const char * cptr ; + + if ((cptr = strrchr (argv0, '/')) != NULL) + argv0 = cptr ; + + printf ( + "Usage :\n" + " %s - Single run of the throughput test.\n" + " %s --best-of N - Do N runs of test a print bext result.\n" + "\n", + argv0, argv0) ; + + exit (0) ; +} /* usage_exit */ + +int +main (int argc, char ** argv) +{ double freq ; + + memset (input, 0, sizeof (input)) ; + freq = 0.01 ; + gen_windowed_sines (1, &freq, 1.0, input, BUFFER_LEN) ; + + if (argc == 1) + single_run () ; + else if (argc == 3 && strcmp (argv [1], "--best-of") == 0) + { int run_count = atoi (argv [2]) ; + + if (run_count < 1 || run_count > 20) + { printf ("Please be sensible. 
Run count should be in range (1, 10].\n") ; + exit (1) ; + } ; + + multi_run (run_count) ; + } + else + usage_exit (argv [0]) ; + + puts ( + " Duration is in seconds.\n" + " Throughput is in samples/sec (more is better).\n" + ) ; + + return 0 ; +} /* main */ + diff --git a/soxr/lsr-tests/util.c b/soxr/lsr-tests/util.c new file mode 100644 index 0000000..fefcaf2 --- /dev/null +++ b/soxr/lsr-tests/util.c @@ -0,0 +1,230 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +#include +#include +#include +#include +#include + +#include "util.h" + +#ifndef M_PI +#define M_PI 3.14159265358979323846264338 +#endif + +void +gen_windowed_sines (int freq_count, const double *freqs, double max, float *output, int output_len) +{ int k, freq ; + double amplitude, phase ; + + amplitude = max / freq_count ; + + for (k = 0 ; k < output_len ; k++) + output [k] = 0.0 ; + + for (freq = 0 ; freq < freq_count ; freq++) + { phase = 0.9 * M_PI / freq_count ; + + if (freqs [freq] <= 0.0 || freqs [freq] >= 0.5) + { printf ("\n%s : Error : freq [%d] == %g is out of range. 
Should be < 0.5.\n", __FILE__, freq, freqs [freq]) ; + exit (1) ; + } ; + + for (k = 0 ; k < output_len ; k++) + output [k] += amplitude * sin (freqs [freq] * (2 * k) * M_PI + phase) ; + } ; + + /* Apply Hanning Window. */ + for (k = 0 ; k < output_len ; k++) + output [k] *= 0.5 - 0.5 * cos ((2 * k) * M_PI / (output_len - 1)) ; + + /* data [k] *= 0.3635819 - 0.4891775 * cos ((2 * k) * M_PI / (output_len - 1)) + + 0.1365995 * cos ((4 * k) * M_PI / (output_len - 1)) + - 0.0106411 * cos ((6 * k) * M_PI / (output_len - 1)) ; + */ + + return ; +} /* gen_windowed_sines */ + +void +save_oct_float (char *filename, float *input, int in_len, float *output, int out_len) +{ FILE *file ; + int k ; + + printf ("Dumping input and output data to file : %s.\n\n", filename) ; + + if (! (file = fopen (filename, "w"))) + return ; + + fprintf (file, "# Not created by Octave\n") ; + + fprintf (file, "# name: input\n") ; + fprintf (file, "# type: matrix\n") ; + fprintf (file, "# rows: %d\n", in_len) ; + fprintf (file, "# columns: 1\n") ; + + for (k = 0 ; k < in_len ; k++) + fprintf (file, "% g\n", input [k]) ; + + fprintf (file, "# name: output\n") ; + fprintf (file, "# type: matrix\n") ; + fprintf (file, "# rows: %d\n", out_len) ; + fprintf (file, "# columns: 1\n") ; + + for (k = 0 ; k < out_len ; k++) + fprintf (file, "% g\n", output [k]) ; + + fclose (file) ; + return ; +} /* save_oct_float */ + +void +save_oct_double (char *filename, double *input, int in_len, double *output, int out_len) +{ FILE *file ; + int k ; + + printf ("Dumping input and output data to file : %s.\n\n", filename) ; + + if (! 
(file = fopen (filename, "w"))) + return ; + + fprintf (file, "# Not created by Octave\n") ; + + fprintf (file, "# name: input\n") ; + fprintf (file, "# type: matrix\n") ; + fprintf (file, "# rows: %d\n", in_len) ; + fprintf (file, "# columns: 1\n") ; + + for (k = 0 ; k < in_len ; k++) + fprintf (file, "% g\n", input [k]) ; + + fprintf (file, "# name: output\n") ; + fprintf (file, "# type: matrix\n") ; + fprintf (file, "# rows: %d\n", out_len) ; + fprintf (file, "# columns: 1\n") ; + + for (k = 0 ; k < out_len ; k++) + fprintf (file, "% g\n", output [k]) ; + + fclose (file) ; + return ; +} /* save_oct_double */ + +void +interleave_data (const float *in, float *out, int frames, int channels) +{ int fr, ch ; + + for (fr = 0 ; fr < frames ; fr++) + for (ch = 0 ; ch < channels ; ch++) + out [ch + channels * fr] = in [fr + frames * ch] ; + + return ; +} /* interleave_data */ + +void +deinterleave_data (const float *in, float *out, int frames, int channels) +{ int fr, ch ; + + for (ch = 0 ; ch < channels ; ch++) + for (fr = 0 ; fr < frames ; fr++) + out [fr + frames * ch] = in [ch + channels * fr] ; + + return ; +} /* deinterleave_data */ + +void +reverse_data (float *data, int datalen) +{ int left, right ; + float temp ; + + left = 0 ; + right = datalen - 1 ; + + while (left < right) + { temp = data [left] ; + data [left] = data [right] ; + data [right] = temp ; + left ++ ; + right -- ; + } ; + +} /* reverse_data */ + +const char * +get_cpu_name (void) +{ + const char *name = "Unknown", *search = NULL ; + static char buffer [512] ; + FILE * file = NULL ; + int is_pipe = 0 ; + +#if defined (__linux__) + file = fopen ("/proc/cpuinfo", "r") ; + search = "model name" ; +#elif defined (__APPLE__) + file = popen ("/usr/sbin/system_profiler -detailLevel full SPHardwareDataType", "r") ; + search = "Processor Name" ; + is_pipe = 1 ; +#elif defined (__FreeBSD__) + file = popen ("sysctl -a", "r") ; + search = "hw.model" ; + is_pipe = 1 ; +#else + file = NULL ; +#endif + + if (file 
== NULL) + return name ; + + if (search == NULL) + { printf ("Error : search is NULL in function %s.\n", __func__) ; + return name ; + } ; + + while (fgets (buffer, sizeof (buffer), file) != NULL) + if (strstr (buffer, search)) + { char *src, *dest ; + + if ((src = strchr (buffer, ':')) != NULL) + { src ++ ; + while (isspace (src [0])) + src ++ ; + name = src ; + + /* Remove consecutive spaces. */ + src ++ ; + for (dest = src ; src [0] ; src ++) + { if (isspace (src [0]) && isspace (dest [-1])) + continue ; + dest [0] = src [0] ; + dest ++ ; + } ; + dest [0] = 0 ; + break ; + } ; + } ; + + if (is_pipe) + pclose (file) ; + else + fclose (file) ; + + return name ; +} /* get_cpu_name */ + diff --git a/soxr/lsr-tests/util.h b/soxr/lsr-tests/util.h new file mode 100644 index 0000000..80b1b49 --- /dev/null +++ b/soxr/lsr-tests/util.h @@ -0,0 +1,50 @@ +/* +** Copyright (C) 2002-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. +*/ + +#define ABS(a) (((a) < 0) ? - (a) : (a)) +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) +#define MAX(a,b) (((a) >= (b)) ? 
(a) : (b)) + +#define ARRAY_LEN(x) ((int) (sizeof (x) / sizeof ((x) [0]))) + +void gen_windowed_sines (int freq_count, const double *freqs, double max, float *output, int output_len) ; + +void save_oct_float (char *filename, float *input, int in_len, float *output, int out_len) ; +void save_oct_double (char *filename, double *input, int in_len, double *output, int out_len) ; + +void interleave_data (const float *in, float *out, int frames, int channels) ; + +void deinterleave_data (const float *in, float *out, int frames, int channels) ; + +void reverse_data (float *data, int datalen) ; + +double calculate_snr (float *data, int len, int expected_peaks) ; + +const char * get_cpu_name (void) ; + +#if OS_IS_WIN32 +/* +** Extra Win32 hacks. +** +** Despite Microsoft claim of windows being POSIX compatibile it has '_sleep' +** instead of 'sleep'. +*/ + +#define sleep _sleep +#endif + diff --git a/soxr/lsr-tests/varispeed_test.c b/soxr/lsr-tests/varispeed_test.c new file mode 100644 index 0000000..52b2f43 --- /dev/null +++ b/soxr/lsr-tests/varispeed_test.c @@ -0,0 +1,152 @@ +/* +** Copyright (C) 2006-2011 Erik de Castro Lopo +** +** This program is free software; you can redistribute it and/or modify +** it under the terms of the GNU General Public License as published by +** the Free Software Foundation; either version 2 of the License, or +** (at your option) any later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software +** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA. 
+*/ + +#include +#include +#include +#include + +#include + +#include "util.h" + +#define BUFFER_LEN (1 << 16) + +static void varispeed_test (int converter, double target_snr) ; + +int +main (void) +{ + puts ("") ; + printf (" Zero Order Hold interpolator : ") ; + varispeed_test (SRC_ZERO_ORDER_HOLD, 10.0) ; + + printf (" Linear interpolator : ") ; + varispeed_test (SRC_LINEAR, 10.0) ; + + printf (" Sinc interpolator : ") ; + varispeed_test (SRC_SINC_FASTEST, 115.0) ; + + puts ("") ; + + return 0 ; +} /* main */ + +static void +varispeed_test (int converter, double target_snr) +{ static float input [BUFFER_LEN], output [BUFFER_LEN] ; + double sine_freq, snr ; + + SRC_STATE *src_state ; + SRC_DATA src_data ; + + int input_len, error ; + + memset (input, 0, sizeof (input)) ; + + input_len = ARRAY_LEN (input) / 2 ; + + sine_freq = 0.0111 ; + gen_windowed_sines (1, &sine_freq, 1.0, input, input_len) ; + + /* Perform sample rate conversion. */ + if ((src_state = src_new (converter, 1, &error)) == NULL) + { printf ("\n\nLine %d : src_new() failed : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + src_data.end_of_input = 1 ; + + src_data.data_in = input ; + src_data.input_frames = input_len ; + + src_data.src_ratio = 3.0 ; + + src_data.data_out = output ; + src_data.output_frames = ARRAY_LEN (output) ; + + if ((error = src_set_ratio (src_state, 1.0 / src_data.src_ratio))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + if ((error = src_process (src_state, &src_data))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + printf (" src_data.input_frames : %ld\n", src_data.input_frames) ; + printf (" src_data.output_frames : %ld\n\n", src_data.output_frames) ; + exit (1) ; + } ; + + if (src_data.input_frames_used != input_len) + { printf ("\n\nLine %d : unused input.\n", __LINE__) ; + printf ("\tinput_len : %d\n", input_len) ; + printf ("\tinput_frames_used : %ld\n\n", 
src_data.input_frames_used) ; + exit (1) ; + } ; + + /* Copy the last output to the input. */ + memcpy (input, output, sizeof (input)) ; + reverse_data (input, src_data.output_frames_gen) ; + + if ((error = src_reset (src_state))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + src_data.end_of_input = 1 ; + + src_data.data_in = input ; + input_len = src_data.input_frames = src_data.output_frames_gen ; + + src_data.data_out = output ; + src_data.output_frames = ARRAY_LEN (output) ; + + if ((error = src_set_ratio (src_state, 1.0 / src_data.src_ratio))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + exit (1) ; + } ; + + if ((error = src_process (src_state, &src_data))) + { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ; + printf (" src_data.input_frames : %ld\n", src_data.input_frames) ; + printf (" src_data.output_frames : %ld\n\n", src_data.output_frames) ; + exit (1) ; + } ; + + if (src_data.input_frames_used != input_len) + { printf ("\n\nLine %d : unused input.\n", __LINE__) ; + printf ("\tinput_len : %d\n", input_len) ; + printf ("\tinput_frames_used : %ld\n\n", src_data.input_frames_used) ; + exit (1) ; + } ; + + src_state = src_delete (src_state) ; + + snr = calculate_snr (output, src_data.output_frames_gen, 1) ; + + if (target_snr > snr) + { printf ("\n\nLine %d : snr (%3.1f) does not meet target (%3.1f)\n\n", __LINE__, snr, target_snr) ; + save_oct_float ("varispeed.mat", input, src_data.input_frames, output, src_data.output_frames_gen) ; + exit (1) ; + } ; + + puts ("ok") ; + + return ; +} /* varispeed_test */ + diff --git a/soxr/msvc/README b/soxr/msvc/README new file mode 100644 index 0000000..5b7f60a --- /dev/null +++ b/soxr/msvc/README @@ -0,0 +1,22 @@ +SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + +Cmake is the recommended way to configure, build (as either a DLL or a static +library), and install libsoxr for general use on MS-Windows, as 
on other OSs. + +However, building within MS Visual Studio is also possible, as exemplified by +the accompanying files: + + * soxr-config.h Pre-configured for a modern Win32 system. + + * libsoxr.vcproj Builds the library as a DLL, per above. + + * libsoxr.sln, Build an example exe using the above. + example1.vcproj + +The following notes apply to adaptation of these files: + + * For a system without AVX support, set WITH_CR64S to 0 in + soxr-config.h and exclude the three files ...64s.c from the build. + + * If changing libsoxr.vcproj to build a static library, then also + remove the preprocessor definition: SOXR_DLL. diff --git a/soxr/msvc/example1.vcproj b/soxr/msvc/example1.vcproj new file mode 100644 index 0000000..170a522 --- /dev/null +++ b/soxr/msvc/example1.vcproj @@ -0,0 +1,82 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/soxr/msvc/libsoxr.sln b/soxr/msvc/libsoxr.sln new file mode 100644 index 0000000..c1a840b --- /dev/null +++ b/soxr/msvc/libsoxr.sln @@ -0,0 +1,29 @@ + +Microsoft Visual Studio Solution File, Format Version 10.00 +# Visual C++ Express 2008 +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example1", "example1.vcproj", "{CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}" + ProjectSection(ProjectDependencies) = postProject + {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB} = {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB} + EndProjectSection +EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libsoxr", "libsoxr.vcproj", "{4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Release|Win32 = Release|Win32 + Debug|Win32 = Debug|Win32 + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Release|Win32.ActiveCfg = Release|Win32 + {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Release|Win32.Build.0 = Release|Win32 + {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Debug|Win32.ActiveCfg = Debug|Win32 + 
{CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Debug|Win32.Build.0 = Debug|Win32 + {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Release|Win32.ActiveCfg = Release|Win32 + {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Release|Win32.Build.0 = Release|Win32 + {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Debug|Win32.ActiveCfg = Debug|Win32 + {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Debug|Win32.Build.0 = Debug|Win32 + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/soxr/msvc/libsoxr.vcproj b/soxr/msvc/libsoxr.vcproj new file mode 100644 index 0000000..499f895 --- /dev/null +++ b/soxr/msvc/libsoxr.vcproj @@ -0,0 +1,97 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/soxr/msvc/soxr-config.h b/soxr/msvc/soxr-config.h new file mode 100644 index 0000000..74415e2 --- /dev/null +++ b/soxr/msvc/soxr-config.h @@ -0,0 +1,30 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +/* N.B. Pre-configured for modern MS-Windows systems. However, the normal + * procedure is to use the cmake configuration and build system. See INSTALL. 
*/ + +#if !defined soxr_config_included +#define soxr_config_included + +#define AVCODEC_FOUND 0 +#define AVUTIL_FOUND 0 +#define WITH_PFFFT 1 + +#define HAVE_FENV_H 1 +#define HAVE_STDBOOL_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_LRINT 1 +#define HAVE_BIGENDIAN 0 + +#define WITH_CR32 1 +#define WITH_CR32S 1 +#define WITH_CR64 1 +#define WITH_CR64S 1 +#define WITH_VR32 1 + +#define WITH_HI_PREC_CLOCK 1 +#define WITH_FLOAT_STD_PREC_CLOCK 0 +#define WITH_DEV_TRACE 1 + +#endif diff --git a/soxr/multi-arch b/soxr/multi-arch new file mode 100644 index 0000000..288b578 --- /dev/null +++ b/soxr/multi-arch @@ -0,0 +1,31 @@ +#!/usr/bin/env bash +set -e + +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. + +rm -f CMakeCache.txt # Prevent interference from any in-tree build + +j=-j4 +build=Release + +for n in \ + cc: \ + clang: \ + arm-linux-gnueabi-gcc:Linux \ + x86_64-w64-mingw32-gcc:Windows \ + i686-w64-mingw32-gcc:Windows \ + ; do + compiler=$(echo $n | sed 's/:.*//') + system=$(echo $n | sed 's/.*://') + dir=$build-$compiler + which $compiler > /dev/null || echo $compiler not found && ( + echo "***" $dir + mkdir -p $dir + cd $dir + cmake -DCMAKE_BUILD_TYPE=$build -DCMAKE_C_COMPILER=$compiler -DCMAKE_SYSTEM_NAME="$system" -DBUILD_SHARED_LIBS=OFF -DWITH_OPENMP=OFF .. + make $j && [ /$system = / ] && ctest -j || true + cd tests + ../../tests/throughput-test && SOXR_THROUGHPUT_GAIN=.6 ../../tests/throughput-test 2 3 || true + ) +done diff --git a/soxr/soxr-config.h.in b/soxr/soxr-config.h.in index 227bcfd..00b3b45 100644 --- a/soxr/soxr-config.h.in +++ b/soxr/soxr-config.h.in @@ -1,46 +1,27 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ #if !defined soxr_config_included #define soxr_config_included -#define HAVE_SINGLE_PRECISION @HAVE_SINGLE_PRECISION@ -#define HAVE_DOUBLE_PRECISION @HAVE_DOUBLE_PRECISION@ -#define HAVE_AVFFT @HAVE_AVFFT@ -#define HAVE_SIMD @HAVE_SIMD@ -#define HAVE_FENV_H @HAVE_FENV_H@ -#define HAVE_LRINT @HAVE_LRINT@ -#define WORDS_BIGENDIAN @WORDS_BIGENDIAN@ +#cmakedefine01 AVCODEC_FOUND +#cmakedefine01 AVUTIL_FOUND +#cmakedefine01 WITH_PFFFT -#include +#cmakedefine01 HAVE_FENV_H +#cmakedefine01 HAVE_STDBOOL_H +#cmakedefine01 HAVE_STDINT_H +#cmakedefine01 HAVE_LRINT +#cmakedefine01 HAVE_BIGENDIAN -#undef bool -#undef false -#undef true -#define bool int -#define false 0 -#define true 1 +#cmakedefine01 WITH_CR32 +#cmakedefine01 WITH_CR32S +#cmakedefine01 WITH_CR64 +#cmakedefine01 WITH_CR64S +#cmakedefine01 WITH_VR32 -#undef int16_t -#undef int32_t -#undef int64_t -#undef uint32_t -#undef uint64_t -#define int16_t short -#if LONG_MAX > 2147483647L - #define int32_t int - #define int64_t long -#elif LONG_MAX < 2147483647L -#error this library requires that 'long int' has at least 32-bits -#else - #define int32_t long - #if defined _MSC_VER - #define int64_t __int64 - #else - #define int64_t long long - #endif -#endif -#define uint32_t unsigned int32_t -#define uint64_t unsigned int64_t +#cmakedefine01 WITH_HI_PREC_CLOCK +#cmakedefine01 WITH_FLOAT_STD_PREC_CLOCK +#cmakedefine01 WITH_DEV_TRACE #endif diff --git a/soxr/src/CMakeLists.txt b/soxr/src/CMakeLists.txt index cd41aa7..bb01a0d 100644 --- a/soxr/src/CMakeLists.txt +++ b/soxr/src/CMakeLists.txt @@ -1,4 +1,4 @@ -# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net # Licence for this file: LGPL v2.1 See LICENCE for details. 
@@ -7,90 +7,89 @@ if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/vr-coefs.h) include_directories(${CMAKE_CURRENT_BINARY_DIR}) - set_property(SOURCE vr32.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h) + set_property(SOURCE vr32.c + APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h) add_executable (vr-coefs vr-coefs.c) + target_link_libraries (vr-coefs ${LIBM_LIBRARIES}) ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h COMMAND vr-coefs > ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h DEPENDS vr-coefs) endif () -# Minimalist boo configuration: -add_definitions (${PROJECT_C_FLAGS} -DSOXR_LIB -DSOXR_SILENT=1) -include (CheckFunctionExists) -include (CheckIncludeFiles) -set(WITH_LSR_BINDINGS OFF) -set(WITH_SINGLE_PRECISION ON) -set(WITH_DOUBLE_PRECISION OFF) -set(WITH_SIMD ON) -set(HAVE_SINGLE_PRECISION "1") -set(HAVE_DOUBLE_PRECISION "0") -set(HAVE_AVFFT "0") -set(HAVE_SIMD "1") -check_function_exists (lrint HAVE_LRINT) -if(NOT HAVE_LRINT) - set(HAVE_LRINT "0") -endif() -check_include_files (fenv.h HAVE_FENV_H) -if(NOT HAVE_FENV_H) - set(HAVE_FENV_H "0") -endif() -set(WORDS_BIGENDIAN "0") +add_definitions (${PROJECT_C_FLAGS} -DSOXR_LIB) + -configure_file ( - ${CMAKE_CURRENT_SOURCE_DIR}/../soxr-config.h.in - ${CMAKE_CURRENT_BINARY_DIR}/soxr-config.h) -include_directories (${CMAKE_CURRENT_BINARY_DIR}) # Libsoxr configuration: -set (RDFT32 fft4g32.c) -if (WITH_AVFFT AND AVCODEC_FOUND) - set (RDFT32 avfft32.c) - set (RDFT32S avfft32s.c) +set (RDFT32 fft4g32) +if (AVCODEC_FOUND) + set (RDFT32 avfft32) + set (RDFT32S avfft32s) elseif (WITH_PFFFT) - #set (RDFT32 pffft32.c) - set (RDFT32S pffft32s.c) -elseif (WITH_SIMD) - set (RDFT32S fft4g32s.c) + #set (RDFT32 pffft32) + set (RDFT32S pffft32s) +elseif (WITH_CR32S) + set (RDFT32S fft4g32s) + if (NOT WITH_CR32) + list (APPEND RDFT32S fft4g32) + endif () endif () -if (WITH_DOUBLE_PRECISION) - set (DP_SOURCES rate64.c) +set (SOURCES ${PROJECT_NAME}.c data-io) + +if (WITH_CR32 OR 
WITH_CR32S OR WITH_CR64 OR WITH_CR64S) + list (APPEND SOURCES dbesi0 filter fft4g64 cr) endif () -if (WITH_SINGLE_PRECISION) - set (SP_SOURCES rate32.c ${RDFT32}) +if (WITH_CR32) + list (APPEND SOURCES cr32 ${RDFT32}) endif () -if (HAVE_SIMD) - set (SIMD_SOURCES rate32s.c vr32s.c ${RDFT32S} simd.c) - foreach (source ${SIMD_SOURCES}) - set_property (SOURCE ${source} PROPERTY COMPILE_FLAGS ${SIMD_C_FLAGS}) +if (WITH_CR64) + list (APPEND SOURCES cr64) +endif () + +if (WITH_VR32) + list (APPEND SOURCES vr32) +endif () + +if (WITH_CR32S) + foreach (source cr32s ${RDFT32S} util32s) + list (APPEND SOURCES ${source}) + set_property (SOURCE ${source} + APPEND_STRING PROPERTY COMPILE_FLAGS ${SIMD32_C_FLAGS}) + endforeach () +endif () + +if (WITH_CR64S) + foreach (source cr64s pffft64s util64s) + list (APPEND SOURCES ${source}) + set_property (SOURCE ${source} + APPEND_STRING PROPERTY COMPILE_FLAGS ${SIMD64_C_FLAGS}) endforeach () -else () - set (SIMD_SOURCES vr32.c) endif () # Libsoxr: -add_library (soxr ${LIB_TYPE} soxr.c data-io.c dbesi0.c filter.c fft4g64.c - ${SP_SOURCES} ${DP_SOURCES} ${SIMD_SOURCES}) -set_target_properties (soxr PROPERTIES +add_library (${PROJECT_NAME} ${LIB_TYPE} ${SOURCES}) +target_link_libraries (${PROJECT_NAME} PRIVATE ${LIBS} ${LIBM_LIBRARIES}) +set_target_properties (${PROJECT_NAME} PROPERTIES VERSION "${SO_VERSION}" SOVERSION ${SO_VERSION_MAJOR} INSTALL_NAME_DIR ${LIB_INSTALL_DIR} LINK_INTERFACE_LIBRARIES "" - PUBLIC_HEADER "soxr.h") + PUBLIC_HEADER "${PROJECT_NAME}.h") if (BUILD_FRAMEWORK) - set_target_properties (soxr PROPERTIES FRAMEWORK TRUE) + set_target_properties (${PROJECT_NAME} PROPERTIES FRAMEWORK TRUE) elseif (NOT WIN32) -# set (TARGET_PCS ${CMAKE_CURRENT_BINARY_DIR}/soxr.pc) -# configure_file (${CMAKE_CURRENT_SOURCE_DIR}/soxr.pc.in ${TARGET_PCS}) -# install (FILES ${CMAKE_CURRENT_BINARY_DIR}/soxr.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig) + set (TARGET_PCS ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc) + configure_file 
(${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}.pc.in ${TARGET_PCS}) + install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig) endif () @@ -98,11 +97,11 @@ endif () # LSR bindings: if (WITH_LSR_BINDINGS) - set (LSR soxr-lsr) + set (LSR ${PROJECT_NAME}-lsr) set (LSR_SO_VERSION 0.1.9) set (LSR_SO_VERSION_MAJOR 0) - add_library (${LSR} ${LIB_TYPE} lsr) - target_link_libraries (${LSR} soxr) + add_library (${LSR} ${LIB_TYPE} ${LSR}) + target_link_libraries (${LSR} ${PROJECT_NAME}) set_target_properties (${LSR} PROPERTIES VERSION "${LSR_SO_VERSION}" SOVERSION ${LSR_SO_VERSION_MAJOR} @@ -112,9 +111,9 @@ if (WITH_LSR_BINDINGS) if (BUILD_FRAMEWORK) set_target_properties (${LSR} PROPERTIES FRAMEWORK TRUE) elseif (NOT WIN32) -# set (TARGET_PCS "${TARGET_PCS} ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc") -# configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc) -# install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig) + set (TARGET_PCS "${TARGET_PCS} ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc") + configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc) + install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig) endif () endif () @@ -122,29 +121,9 @@ endif () # Installation (from build from source): -#install (TARGETS soxr ${LSR} -# FRAMEWORK DESTINATION ${FRAMEWORK_INSTALL_DIR} -# LIBRARY DESTINATION ${LIB_INSTALL_DIR} -# RUNTIME DESTINATION ${BIN_INSTALL_DIR} -# ARCHIVE DESTINATION ${LIB_INSTALL_DIR} -# PUBLIC_HEADER DESTINATION ${INCLUDE_INSTALL_DIR}) - - - -# Packaging (for unix-like distributions): - -#get_property (LIB1 TARGET soxr PROPERTY LOCATION) -#if (BUILD_SHARED_LIBS) -# set (LIB1 ${LIB1}.${SO_VERSION_MAJOR} ${LIB1}.${SO_VERSION}) -#endif () -#list (APPEND TARGET_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/soxr.h") -#if (WITH_LSR_BINDINGS) -# get_property (LIB2 TARGET ${LSR} PROPERTY 
LOCATION) -# if (BUILD_SHARED_LIBS) -# set (LIB2 ${LIB2}.${LSR_SO_VERSION_MAJOR} ${LIB2}.${LSR_SO_VERSION}) -# endif () -# list (APPEND TARGET_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.h") -#endif () -#set (TARGET_LIBS ${LIB1} ${LIB2}) -#configure_file (${CMAKE_CURRENT_SOURCE_DIR}/libsoxr.src.in ${CMAKE_CURRENT_BINARY_DIR}/libsoxr.src) -#configure_file (${CMAKE_CURRENT_SOURCE_DIR}/libsoxr-dev.src.in ${CMAKE_CURRENT_BINARY_DIR}/libsoxr-dev.src) +install (TARGETS ${PROJECT_NAME} ${LSR} + FRAMEWORK DESTINATION ${FRAMEWORK_INSTALL_DIR} + LIBRARY DESTINATION ${LIB_INSTALL_DIR} + RUNTIME DESTINATION ${BIN_INSTALL_DIR} + ARCHIVE DESTINATION ${LIB_INSTALL_DIR} + PUBLIC_HEADER DESTINATION ${INCLUDE_INSTALL_DIR}) diff --git a/soxr/src/aliases.h b/soxr/src/aliases.h index eb42bdc..d1a392f 100644 --- a/soxr/src/aliases.h +++ b/soxr/src/aliases.h @@ -1,4 +1,4 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ #if defined SOXR_LIB @@ -18,8 +18,10 @@ #define lsx_dfst_f _soxr_dfst_f #define lsx_dfst _soxr_dfst #define lsx_fir_to_phase _soxr_fir_to_phase +#define lsx_f_resp _soxr_f_resp #define lsx_init_fft_cache_f _soxr_init_fft_cache_f #define lsx_init_fft_cache _soxr_init_fft_cache +#define lsx_inv_f_resp _soxr_inv_f_resp #define lsx_kaiser_beta _soxr_kaiser_beta #define lsx_kaiser_params _soxr_kaiser_params #define lsx_make_lpf _soxr_make_lpf diff --git a/soxr/src/avfft32.c b/soxr/src/avfft32.c index 5be13d2..c3096aa 100644 --- a/soxr/src/avfft32.c +++ b/soxr/src/avfft32.c @@ -1,27 +1,33 @@ /* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ +#include #include #include #include "filter.h" +#include "rdft_t.h" static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);} static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);} static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;} static int multiplier(void) {return 2;} static void nothing(void) {} +static int flags(void) {return 0;} -typedef void (* fn_t)(void); -fn_t _soxr_rdft32_cb[] = { - (fn_t)forward_setup, - (fn_t)backward_setup, - (fn_t)av_rdft_end, - (fn_t)rdft, - (fn_t)rdft, - (fn_t)rdft, - (fn_t)rdft, - (fn_t)_soxr_ordered_convolve_f, - (fn_t)_soxr_ordered_partial_convolve_f, - (fn_t)multiplier, - (fn_t)nothing, +rdft_cb_table _soxr_rdft32_cb = { + forward_setup, + backward_setup, + av_rdft_end, + rdft, + rdft, + rdft, + rdft, + _soxr_ordered_convolve_f, + _soxr_ordered_partial_convolve_f, + multiplier, + nothing, + malloc, + calloc, + free, + flags, }; diff --git a/soxr/src/avfft32s.c b/soxr/src/avfft32s.c index 75e485e..2944144 100644 --- a/soxr/src/avfft32s.c +++ b/soxr/src/avfft32s.c @@ -3,25 +3,30 @@ #include #include -#include "simd.h" +#include "util32s.h" +#include "rdft_t.h" static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);} static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);} -static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;} +static void rdft(int length, void * setup, void * H, void * scratch) {av_rdft_calc(setup, H); (void)length; (void)scratch;} static int multiplier(void) {return 2;} -static void nothing(void) {} +static void nothing2(int u1, void *u2, void *u3, void *u4) {(void)u1; (void)u2; (void)u3; (void)u4;} +static int flags(void) {return RDFT_IS_SIMD;} -typedef void (* fn_t)(void); -fn_t _soxr_rdft32s_cb[] = { - (fn_t)forward_setup, - (fn_t)backward_setup, - (fn_t)av_rdft_end, - 
(fn_t)rdft, - (fn_t)rdft, - (fn_t)rdft, - (fn_t)rdft, - (fn_t)_soxr_ordered_convolve_simd, - (fn_t)_soxr_ordered_partial_convolve_simd, - (fn_t)multiplier, - (fn_t)nothing, +rdft_cb_table _soxr_rdft32s_cb = { + forward_setup, + backward_setup, + av_rdft_end, + rdft, + rdft, + rdft, + rdft, + ORDERED_CONVOLVE_SIMD, + ORDERED_PARTIAL_CONVOLVE_SIMD, + multiplier, + nothing2, + SIMD_ALIGNED_MALLOC, + SIMD_ALIGNED_CALLOC, + SIMD_ALIGNED_FREE, + flags, }; diff --git a/soxr/src/cb_t.h b/soxr/src/cb_t.h new file mode 100644 index 0000000..d78ebd7 --- /dev/null +++ b/soxr/src/cb_t.h @@ -0,0 +1,26 @@ +/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +* Licence for this file: LGPL v2.1 See LICENCE for details. */ + +typedef struct { + void * (*input)(void *, void * samples, size_t n); + void (*process)(void *, size_t); + void const * (*output)(void *, void * samples, size_t * n); + void (*flush)(void *); + void (*close)(void *); + double (*delay)(void *); + void (*sizes)(size_t * shared, size_t * channel); + char const * (*create)(void * channel, void * shared, double io_ratio, void * q_spec, void * r_spec, double scale); + void (*set_io_ratio)(void *, double io_ratio, size_t len); + char const * (*id)(void); +} control_block_t; + +#define resampler_input p->control_block.input +#define resampler_process p->control_block.process +#define resampler_output p->control_block.output +#define resampler_flush p->control_block.flush +#define resampler_close p->control_block.close +#define resampler_delay p->control_block.delay +#define resampler_sizes p->control_block.sizes +#define resampler_create p->control_block.create +#define resampler_set_io_ratio p->control_block.set_io_ratio +#define resampler_id p->control_block.id diff --git a/soxr/src/ccrw2.h b/soxr/src/ccrw2.h index b42185b..09331a4 100644 --- a/soxr/src/ccrw2.h +++ b/soxr/src/ccrw2.h @@ -3,8 +3,8 @@ /* Concurrent Control with "Readers" and "Writers", P.J. 
Courtois et al, 1971 */ -#if !defined ccrw2_included -#define ccrw2_included +#if !defined soxr_ccrw2_included +#define soxr_ccrw2_included #if defined SOXR_LIB #include "internal.h" diff --git a/soxr/src/cr-core.c b/soxr/src/cr-core.c new file mode 100644 index 0000000..5355de3 --- /dev/null +++ b/soxr/src/cr-core.c @@ -0,0 +1,316 @@ +/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. + * + * Constant-rate resampling engine-specific code. */ + +#include +#include +#include +#include + +#include "filter.h" + +#if defined SOXR_LIB + #include "internal.h" + #include "cr.h" + #if CORE_TYPE & CORE_DBL + typedef double sample_t; + #if CORE_TYPE & CORE_SIMD_DFT + #define RDFT_CB _soxr_rdft64s_cb + #else + #define RDFT_CB _soxr_rdft64_cb + #endif + #else + typedef float sample_t; + #if CORE_TYPE & CORE_SIMD_DFT + #define RDFT_CB _soxr_rdft32s_cb + #else + #define RDFT_CB _soxr_rdft32_cb + #endif + #endif + + #if CORE_TYPE & (CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT) + #if CORE_TYPE & CORE_DBL + #include "util64s.h" + #include "dev64s.h" + #else + #include "util32s.h" + #include "dev32s.h" + #endif + #endif + + extern rdft_cb_table RDFT_CB; +#else + #define RDFT_CB 0 +#endif + + + +static void cubic_stage_fn(stage_t * p, fifo_t * output_fifo) +{ + sample_t const * input = stage_read_p(p); + int num_in = min(stage_occupancy(p), p->input_size); + int i, max_num_out = 1 + (int)(num_in * p->out_in_ratio); + sample_t * output = fifo_reserve(output_fifo, max_num_out); + + for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) { + sample_t const * s = input + p->at.integer; + double x = p->at.fraction * (1 / MULT32); + double b = .5*(s[1]+s[-1])-*s, a = (1/6.)*(s[2]-s[1]+s[-1]-*s-4*b); + double c = s[1]-*s-a-b; + output[i] = (sample_t)(p->mult * (((a*x + b)*x + c)*x + *s)); + } + assert(max_num_out - i >= 0); + fifo_trim_by(output_fifo, max_num_out - i); + fifo_read(&p->fifo, 
p->at.integer, NULL); + p->at.integer = 0; +} + + + +#if defined __AVX__ + #define DEFINED_AVX 1 +#else + #define DEFINED_AVX 0 +#endif + +#if defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86 + #define DEFINED_X86 1 +#else + #define DEFINED_X86 0 +#endif + +#if defined __arm__ + #define DEFINED_ARM 1 +#else + #define DEFINED_ARM 0 +#endif + + + +#if CORE_TYPE & CORE_DBL + #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_AVX) + #define SIMD_SSE 0 +#else + #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_X86) + #define SIMD_AVX 0 +#endif + +#define SIMD_NEON ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_ARM) + + + +#include "half-coefs.h" + +#if !(CORE_TYPE & CORE_SIMD_HALF) +#define FUNCTION_H h7 +#define CONVOLVE ____ __ _ +#include "half-fir.h" +#endif + +#define FUNCTION_H h8 +#define CONVOLVE ____ ____ +#include "half-fir.h" + +#define FUNCTION_H h9 +#define CONVOLVE ____ ____ _ +#include "half-fir.h" + +#if CORE_TYPE & CORE_DBL + #define FUNCTION_H h10 + #define CONVOLVE ____ ____ __ + #include "half-fir.h" + + #define FUNCTION_H h11 + #define CONVOLVE ____ ____ __ _ + #include "half-fir.h" + + #define FUNCTION_H h12 + #define CONVOLVE ____ ____ ____ + #include "half-fir.h" + + #define FUNCTION_H h13 + #define CONVOLVE ____ ____ ____ _ + #include "half-fir.h" +#endif + +static half_fir_info_t const half_firs[] = { +#if !(CORE_TYPE & CORE_SIMD_HALF) + { 7, half_fir_coefs_7 , h7 , 0 , 120.65f}, +#endif + { 8, half_fir_coefs_8 , h8 , 0 , 136.51f}, + { 9, half_fir_coefs_9 , h9 , 0 , 152.32f}, +#if CORE_TYPE & CORE_DBL + {10, half_fir_coefs_10, h10, 0 , 168.08f}, + {11, half_fir_coefs_11, h11, 0 , 183.79f}, + {12, half_fir_coefs_12, h12, 0 , 199.46f}, + {13, half_fir_coefs_13, h13, 0 , 215.12f}, +#endif +}; + +#undef SIMD_AVX +#undef SIMD_NEON +#undef SIMD_SSE + + + +#if CORE_TYPE & CORE_DBL + #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_AVX) + #define SIMD_SSE 0 +#else + #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_POLY) 
&& DEFINED_X86) + #define SIMD_AVX 0 +#endif + +#define SIMD_NEON ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_ARM) + + + +#define COEFS (sample_t * __restrict)p->shared->poly_fir_coefs +#define VAR_LENGTH p->n +#define VAR_CONVOLVE(n) while (j < (n)) _ +#define VAR_POLY_PHASE_BITS p->phase_bits + + + +#define FUNCTION vpoly0 +#define FIR_LENGTH VAR_LENGTH +#define CONVOLVE(n) VAR_CONVOLVE(n) +#include "poly-fir0.h" + +#define FUNCTION vpoly1 +#define COEF_INTERP 1 +#define PHASE_BITS VAR_POLY_PHASE_BITS +#define FIR_LENGTH VAR_LENGTH +#define CONVOLVE(n) VAR_CONVOLVE(n) +#include "poly-fir.h" + +#define FUNCTION vpoly2 +#define COEF_INTERP 2 +#define PHASE_BITS VAR_POLY_PHASE_BITS +#define FIR_LENGTH VAR_LENGTH +#define CONVOLVE(n) VAR_CONVOLVE(n) +#include "poly-fir.h" + +#define FUNCTION vpoly3 +#define COEF_INTERP 3 +#define PHASE_BITS VAR_POLY_PHASE_BITS +#define FIR_LENGTH VAR_LENGTH +#define CONVOLVE(n) VAR_CONVOLVE(n) +#include "poly-fir.h" + + + +#if !(CORE_TYPE & CORE_SIMD_POLY) + +#define poly_fir_convolve_U100 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ +#define FUNCTION U100_0 +#define FIR_LENGTH U100_l +#define CONVOLVE(n) poly_fir_convolve_U100 +#include "poly-fir0.h" + +#define u100_l 11 +#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _ +#define FUNCTION u100_0 +#define FIR_LENGTH u100_l +#define CONVOLVE(n) poly_fir_convolve_u100 +#include "poly-fir0.h" + +#define FUNCTION u100_1 +#define COEF_INTERP 1 +#define PHASE_BITS 8 +#define FIR_LENGTH u100_l +#define CONVOLVE(n) poly_fir_convolve_u100 +#include "poly-fir.h" + +#define FUNCTION u100_2 +#define COEF_INTERP 2 +#define PHASE_BITS 6 +#define FIR_LENGTH u100_l +#define CONVOLVE(n) poly_fir_convolve_u100 +#include "poly-fir.h" + +#endif + +#define u100_1_b 8 +#define u100_2_b 6 + + + +static poly_fir_t const poly_firs[] = { + {-1, {{0, vpoly0}, { 7.2f, vpoly1}, {5.0f, vpoly2}}}, + {-1, {{0, vpoly0}, { 9.4f, vpoly1}, {6.7f, vpoly2}}}, + {-1, {{0, vpoly0}, 
{12.4f, vpoly1}, {7.8f, vpoly2}}}, + {-1, {{0, vpoly0}, {13.6f, vpoly1}, {9.3f, vpoly2}}}, + {-1, {{0, vpoly0}, {10.5f, vpoly2}, {8.4f, vpoly3}}}, + {-1, {{0, vpoly0}, {11.85f,vpoly2}, {9.0f, vpoly3}}}, + + {-1, {{0, vpoly0}, { 8.0f, vpoly1}, {5.3f, vpoly2}}}, + {-1, {{0, vpoly0}, { 8.6f, vpoly1}, {5.7f, vpoly2}}}, + {-1, {{0, vpoly0}, {10.6f, vpoly1}, {6.75f,vpoly2}}}, + {-1, {{0, vpoly0}, {12.6f, vpoly1}, {8.6f, vpoly2}}}, + {-1, {{0, vpoly0}, { 9.6f, vpoly2}, {7.6f, vpoly3}}}, + {-1, {{0, vpoly0}, {11.4f, vpoly2}, {8.65f,vpoly3}}}, + +#if CORE_TYPE & CORE_SIMD_POLY + {10.62f, {{0, vpoly0}, {0, 0}, {0, 0}}}, + {-1, {{0, vpoly0}, {u100_1_b, vpoly1}, {u100_2_b, vpoly2}}}, +#else + {10.62f, {{U100_l, U100_0}, {0, 0}, {0, 0}}}, + {11.28f, {{u100_l, u100_0}, {u100_1_b, u100_1}, {u100_2_b, u100_2}}}, +#endif + {-1, {{0, vpoly0}, { 9, vpoly1}, { 6, vpoly2}}}, + {-1, {{0, vpoly0}, { 11, vpoly1}, { 7, vpoly2}}}, + {-1, {{0, vpoly0}, { 13, vpoly1}, { 8, vpoly2}}}, + {-1, {{0, vpoly0}, { 10, vpoly2}, { 8, vpoly3}}}, + {-1, {{0, vpoly0}, { 12, vpoly2}, { 9, vpoly3}}}, +}; + + + +static cr_core_t const cr_core = { + +#if CORE_TYPE & CORE_SIMD_POLY + {SIMD_ALIGNED_MALLOC, SIMD_ALIGNED_CALLOC, SIMD_ALIGNED_FREE}, +#else + {malloc, calloc, free}, +#endif + half_firs, array_length(half_firs), + 0, 0, + cubic_stage_fn, + poly_firs, &RDFT_CB +}; + + + +#if defined SOXR_LIB + +#include "soxr.h" + +static char const * rate_create(void * channel, void * shared, double io_ratio, + void * q_spec, void * r_spec, double scale) +{ + return _soxr_init(channel, shared, io_ratio, q_spec, r_spec, scale, + &cr_core, CORE_TYPE); +} + + + +static char const * id(void) {return CORE_STR;} + +#include "cb_t.h" + +control_block_t RATE_CB = { + _soxr_input, + _soxr_process, + _soxr_output, + _soxr_flush, + _soxr_close, + _soxr_delay, + _soxr_sizes, + rate_create, + 0, + id, +}; + +#endif diff --git a/soxr/src/cr.c b/soxr/src/cr.c new file mode 100644 index 0000000..eabe700 --- /dev/null +++ 
b/soxr/src/cr.c @@ -0,0 +1,600 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. + * + * Constant-rate resampling common code. */ + +#include +#include +#include +#include +#include +#include + +#include "filter.h" + +#if defined SOXR_LIB + #include "internal.h" + #define STATIC +#endif + +#include "cr.h" + +#define num_coefs4 ((core_flags&CORE_SIMD_POLY)? ((num_coefs+3)&~3) : num_coefs) + +#define coef_coef(C,T,x) \ + C((T*)result, interp_order, num_coefs4, j, x, num_coefs4 - 1 - i) + +#define STORE(C,T) { \ + if (interp_order > 2) coef_coef(C,T,3) = (T)d; \ + if (interp_order > 1) coef_coef(C,T,2) = (T)c; \ + if (interp_order > 0) coef_coef(C,T,1) = (T)b; \ + coef_coef(C,T,0) = (T)f0;} + +static real * prepare_poly_fir_coefs(double const * coefs, int num_coefs, + int num_phases, int interp_order, double multiplier, + core_flags_t core_flags, alloc_t const * mem) +{ + int i, j, length = num_coefs4 * num_phases * (interp_order + 1); + real * result = mem->calloc(1,(size_t)length << LOG2_SIZEOF_REAL(core_flags)); + double fm1 = coefs[0], f1 = 0, f2 = 0; + + for (i = num_coefs - 1; i >= 0; --i) + for (j = num_phases - 1; j >= 0; --j) { + double f0 = fm1, b = 0, c = 0, d = 0; /* = 0 to kill compiler warning */ + int pos = i * num_phases + j - 1; + fm1 = pos > 0 ? 
coefs[pos - 1] * multiplier : 0; + switch (interp_order) { + case 1: b = f1 - f0; break; + case 2: b = f1 - (.5 * (f2+f0) - f1) - f0; c = .5 * (f2+f0) - f1; break; + case 3: c=.5*(f1+fm1)-f0;d=(1/6.)*(f2-f1+fm1-f0-4*c);b=f1-f0-d-c; break; + default: assert(!interp_order); + } + switch (core_flags & 3) { + case 0: if (WITH_CR32 ) STORE(coef , float ); break; + case 1: if (WITH_CR64 ) STORE(coef , double); break; + case 2: if (WITH_CR32S) STORE(coef4, float ); break; + default:if (WITH_CR64S) STORE(coef4, double); break; + } + f2 = f1, f1 = f0; + } + return result; +} + +#undef STORE +#undef coef_coef + +#define IS_FLOAT32 (WITH_CR32 || WITH_CR32S) && \ + (!(WITH_CR64 || WITH_CR64S) || sizeof_real == sizeof(float)) +#define WITH_FLOAT64 WITH_CR64 || WITH_CR64S + +static void dft_stage_fn(stage_t * p, fifo_t * output_fifo) +{ + real * output, * dft_out; + int i, j, num_in = max(0, fifo_occupancy(&p->fifo)); + rate_shared_t const * s = p->shared; + dft_filter_t const * f = &s->dft_filter[p->dft_filter_num]; + int const overlap = f->num_taps - 1; + + if (p->at.integer + p->L * num_in >= f->dft_length) { + rdft_cb_table const * const RDFT_CB = p->rdft_cb; + size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(p->core_flags); + div_t divd = div(f->dft_length - overlap - p->at.integer + p->L - 1, p->L); + real const * input = fifo_read_ptr(&p->fifo); + fifo_read(&p->fifo, divd.quot, NULL); + num_in -= divd.quot; + + output = fifo_reserve(output_fifo, f->dft_length); + dft_out = (p->core_flags & CORE_SIMD_DFT)? p->dft_out : output; + + if (lsx_is_power_of_2(p->L)) { /* F-domain */ + int portion = f->dft_length / p->L; + memcpy(dft_out, input, (unsigned)portion * sizeof_real); + rdft_oforward(portion, f->dft_forward_setup, dft_out, p->dft_scratch); + if (IS_FLOAT32) { +#define dft_out ((float *)dft_out) + for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. 
*/ + dft_out[i] = dft_out[(portion << 1) - i], + dft_out[i+1] = -dft_out[(portion << 1) - i + 1]; + dft_out[portion] = dft_out[1]; + dft_out[portion + 1] = 0; + dft_out[1] = dft_out[0]; +#undef dft_out + } + else if (WITH_FLOAT64) { +#define dft_out ((double *)dft_out) + for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */ + dft_out[i] = dft_out[(portion << 1) - i], + dft_out[i+1] = -dft_out[(portion << 1) - i + 1]; + dft_out[portion] = dft_out[1]; + dft_out[portion + 1] = 0; + dft_out[1] = dft_out[0]; +#undef dft_out + } + + for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) { + memcpy((char *)dft_out + (size_t)i * sizeof_real, dft_out, (size_t)portion * sizeof_real); + if (IS_FLOAT32) + #define dft_out ((float *)dft_out) + dft_out[i + 1] = 0; + #undef dft_out + else if (WITH_FLOAT64) + #define dft_out ((double *)dft_out) + dft_out[i + 1] = 0; + #undef dft_out + } + + if (p->step.integer > 0) { + rdft_reorder_back(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch); + } + } else { + if (p->L == 1) + memcpy(dft_out, input, (size_t)f->dft_length * sizeof_real); + else { + + memset(dft_out, 0, (size_t)f->dft_length * sizeof_real); + if (IS_FLOAT32) + for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L) + ((float *)dft_out)[i] = ((float *)input)[j]; + else if (WITH_FLOAT64) + for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L) + ((double *)dft_out)[i] = ((double *)input)[j]; + p->at.integer = p->L - 1 - divd.rem; + } + if (p->step.integer > 0) + rdft_forward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch); + else + rdft_oforward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch); + } + + if (p->step.integer > 0) { + rdft_convolve(f->dft_length, f->dft_backward_setup, dft_out, f->coefs); + rdft_backward(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch); + if ((p->core_flags & CORE_SIMD_DFT) && p->step.integer == 1) + memcpy(output, dft_out, 
(size_t)f->dft_length * sizeof_real); + if (p->step.integer != 1) { + if (IS_FLOAT32) + for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j, + i += p->step.integer) + ((float *)output)[j] = ((float *)dft_out)[i]; + else if (WITH_FLOAT64) + for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j, + i += p->step.integer) + ((double *)output)[j] = ((double *)dft_out)[i]; + p->remM = i - (f->dft_length - overlap); + fifo_trim_by(output_fifo, f->dft_length - j); + } + else fifo_trim_by(output_fifo, overlap); + } + else { /* F-domain */ + int m = -p->step.integer; + rdft_convolve_portion(f->dft_length >> m, dft_out, f->coefs); + rdft_obackward(f->dft_length >> m, f->dft_backward_setup, dft_out, p->dft_scratch); + if (p->core_flags & CORE_SIMD_DFT) + memcpy(output, dft_out, (size_t)(f->dft_length >> m) * sizeof_real); + fifo_trim_by(output_fifo, (((1 << m) - 1) * f->dft_length + overlap) >>m); + } + (rdft_cb_table const *)RDFT_CB; + } + p->input_size = (f->dft_length - p->at.integer + p->L - 1) / p->L; +} + +/* Set to 4 x nearest power of 2 or half of that */ +/* if danger of causing too many cache misses. */ +static int set_dft_length(int num_taps, int min, int large) +{ + double d = log((double)num_taps) / log(2.); + return 1 << range_limit((int)(d + 2.77), min, max((int)(d + 1.77), large)); +} + +static void dft_stage_init( + unsigned instance, double Fp, double Fs, double Fn, double att, + double phase_response, stage_t * p, int L, int M, double * multiplier, + unsigned min_dft_size, unsigned large_dft_size, core_flags_t core_flags, + rdft_cb_table const * rdft_table) +{ + rdft_cb_table const * const RDFT_CB = rdft_table; + dft_filter_t * f = &p->shared->dft_filter[instance]; + int num_taps = 0, dft_length = f->dft_length, i, offset; + bool f_domain_m = abs(3-M) == 1 && Fs <= 1; + size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(core_flags); + + if (!dft_length) { + int k = phase_response == 50 && lsx_is_power_of_2(L) && Fn == L? 
L << 1 : 4; + double m, * h = lsx_design_lpf(Fp, Fs, Fn, att, &num_taps, -k, -1.); + + if (phase_response != 50) + lsx_fir_to_phase(&h, &num_taps, &f->post_peak, phase_response); + else f->post_peak = num_taps / 2; + + dft_length = set_dft_length(num_taps, (int)min_dft_size, (int)large_dft_size); + f->coefs = rdft_calloc((size_t)dft_length, sizeof_real); + offset = dft_length - num_taps + 1; + m = (1. / dft_length) * rdft_multiplier() * L * *multiplier; + if (IS_FLOAT32) for (i = 0; i < num_taps; ++i) + ((float *)f->coefs)[(i + offset) & (dft_length - 1)] =(float)(h[i] * m); + else if (WITH_FLOAT64) for (i = 0; i < num_taps; ++i) + ((double *)f->coefs)[(i + offset) & (dft_length - 1)] = h[i] * m; + free(h); + } + + if (rdft_flags() & RDFT_IS_SIMD) + p->dft_out = rdft_malloc(sizeof_real * (size_t)dft_length); + if (rdft_flags() & RDFT_NEEDS_SCRATCH) + p->dft_scratch = rdft_malloc(2 * sizeof_real * (size_t)dft_length); + + if (!f->dft_length) { + void * coef_setup = rdft_forward_setup(dft_length); + int Lp = lsx_is_power_of_2(L)? L : 1; + int Mp = f_domain_m? M : 1; + f->dft_forward_setup = rdft_forward_setup(dft_length / Lp); + f->dft_backward_setup = rdft_backward_setup(dft_length / Mp); + if (Mp == 1) + rdft_forward(dft_length, coef_setup, f->coefs, p->dft_scratch); + else + rdft_oforward(dft_length, coef_setup, f->coefs, p->dft_scratch); + rdft_delete_setup(coef_setup); + f->num_taps = num_taps; + f->dft_length = dft_length; + lsx_debug("fir_len=%i dft_length=%i Fp=%g Fs=%g Fn=%g att=%g %i/%i", + num_taps, dft_length, Fp, Fs, Fn, att, L, M); + } + *multiplier = 1; + p->out_in_ratio = (double)L / M; + p->core_flags = core_flags; + p->rdft_cb = rdft_table; + p->fn = dft_stage_fn; + p->preload = f->post_peak / L; + p->at.integer = f->post_peak % L; + p->L = L; + p->step.integer = f_domain_m? 
-M/2 : M; + p->dft_filter_num = instance; + p->block_len = f->dft_length - (f->num_taps - 1); + p->phase0 = p->at.integer / p->L; + p->input_size = (f->dft_length - p->at.integer + p->L - 1) / p->L; +} + +static struct half_fir_info const * find_half_fir( + struct half_fir_info const * firs, size_t len, double att) +{ + size_t i; + for (i = 0; i + 1 < len && att > firs[i].att; ++i); + return &firs[i]; +} + +#define have_pre_stage (preM * preL != 1) +#define have_arb_stage (arbM * arbL != 1) +#define have_post_stage (postM * postL != 1) + +#include "soxr.h" + +STATIC char const * _soxr_init( + rate_t * const p, /* Per audio channel. */ + rate_shared_t * const shared, /* By channels undergoing same rate change. */ + double const io_ratio, /* Input rate divided by output rate. */ + soxr_quality_spec_t const * const q_spec, + soxr_runtime_spec_t const * const r_spec, + double multiplier, /* Linear gain to apply during conversion. */ + cr_core_t const * const core, + core_flags_t const core_flags) +{ + size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(core_flags); + double const tolerance = 1 + 1e-5; + + double bits = q_spec->precision; + rolloff_t const rolloff = (rolloff_t)(q_spec->flags & 3); + int interpolator = (int)(r_spec->flags & 3) - 1; + double const Fp0 = q_spec->passband_end, Fs0 = q_spec->stopband_begin; + double const phase_response = q_spec->phase_response, tbw0 = Fs0-Fp0; + + bool const maintain_3dB_pt = !!(q_spec->flags & SOXR_MAINTAIN_3DB_PT); + double tbw_tighten = 1, alpha; + #define tighten(x) (Fs0-(Fs0-(x))*tbw_tighten) + + double arbM = io_ratio, Fn1, Fp1 = Fp0, Fs1 = Fs0, bits1 = min(bits,33); + double att = (bits1 + 1) * linear_to_dB(2.), attArb = att; /* +1: pass+stop */ + int preL = 1, preM = 1, shr = 0, arbL = 1, postL = 1, postM = 1; + bool upsample=false, rational=false, iOpt=!(r_spec->flags&SOXR_NOSMALLINTOPT); + bool lq_bits= (q_spec->flags & SOXR_PROMOTE_TO_LQ)? 
bits <= 16 : bits == 16; + bool lq_Fp0 = (q_spec->flags & SOXR_PROMOTE_TO_LQ)? Fp0<=lq_bw0 : Fp0==lq_bw0; + int n = 0, i, mode = lq_bits && rolloff == rolloff_medium? io_ratio > 1 || + phase_response != 50 || !lq_Fp0 || Fs0 != 1 : ((int)ceil(bits1) - 6) / 4; + struct half_fir_info const * half_fir_info; + stage_t * s; + + if (io_ratio < 1 && Fs0 - 1 > 1 - Fp0 / tolerance) + return "imaging greater than rolloff"; + if (.002 / tolerance > tbw0 || tbw0 > .5 * tolerance) + return "transition bandwidth not in [0.2,50] % of nyquist"; + if (.5 / tolerance > Fp0 || Fs0 > 1.5 * tolerance) + return "transition band not within [50,150] % of nyquist"; + if (bits!=0 && (15 > bits || bits > 33)) + return "precision not in [15,33] bits"; + if (io_ratio <= 0) + return "resampling factor not positive"; + if (0 > phase_response || phase_response > 100) + return "phase response not in [0=min-phase,100=max-phase] %"; + + p->core = core; + p->io_ratio = io_ratio; + if (bits!=0) while (!n++) { /* Determine stages: */ + int try, L, M, x, maxL = interpolator > 0? 1 : mode? 2048 : + (int)ceil(r_spec->coef_size_kbytes * 1000. / (U100_l * (int)sizeof_real)); + double d, epsilon = 0, frac; + upsample = arbM < 1; + for (i = (int)(.5 * arbM), shr = 0; i >>= 1; arbM *= .5, ++shr); + preM = upsample || (arbM > 1.5 && arbM < 2); + postM = 1 + (arbM > 1 && preM), arbM /= postM; + preL = 1 + (!preM && arbM < 2) + (upsample && mode), arbM *= preL; + if ((frac = arbM - (int)arbM)!=0) + epsilon = fabs(floor(frac * MULT32 + .5) / (frac * MULT32) - 1); + for (i = 1, rational = frac==0; i <= maxL && !rational; ++i) { + d = frac * i, try = (int)(d + .5); + if ((rational = fabs(try / d - 1) <= epsilon)) { /* No long doubles! 
*/ + if (try == i) + arbM = ceil(arbM), shr += x = arbM > 3, arbM /= 1 + x; + else arbM = i * (int)arbM + try, arbL = i; + } + } + L = preL * arbL, M = (int)(arbM * postM), x = (L|M)&1, L >>= !x, M >>= !x; + if (iOpt && postL == 1 && (d = preL * arbL / arbM) > 4 && d != 5) { + for (postL = 4, i = (int)(d / 16); (i >>= 1) && postL < 256; postL <<= 1); + arbM = arbM * postL / arbL / preL, arbL = 1, n = 0; + } else if (rational && (max(L, M) < 3 + 2 * iOpt || L * M < 6 * iOpt)) + preL = L, preM = M, arbM = arbL = postM = 1; + if (!mode && (!rational || !n)) + ++mode, n = 0; + } + + p->num_stages = shr + have_pre_stage + have_arb_stage + have_post_stage; + if (!p->num_stages && multiplier != 1) { + bits = arbL = 0; /* Use cubic_stage in this case. */ + ++p->num_stages; + } + p->stages = calloc((size_t)p->num_stages + 1, sizeof(*p->stages)); + if (!p->stages) + return "out of memory"; + for (i = 0; i < p->num_stages; ++i) { + p->stages[i].num = i; + p->stages[i].shared = shared; + p->stages[i].input_size = 8192; + } + p->stages[0].is_input = true; + + alpha = postM / (io_ratio * (postL << 0)); + + if ((n = p->num_stages) > 1) { /* Att. budget: */ + if (have_arb_stage) + att += linear_to_dB(2.), attArb = att, --n; + att += linear_to_dB((double)n); + } + + half_fir_info = find_half_fir(core->half_firs, core->half_firs_len, att); + for (i = 0, s = p->stages; i < shr; ++i, ++s) { + s->fn = half_fir_info->fn; + s->coefs = half_fir_info->coefs; + s->n = half_fir_info->num_coefs; + s->pre_post = 4 * s->n; + s->preload = s->pre = s->pre_post >> 1; + } + + if (have_pre_stage) { + if (maintain_3dB_pt && have_post_stage) { /* Trans. bands overlapping. */ + double x = tbw0 * lsx_inv_f_resp(-3., att); + x = -lsx_f_resp(x / (max(2 * alpha - Fs0, alpha) - Fp0), att); + if (x > .035) { + tbw_tighten = ((4.3074e-3 - 3.9121e-4 * x) * x - .040009) * x + 1.0014; + lsx_debug("tbw_tighten=%g (%gdB)", tbw_tighten, x); + } + } + Fn1 = preM? 
max(preL, preM) : arbM / arbL; + dft_stage_init(0, tighten(Fp1), Fs1, Fn1, att, phase_response, s++, preL, + max(preM, 1), &multiplier, r_spec->log2_min_dft_size, + r_spec->log2_large_dft_size, core_flags, core->rdft_cb); + Fp1 /= Fn1, Fs1 /= Fn1; + } + + if (bits==0 && have_arb_stage) { /* `Quick' cubic arb stage: */ + s->fn = core->cubic_stage_fn; + s->mult = multiplier, multiplier = 1; + s->step.whole = (int64_t)(arbM * MULT32 + .5); + s->pre_post = max(3, s->step.integer); + s->preload = s->pre = 1; + s->out_in_ratio = MULT32 / (double)s->step.whole; + } + else if (have_arb_stage) { /* Higher quality arb stage: */ + static const float rolloffs[] = {-.01f, -.3f, 0, -.103f}; + poly_fir_t const * f = &core->poly_firs[6*(upsample+!!preM)+mode-!upsample]; + int order, num_coefs = (int)f->interp[0].scalar, phase_bits, phases; + size_t coefs_size; + double at, Fp = Fp1, Fs, Fn, mult = upsample? 1 : arbM / arbL; + poly_fir1_t const * f1; + + if (!upsample && preM) + Fn = 2 * mult, Fs = 3 + fabs(Fs1 - 1); + else Fn = 1, Fs = 2 - (mode? Fp1 + (Fs1 - Fp1) * .7 : Fs1); + + if (mode) + Fp = Fs - (Fs - Fp) / (1 - lsx_inv_f_resp(rolloffs[rolloff], attArb)); + + i = (interpolator < 0? !rational : max(interpolator, !rational)) - 1; + do { + f1 = &f->interp[++i]; + assert(f1->fn); + if (i) + arbM /= arbL, arbL = 1, rational = false; + phase_bits = (int)ceil(f1->scalar - log(mult)/log(2.)); + phases = !rational? 
(1 << phase_bits) : arbL; + if (f->interp[0].scalar==0) { + int phases0 = max(phases, 19), n0 = 0; + lsx_design_lpf(Fp, Fs, -Fn, attArb, &n0, phases0, f->beta); + num_coefs = n0 / phases0 + 1, num_coefs += num_coefs & !preM; + } + if ((num_coefs & 1) && rational && (arbL & 1)) + phases <<= 1, arbL <<= 1, arbM *= 2; + at = arbL * (s->phase0 = .5 * (num_coefs & 1)); + order = i + (i && mode > 4); + coefs_size = (size_t)(num_coefs4 * phases * (order+1)) * sizeof_real; + } while (interpolator < 0 && i < 2 && f->interp[i+1].fn && + coefs_size / 1000 > r_spec->coef_size_kbytes); + + if (!s->shared->poly_fir_coefs) { + int num_taps = num_coefs * phases - 1; + double * coefs = lsx_design_lpf( + Fp, Fs, Fn, attArb, &num_taps, phases, f->beta); + s->shared->poly_fir_coefs = prepare_poly_fir_coefs( + coefs, num_coefs, phases, order, multiplier, core_flags, &core->mem); + lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%.3gk", + num_coefs, phases, order, (double)coefs_size / 1000.); + free(coefs); + } + multiplier = 1; + s->fn = f1->fn; + s->pre_post = num_coefs4 - 1; + s->preload = ((num_coefs - 1) >> 1) + (num_coefs4 - num_coefs); + s->n = num_coefs4; + s->phase_bits = phase_bits; + s->L = arbL; + s->use_hi_prec_clock = + mode>1 && (q_spec->flags & SOXR_HI_PREC_CLOCK) && !rational; +#if WITH_FLOAT_STD_PREC_CLOCK + if (order && !s->use_hi_prec_clock) { + s->at.flt = at; + s->step.flt = arbM; + s->out_in_ratio = (double)(arbL / s->step.flt); + } else +#endif + { + s->at.whole = (int64_t)(at * MULT32 + .5); +#if WITH_HI_PREC_CLOCK + if (s->use_hi_prec_clock) { + double M = arbM * MULT32; + s->at.fix.ls.parts.ms = 0x80000000ul; + s->step.whole = (int64_t)M; + M -= (double)s->step.whole; + M *= MULT32 * MULT32; + s->step.fix.ls.all = (uint64_t)M; + } else +#endif + s->step.whole = (int64_t)(arbM * MULT32 + .5); + s->out_in_ratio = MULT32 * arbL / (double)s->step.whole; + } + ++s; + } + + if (have_post_stage) + dft_stage_init(1, tighten(Fp0 / (upsample? alpha : 1)), upsample? 
max(2 - + Fs0 / alpha, 1) : Fs0, (double)max(postL, postM), att, phase_response, + s++, postL, postM, &multiplier, r_spec->log2_min_dft_size, + r_spec->log2_large_dft_size, core_flags, core->rdft_cb); + + lsx_debug("%g: >>%i %i/%i %i/%g %i/%i (%x)", 1/io_ratio, + shr, preL, preM, arbL, arbM, postL, postM, core_flags); + + for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) { + fifo_create(&s->fifo, (int)sizeof_real); + memset(fifo_reserve(&s->fifo, s->preload), 0, + sizeof_real * (size_t)s->preload); + lsx_debug_more("%5i|%-5i preload=%i remL=%i", + s->pre, s->pre_post-s->pre, s->preload, s->at.integer); + } + fifo_create(&s->fifo, (int)sizeof_real); + return 0; +} + +static bool stage_process(stage_t * stage, bool flushing) +{ + fifo_t * fifo = &stage->fifo; + bool done = false; + int want; + while (!done && (want = stage->input_size - fifo_occupancy(fifo)) > 0) { + if (stage->is_input) { + if (flushing) + memset(fifo_reserve(fifo, want), 0, fifo->item_size * (size_t)want); + else done = true; + } + else done = stage_process(stage - 1, flushing); + } + stage->fn(stage, &stage[1].fifo); + return done && fifo_occupancy(fifo) < stage->input_size; +} + +STATIC void _soxr_process(void * P, size_t olen) +{ + rate_t *p = P; + int const n = p->flushing? min(-(int)p->samples_out, (int)olen) : (int)olen; + stage_t * stage = &p->stages[p->num_stages]; + fifo_t * fifo = &stage->fifo; + bool done = false; + while (!done && fifo_occupancy(fifo) < (int)n) + done = stage->is_input || stage_process(stage - 1, p->flushing); +} + +STATIC void * _soxr_input(void * P, void * samples, size_t n) +{ + rate_t *p = P; + if (p->flushing) + return 0; + p->samples_in += (int64_t)n; + return fifo_write(&p->stages[0].fifo, (int)n, samples); +} + +STATIC void const * _soxr_output(void * P, void * samples, size_t * n0) +{ + rate_t *p = P; + fifo_t * fifo = &p->stages[p->num_stages].fifo; + int n = p->flushing? 
min(-(int)p->samples_out, (int)*n0) : (int)*n0; + p->samples_out += n = min(n, fifo_occupancy(fifo)); + return fifo_read(fifo, (int)(*n0 = (size_t)n), samples); +} + +STATIC void _soxr_flush(void * P) +{ + rate_t *p = P; + if (p->flushing) return; + p->samples_out -= (int64_t)((double)p->samples_in / p->io_ratio + .5); + p->samples_in = 0; + p->flushing = true; +} + +STATIC void _soxr_close(void * P) +{ + rate_t *p = P; + if (p->stages) { + rdft_cb_table const * const RDFT_CB = p->core->rdft_cb; + rate_shared_t * shared = p->stages[0].shared; + int i; + + for (i = 0; i <= p->num_stages; ++i) { + stage_t * s = &p->stages[i]; + rdft_free(s->dft_scratch); + rdft_free(s->dft_out); + fifo_delete(&s->fifo); + } + if (shared) { + for (i = 0; i < 2; ++i) { + dft_filter_t * f= &shared->dft_filter[i]; + rdft_free(f->coefs); + rdft_delete_setup(f->dft_forward_setup); + rdft_delete_setup(f->dft_backward_setup); + } + p->core->mem.free(shared->poly_fir_coefs); + memset(shared, 0, sizeof(*shared)); + } + free(p->stages); + (rdft_cb_table const *)RDFT_CB; + } +} + +#if defined SOXR_LIB +STATIC double _soxr_delay(void * P) +{ + rate_t *p = P; + return (double)p->samples_in / p->io_ratio - (double)p->samples_out; +} + +STATIC void _soxr_sizes(size_t * shared, size_t * channel) +{ + *shared = sizeof(rate_shared_t); + *channel = sizeof(rate_t); +} +#endif diff --git a/soxr/src/cr.h b/soxr/src/cr.h new file mode 100644 index 0000000..880eb1d --- /dev/null +++ b/soxr/src/cr.h @@ -0,0 +1,178 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ + +#if !defined soxr_cr_included +#define soxr_cr_included + +#define FIFO_SIZE_T int +#include "fifo.h" + +typedef void real; /* float or double */ +struct stage; +typedef void (* stage_fn_t)(struct stage * input, fifo_t * output); +typedef struct half_fir_info { + int num_coefs; + real const * coefs; + stage_fn_t fn, dfn; + float att; +} half_fir_info_t; +typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t; +typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t; + +#define U100_l 42 +#define MULT32 (65536. * 65536.) + +/* Conceptually: coef_p is &coefs[num_phases][fir_len][interp_order+1]: */ +#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) (coef_p)[\ + (fir_len) * ((interp_order) + 1) * (phase_num) + \ + ((interp_order) + 1) * (fir_coef_num) + \ + ((interp_order) - (coef_interp_num))] + +/* Conceptually: coef_p is &coefs[num_phases][fir_len/4][interp_order+1][4]: */ +#define coef4(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) (coef_p)[\ + (fir_len) * ((interp_order) + 1) * (phase_num) + \ + ((interp_order) + 1) * ((fir_coef_num) & ~3) + \ + 4 * ((interp_order) - (coef_interp_num)) + \ + ((fir_coef_num) & 3)] + +typedef union { /* Int64 in parts */ + #if HAVE_BIGENDIAN + struct {int32_t ms; uint32_t ls;} parts; + #else + struct {uint32_t ls; int32_t ms;} parts; + #endif + int64_t all; +} int64p_t; + +typedef union { /* Uint64 in parts */ + #if HAVE_BIGENDIAN + struct {uint32_t ms, ls;} parts; + #else + struct {uint32_t ls, ms;} parts; + #endif + uint64_t all; +} uint64p_t; + +typedef struct { + int dft_length, num_taps, post_peak; + void * dft_forward_setup, * dft_backward_setup; + real * coefs; +} dft_filter_t; + +typedef struct { /* So generated filter coefs may be shared between channels */ + real * poly_fir_coefs; + dft_filter_t dft_filter[2]; +} rate_shared_t; + +typedef double float_step_t; /* Or long double or __float128. 
*/ + +typedef union { /* Fixed point arithmetic */ + struct {uint64p_t ls; int64p_t ms;} fix; /* Hi-prec has ~96 bits. */ + float_step_t flt; +} step_t; + +#define integer fix.ms.parts.ms +#define fraction fix.ms.parts.ls +#define whole fix.ms.all + +#define CORE_DBL 1 +#define CORE_SIMD_POLY 2 +#define CORE_SIMD_HALF 4 +#define CORE_SIMD_DFT 8 +#define LOG2_SIZEOF_REAL(core_flags) (2 + ((core_flags) & 1)) + +typedef int core_flags_t; + +#if defined SOXR_LIB +#include "rdft_t.h" +#else +typedef void fn_t; +#endif + +typedef struct stage { + int num; + + /* Common to all stage types: */ + core_flags_t core_flags; + stage_fn_t fn; + fifo_t fifo; + int pre; /* Number of past samples to store */ + int pre_post; /* pre + number of future samples to store */ + int preload; /* Number of zero samples to pre-load the fifo */ + double out_in_ratio; /* For buffer management. */ + int input_size; + bool is_input; + + /* For a stage with variable (run-time generated) filter coefs: */ + rdft_cb_table const * rdft_cb; + rate_shared_t * shared; + unsigned dft_filter_num; /* Which, if any, of the 2 DFT filters to use */ + real * dft_scratch; + float * dft_out; + real const * coefs; + + /* For a stage with variable L/M: */ + step_t at, step; + bool use_hi_prec_clock; + int L, remM; + int n, phase_bits, block_len; + double mult, phase0; +} stage_t; + +#define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post) +#define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre) + +#define lq_bw0 (1385/2048.) /* ~.67625, FP exact. 
*/ + +typedef enum {rolloff_small, rolloff_medium, rolloff_none} rolloff_t; + +typedef struct { + void * (* alloc)(size_t); + void * (* calloc)(size_t, size_t); + void (* free)(void *); +} alloc_t; + +typedef struct { + alloc_t mem; + half_fir_info_t const * half_firs; + size_t half_firs_len; + half_fir_info_t const * doub_firs; + size_t doub_firs_len; + stage_fn_t cubic_stage_fn; + poly_fir_t const * poly_firs; + rdft_cb_table * rdft_cb; +} cr_core_t; + +typedef struct rate rate_t; +struct rate { + cr_core_t const * core; + double io_ratio; + int64_t samples_in, samples_out; + int num_stages, flushing; + stage_t * stages; +}; + +#if defined SOXR_LIB + +#include "soxr.h" + +char const * _soxr_init( + rate_t * const p, /* Per audio channel. */ + rate_shared_t * const shared, /* Between channels (undergoing same rate change)*/ + double const io_ratio, /* Input rate divided by output rate. */ + soxr_quality_spec_t const * const q_spec, + soxr_runtime_spec_t const * const r_spec, + double multiplier, /* Linear gain to apply during conversion. 1 */ + cr_core_t const * const core, + core_flags_t const); + +void _soxr_process(void * p, size_t olen); +void * _soxr_input(void * p, void * samples, size_t n); +void const * _soxr_output(void * p, void * samples, size_t * n0); +void _soxr_flush(void * p); +void _soxr_close(void * p); +double _soxr_delay(void * p); +void _soxr_sizes(size_t * shared, size_t * channel); +#endif + +#endif diff --git a/soxr/src/cr32.c b/soxr/src/cr32.c new file mode 100644 index 0000000..b9eb264 --- /dev/null +++ b/soxr/src/cr32.c @@ -0,0 +1,8 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ + +#define RATE_CB _soxr_rate32_cb +#define CORE_STR "cr32" + +#define CORE_TYPE 0 +#include "cr-core.c" diff --git a/soxr/src/cr32s.c b/soxr/src/cr32s.c new file mode 100644 index 0000000..5de2a43 --- /dev/null +++ b/soxr/src/cr32s.c @@ -0,0 +1,8 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#define RATE_CB _soxr_rate32s_cb +#define CORE_STR "cr32s" + +#define CORE_TYPE (CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT) +#include "cr-core.c" diff --git a/soxr/src/cr64.c b/soxr/src/cr64.c new file mode 100644 index 0000000..518cdd7 --- /dev/null +++ b/soxr/src/cr64.c @@ -0,0 +1,8 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#define RATE_CB _soxr_rate64_cb +#define CORE_STR "cr64" + +#define CORE_TYPE CORE_DBL +#include "cr-core.c" diff --git a/soxr/src/cr64s.c b/soxr/src/cr64s.c new file mode 100644 index 0000000..5dcd6f1 --- /dev/null +++ b/soxr/src/cr64s.c @@ -0,0 +1,8 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#define RATE_CB _soxr_rate64s_cb +#define CORE_STR "cr64s" + +#define CORE_TYPE (CORE_DBL|CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT) +#include "cr-core.c" diff --git a/soxr/src/data-io.c b/soxr/src/data-io.c index 1cd8e7f..fb61675 100644 --- a/soxr/src/data-io.c +++ b/soxr/src/data-io.c @@ -1,4 +1,4 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ #include @@ -14,8 +14,8 @@ unsigned i; \ size_t j; \ T const * src = *src0; \ - if (ch > 1) \ - for (j = 0; j < n; ++j) for (i = 0; i < ch; ++i) dest[i][j] = (DEINTERLEAVE_TO)*src++; \ + if (ch > 1) for (j = 0; j < n; ++j) \ + for (i = 0; i < ch; ++i) dest[i][j] = (DEINTERLEAVE_TO)*src++; \ else if (flag) memcpy(dest[0], src, n * sizeof(T)), src = &src[n]; \ else for (j = 0; j < n; dest[0][j++] = (DEINTERLEAVE_TO)*src++); \ *src0 = src; \ @@ -23,7 +23,7 @@ -#if HAVE_DOUBLE_PRECISION +#if WITH_CR64 || WITH_CR64S void _soxr_deinterleave(double * * dest, /* Round/clipping not needed here */ soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch) { @@ -40,7 +40,7 @@ void _soxr_deinterleave(double * * dest, /* Round/clipping not needed here */ -#if HAVE_SINGLE_PRECISION +#if WITH_CR32 || WITH_CR32S || WITH_VR32 void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */ soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch) { @@ -60,35 +60,6 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */ #include "rint.h" -#if HAVE_FENV_H - #include - #define fe_test_invalid() fetestexcept(FE_INVALID) - #define fe_clear_invalid() feclearexcept(FE_INVALID) -#elif defined _MSC_VER - #define FE_INVALID 1 - #if defined _WIN64 - #include - #define fe_test_invalid() (_statusfp() & _SW_INVALID) - #define fe_clear_invalid _clearfp /* FIXME clears all */ - #else - static __inline int fe_test_invalid() - { - short status_word; - __asm fnstsw status_word - return status_word & FE_INVALID; - } - - static __inline int fe_clear_invalid() - { - int16_t status[14]; - __asm fnstenv status - status[2] &= ~FE_INVALID; - __asm fldenv status - return 0; - } - #endif -#endif - #if defined FE_INVALID && defined FPU_RINT32 && defined __STDC_VERSION__ @@ -97,13 +68,13 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */ #endif #endif -#if HAVE_DOUBLE_PRECISION +#if WITH_CR64 || WITH_CR64S #define 
FLOATX double #define LSX_RINT_CLIP_2 lsx_rint32_clip_2 #define LSX_RINT_CLIP lsx_rint32_clip #define RINT_CLIP rint32_clip -#define RINT rint32 +#define RINT rint32D #if defined FPU_RINT32 #define FPU_RINT #endif @@ -114,7 +85,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */ #define LSX_RINT_CLIP_2 lsx_rint16_clip_2 #define LSX_RINT_CLIP lsx_rint16_clip #define RINT_CLIP rint16_clip -#define RINT rint16 +#define RINT rint16D #if defined FPU_RINT16 #define FPU_RINT #endif @@ -125,7 +96,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */ #define LSX_RINT_CLIP_2 lsx_rint16_clip_2_dither #define LSX_RINT_CLIP lsx_rint16_clip_dither #define RINT_CLIP rint16_clip_dither -#define RINT rint16 +#define RINT rint16D #if defined FPU_RINT16 #define FPU_RINT #endif @@ -139,13 +110,13 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */ -#if HAVE_SINGLE_PRECISION +#if WITH_CR32 || WITH_CR32S || WITH_VR32 #define FLOATX float #define LSX_RINT_CLIP_2 lsx_rint32_clip_2_f #define LSX_RINT_CLIP lsx_rint32_clip_f #define RINT_CLIP rint32_clip_f -#define RINT rint32 +#define RINT rint32F #if defined FPU_RINT32 #define FPU_RINT #endif @@ -156,7 +127,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */ #define LSX_RINT_CLIP_2 lsx_rint16_clip_2_f #define LSX_RINT_CLIP lsx_rint16_clip_f #define RINT_CLIP rint16_clip_f -#define RINT rint16 +#define RINT rint16F #if defined FPU_RINT16 #define FPU_RINT #endif @@ -167,7 +138,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */ #define LSX_RINT_CLIP_2 lsx_rint16_clip_2_dither_f #define LSX_RINT_CLIP lsx_rint16_clip_dither_f #define RINT_CLIP rint16_clip_dither_f -#define RINT rint16 +#define RINT rint16D #if defined FPU_RINT16 #define FPU_RINT #endif @@ -199,7 +170,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */ return 0; \ } while (0) -#if 
HAVE_DOUBLE_PRECISION +#if WITH_CR64 || WITH_CR64S size_t /* clips */ _soxr_interleave(soxr_datatype_t data_type, void * * dest0, double const * const * src, size_t n, unsigned ch, unsigned long * seed) { @@ -225,7 +196,7 @@ size_t /* clips */ _soxr_interleave(soxr_datatype_t data_type, void * * dest0, } #endif -#if HAVE_SINGLE_PRECISION +#if WITH_CR32 || WITH_CR32S || WITH_VR32 size_t /* clips */ _soxr_interleave_f(soxr_datatype_t data_type, void * * dest0, float const * const * src, size_t n, unsigned ch, unsigned long * seed) { diff --git a/soxr/src/dev32s.h b/soxr/src/dev32s.h new file mode 100644 index 0000000..7edae86 --- /dev/null +++ b/soxr/src/dev32s.h @@ -0,0 +1,54 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#if !defined soxr_dev32s_included +#define soxr_dev32s_included + +#if defined __GNUC__ + #define SIMD_INLINE(T) static __inline T __attribute__((always_inline)) + #define vAlign __attribute__((aligned (16))) +#elif defined _MSC_VER + #define SIMD_INLINE(T) static __forceinline T + #define vAlign __declspec(align(16)) +#endif + +#if defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86 + +#include + +#define vZero() _mm_setzero_ps() +#define vSet1(a) _mm_set_ss(a) +#define vMul(a,b) _mm_mul_ps(a,b) +#define vAdd(a,b) _mm_add_ps(a,b) +#define vMac(a,b,c) vAdd(vMul(a,b),c) +#define vLds(a) _mm_set1_ps(a) +#define vLd(a) _mm_load_ps(a) +#define vLdu(a) _mm_loadu_ps(a) + +typedef __m128 v4_t; + +SIMD_INLINE(void) vStorSum(float * a, v4_t b) { + v4_t t = vAdd(_mm_movehl_ps(b, b), b); + _mm_store_ss(a, vAdd(t, _mm_shuffle_ps(t,t,1)));} + +#elif defined __arm__ + +#include + +#define vZero() vdupq_n_f32(0) +#define vMul(a,b) vmulq_f32(a,b) +#define vAdd(a,b) vaddq_f32(a,b) +#define vMac(a,b,c) vmlaq_f32(c,a,b) +#define vLds(a) vld1q_dup_f32(&(a)) +#define vLd(a) vld1q_f32(a) +#define vLdu(a) vld1q_f32(a) + +typedef float32x4_t v4_t; + 
+SIMD_INLINE(void) vStorSum(float * a, v4_t b) { + float32x2_t t = vadd_f32(vget_high_f32(b), vget_low_f32(b)); + *a = vget_lane_f32(vpadd_f32(t, t), 0);} + +#endif + +#endif diff --git a/soxr/src/dev64s.h b/soxr/src/dev64s.h new file mode 100644 index 0000000..4672210 --- /dev/null +++ b/soxr/src/dev64s.h @@ -0,0 +1,42 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#if !defined soxr_dev64s_included +#define soxr_dev64s_included + +#if defined __GNUC__ + #define SIMD_INLINE(T) static __inline T __attribute__((always_inline)) + #define vAlign __attribute__((aligned (32))) +#elif defined _MSC_VER + #define SIMD_INLINE(T) static __forceinline T + #define vAlign __declspec(align(32)) +#else + #define SIMD_INLINE(T) static __inline T +#endif + +#if defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86 + +#include + +#if defined __AVX__ + +#define vZero() _mm256_setzero_pd() +#define vSet1(a) _mm256_set_pd(0,0,0,a) +#define vMul(a,b) _mm256_mul_pd(a,b) +#define vAdd(a,b) _mm256_add_pd(a,b) +#define vMac(a,b,c) vAdd(vMul(a,b),c) /* Note: gcc -mfma will `fuse' these */ +#define vLds(a) _mm256_set1_pd(a) +#define vLd(a) _mm256_load_pd(a) +#define vLdu(a) _mm256_loadu_pd(a) + +typedef __m256d v4_t; + +SIMD_INLINE(void) vStorSum(double * a, v4_t b) { + b = _mm256_hadd_pd(b, _mm256_permute2f128_pd(b,b,1)); + _mm_store_sd(a, _mm256_castpd256_pd128(_mm256_hadd_pd(b,b)));} + +#endif + +#endif + +#endif diff --git a/soxr/src/fft4g.c b/soxr/src/fft4g.c index 5fae8a6..cf6293a 100644 --- a/soxr/src/fft4g.c +++ b/soxr/src/fft4g.c @@ -282,22 +282,16 @@ Appendix : */ -#include +#include "math-wrap.h" #include "fft4g.h" #ifdef FFT4G_FLOAT #define double float #define one_half 0.5f -#if defined _MSC_VER - #define sin (float)sin - #define cos (float)cos - #define atan (float)atan -#else - #define sin sinf - #define cos cosf - #define atan atanf -#endif + #define sin(x) 
sinf(x) + #define cos(x) cosf(x) + #define atan(x) atanf(x) #define cdft lsx_cdft_f #define rdft lsx_rdft_f @@ -818,7 +812,7 @@ static void bitrv2(int n, int *ip0, double *a) static void bitrv2conj(int n, int *ip0, double *a) { - int j, j1, k, k1, l, m, m2, ip[256]; + int j, j1, k, k1, l, m, m2, ip[512]; double xr, xi, yr, yi; (void)ip0; diff --git a/soxr/src/fft4g32.c b/soxr/src/fft4g32.c index 8741394..4e4912e 100644 --- a/soxr/src/fft4g32.c +++ b/soxr/src/fft4g32.c @@ -1,27 +1,38 @@ /* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ +#include #include "filter.h" #define FFT4G_FLOAT #include "fft4g.c" +#include "soxr-config.h" -static void * null(void) {return 0;} -static void forward (int length, void * setup, double * H) {lsx_safe_rdft_f(length, 1, H); (void)setup;} -static void backward(int length, void * setup, double * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;} +#if WITH_CR32 +#include "rdft_t.h" +static void * null(int u1) {(void)u1; return 0;} +static void forward (int length, void * setup, void * H, void * scratch) {lsx_safe_rdft_f(length, 1, H); (void)setup; (void)scratch;} +static void backward(int length, void * setup, void * H, void * scratch) {lsx_safe_rdft_f(length, -1, H); (void)setup; (void)scratch;} static int multiplier(void) {return 2;} -static void nothing(void) {} +static void nothing(void *u1) {(void)u1;} +static void nothing2(int u1, void *u2, void *u3, void *u4) {(void)u1; (void)u2; (void)u3; (void)u4;} +static int flags(void) {return 0;} -typedef void (* fn_t)(void); -fn_t _soxr_rdft32_cb[] = { - (fn_t)null, - (fn_t)null, - (fn_t)nothing, - (fn_t)forward, - (fn_t)forward, - (fn_t)backward, - (fn_t)backward, - (fn_t)_soxr_ordered_convolve_f, - (fn_t)_soxr_ordered_partial_convolve_f, - (fn_t)multiplier, - (fn_t)nothing, +rdft_cb_table _soxr_rdft32_cb = { + null, + null, + nothing, + forward, + forward, + backward, + backward, + 
_soxr_ordered_convolve_f, + _soxr_ordered_partial_convolve_f, + multiplier, + nothing2, + malloc, + calloc, + free, + flags, }; + +#endif diff --git a/soxr/src/fft4g32s.c b/soxr/src/fft4g32s.c index 4a95a7d..c7f3772 100644 --- a/soxr/src/fft4g32s.c +++ b/soxr/src/fft4g32s.c @@ -2,25 +2,30 @@ * Licence for this file: LGPL v2.1 See LICENCE for details. */ #include "filter.h" -#include "simd.h" +#include "util32s.h" +#include "rdft_t.h" static void * null(void) {return 0;} static void nothing(void) {} static void forward (int length, void * setup, float * H) {lsx_safe_rdft_f(length, 1, H); (void)setup;} static void backward(int length, void * setup, float * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;} static int multiplier(void) {return 2;} +static int flags(void) {return RDFT_IS_SIMD;} -typedef void (* fn_t)(void); -fn_t _soxr_rdft32s_cb[] = { - (fn_t)null, - (fn_t)null, - (fn_t)nothing, - (fn_t)forward, - (fn_t)forward, - (fn_t)backward, - (fn_t)backward, - (fn_t)_soxr_ordered_convolve_simd, - (fn_t)_soxr_ordered_partial_convolve_simd, - (fn_t)multiplier, - (fn_t)nothing, +rdft_cb_table _soxr_rdft32s_cb = { + null, + null, + nothing, + forward, + forward, + backward, + backward, + ORDERED_CONVOLVE_SIMD, + ORDERED_PARTIAL_CONVOLVE_SIMD, + multiplier, + nothing, + SIMD_ALIGNED_MALLOC, + SIMD_ALIGNED_CALLOC, + SIMD_ALIGNED_FREE, + flags, }; diff --git a/soxr/src/fft4g64.c b/soxr/src/fft4g64.c index 48eaddd..fb87281 100644 --- a/soxr/src/fft4g64.c +++ b/soxr/src/fft4g64.c @@ -1,29 +1,36 @@ /* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ +#include #include "filter.h" #include "fft4g.c" #include "soxr-config.h" -#if HAVE_DOUBLE_PRECISION -static void * null(void) {return 0;} -static void nothing(void) {} -static void forward (int length, void * setup, double * H) {lsx_safe_rdft(length, 1, H); (void)setup;} -static void backward(int length, void * setup, double * H) {lsx_safe_rdft(length, -1, H); (void)setup;} +#if WITH_CR64 +#include "rdft_t.h" +static void * null(int u1) {(void)u1; return 0;} +static void nothing(void *u1) {(void)u1;} +static void nothing2(int u1, void *u2, void *u3, void *u4) {(void)u1; (void)u2; (void)u3; (void)u4;} +static void forward (int length, void * setup, void * H, void * scratch) {lsx_safe_rdft(length, 1, H); (void)setup; (void)scratch;} +static void backward(int length, void * setup, void * H, void * scratch) {lsx_safe_rdft(length, -1, H); (void)setup; (void)scratch;} static int multiplier(void) {return 2;} +static int flags(void) {return 0;} -typedef void (* fn_t)(void); -fn_t _soxr_rdft64_cb[] = { - (fn_t)null, - (fn_t)null, - (fn_t)nothing, - (fn_t)forward, - (fn_t)forward, - (fn_t)backward, - (fn_t)backward, - (fn_t)_soxr_ordered_convolve, - (fn_t)_soxr_ordered_partial_convolve, - (fn_t)multiplier, - (fn_t)nothing, +rdft_cb_table _soxr_rdft64_cb = { + null, + null, + nothing, + forward, + forward, + backward, + backward, + _soxr_ordered_convolve, + _soxr_ordered_partial_convolve, + multiplier, + nothing2, + malloc, + calloc, + free, + flags, }; #endif diff --git a/soxr/src/fifo.h b/soxr/src/fifo.h index b2bda43..33af9fe 100644 --- a/soxr/src/fifo.h +++ b/soxr/src/fifo.h @@ -1,14 +1,15 @@ /* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ -#ifndef fifo_included -#define fifo_included +#ifndef soxr_fifo_included +#define soxr_fifo_included #if !defined FIFO_SIZE_T #define FIFO_SIZE_T size_t #endif #if !defined FIFO_REALLOC +#include #define FIFO_REALLOC(a,b,c) realloc(a,b) #undef FIFO_FREE #define FIFO_FREE free diff --git a/soxr/src/filter.c b/soxr/src/filter.c index ca146d2..019d24d 100644 --- a/soxr/src/filter.c +++ b/soxr/src/filter.c @@ -1,12 +1,9 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ #include "filter.h" -#include -#if !defined M_PI -#define M_PI 3.14159265358979323846 -#endif +#include "math-wrap.h" #include #include #include @@ -14,7 +11,7 @@ #include "fft4g.h" #include "ccrw2.h" -#if 1 || HAVE_DOUBLE_PRECISION /* Always need this, for lsx_fir_to_phase. */ +#if 1 || WITH_CR64 || WITH_CR64S /* Always need this, for lsx_fir_to_phase. */ #define DFT_FLOAT double #define DONE_WITH_FFT_CACHE done_with_fft_cache #define FFT_CACHE_CCRW fft_cache_ccrw @@ -31,7 +28,7 @@ #include "fft4g_cache.h" #endif -#if HAVE_SINGLE_PRECISION && !HAVE_AVFFT +#if (WITH_CR32 && !AVCODEC_FOUND) || (WITH_CR32S && !AVCODEC_FOUND && !WITH_PFFFT) #define DFT_FLOAT float #define DONE_WITH_FFT_CACHE done_with_fft_cache_f #define FFT_CACHE_CCRW fft_cache_ccrw_f @@ -48,14 +45,14 @@ #include "fft4g_cache.h" #endif -#if HAVE_DOUBLE_PRECISION || !SOXR_LIB +#if WITH_CR64 || WITH_CR64S || !SOXR_LIB #define DFT_FLOAT double #define ORDERED_CONVOLVE lsx_ordered_convolve #define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve #include "rdft.h" #endif -#if HAVE_SINGLE_PRECISION +#if WITH_CR32 #define DFT_FLOAT float #define ORDERED_CONVOLVE lsx_ordered_convolve_f #define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve_f @@ -96,12 +93,12 @@ double * lsx_make_lpf( double * h = malloc((size_t)num_taps * sizeof(*h)); double mult = scale / 
lsx_bessel_I_0(beta), mult1 = 1 / (.5 * m + rho); assert(Fc >= 0 && Fc <= 1); - lsx_debug("make_lpf(n=%i Fc=%.7g β=%g ρ=%g scale=%g)", + lsx_debug("make_lpf(n=%i Fc=%.7g beta=%g rho=%g scale=%g)", num_taps, Fc, beta, rho, scale); if (h) for (i = 0; i <= m / 2; ++i) { double z = i - .5 * m, x = z * M_PI, y = z * mult1; - h[i] = x? sin(Fc * x) / x : Fc; + h[i] = x!=0? sin(Fc * x) / x : Fc; h[i] *= lsx_bessel_I_0(beta * sqrt(1 - y * y)) * mult; if (m - i != i) h[m - i] = h[i]; @@ -123,12 +120,15 @@ double * lsx_design_lpf( double Fn, /* Nyquist freq; e.g. 0.5, 1, PI */ double att, /* Stop-band attenuation in dB */ int * num_taps, /* 0: value will be estimated */ - int k, /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */ + int k, /* >0: number of phases; <0: num_taps = 1 (mod -k) */ double beta) /* <0: value will be estimated */ { int n = *num_taps, phases = max(k, 1), modulo = max(-k, 1); double tr_bw, Fc, rho = phases == 1? .5 : att < 120? .63 : .75; + lsx_debug_more("./sinctest %-12.7g %-12.7g %g 0 %-5g %i %i 50 %g %g -4 >1", + Fp, Fs, Fn, att, *num_taps, k, beta, rho); + Fp /= fabs(Fn), Fs /= fabs(Fn); /* Normalise to Fn = 1 */ tr_bw = .5 * (Fs - Fp); /* Transition band-width: 6dB to stop points */ tr_bw /= phases, Fs /= phases; @@ -145,7 +145,7 @@ double * lsx_design_lpf( static double safe_log(double x) { assert(x >= 0); - if (x) + if (x!=0) return log(x); lsx_debug("log(0)"); return -26; @@ -222,7 +222,7 @@ void lsx_fir_to_phase(double * * h, int * len, int * post_len, double phase) while (peak && fabs(work[peak-1]) > fabs(work[peak]) && work[peak-1] * work[peak] > 0) --peak; - if (!phase1) + if (phase1==0) begin = 0; else if (phase1 == 1) begin = peak - *len / 2; @@ -243,3 +243,35 @@ void lsx_fir_to_phase(double * * h, int * len, int * post_len, double phase) work[imp_peak], *len, *post_len, 100 - 100. 
* *post_len / (*len - 1)); free(pi_wraps), free(work); } + +#define F_x(F,expr) static double F(double x) {return expr;} +F_x(sinePhi, ((2.0517e-07*x-1.1303e-04)*x+.023154)*x+.55924 ) +F_x(sinePsi, ((9.0667e-08*x-5.6114e-05)*x+.013658)*x+1.0977 ) +F_x(sinePow, log(.5)/log(sin(x*.5)) ) +#define dB_to_linear(x) exp((x) * (M_LN10 * 0.05)) + +double lsx_f_resp(double t, double a) +{ + double x; + if (t > (a <= 160? .8 : .82)) { + double a1 = a+15; + double p = .00035*a+.375; + double w = 1/(1-.597)*asin(pow((a1-10.6)/a1,1/p)); + double c = 1+asin(pow(1-a/a1,1/p))/w; + return a1*(pow(sin((c-t)*w),p)-1); + } + if (t > .5) + x = sinePsi(a), x = pow(sin((1-t) * x), sinePow(x)); + else + x = sinePhi(a), x = 1 - pow(sin(t * x), sinePow(x)); + return linear_to_dB(x); +} + +double lsx_inv_f_resp(double drop, double a) +{ + double x = sinePhi(a), s; + drop = dB_to_linear(drop); + s = drop > .5 ? 1 - drop : drop; + x = asin(pow(s, 1/sinePow(x))) / x; + return drop > .5? x : 1 -x; +} diff --git a/soxr/src/filter.h b/soxr/src/filter.h index 435303b..203e73d 100644 --- a/soxr/src/filter.h +++ b/soxr/src/filter.h @@ -16,10 +16,10 @@ void lsx_safe_rdft(int len, int type, double * d); void lsx_safe_cdft(int len, int type, double * d); void lsx_safe_rdft_f(int len, int type, float * d); void lsx_safe_cdft_f(int len, int type, float * d); -void lsx_ordered_convolve(int n, void * not_used, double * a, const double * b); -void lsx_ordered_convolve_f(int n, void * not_used, float * a, const float * b); -void lsx_ordered_partial_convolve(int n, double * a, const double * b); -void lsx_ordered_partial_convolve_f(int n, float * a, const float * b); +void lsx_ordered_convolve(int n, void * not_used, void * a, const void * b); +void lsx_ordered_convolve_f(int n, void * not_used, void * a, const void * b); +void lsx_ordered_partial_convolve(int n, void * a, const void * b); +void lsx_ordered_partial_convolve_f(int n, void * a, const void * b); double lsx_kaiser_beta(double att, double tr_bw); 
double * lsx_make_lpf(int num_taps, double Fc, double beta, double rho, @@ -31,9 +31,14 @@ double * lsx_design_lpf( double Fn, /* Nyquist freq; e.g. 0.5, 1, PI; < 0: dummy run */ double att, /* Stop-band attenuation in dB */ int * num_taps, /* 0: value will be estimated */ - int k, /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */ + int k, /* >0: number of phases; <0: num_taps = 1 (mod -k) */ double beta); /* <0: value will be estimated */ + void lsx_fir_to_phase(double * * h, int * len, int * post_len, double phase0); +double lsx_f_resp(double t, double a); +double lsx_inv_f_resp(double drop, double a); +#define lsx_to_3dB(a) (1 - lsx_inv_f_resp(-3., a)) + #endif diff --git a/soxr/src/filters.h b/soxr/src/filters.h deleted file mode 100644 index e9a8011..0000000 --- a/soxr/src/filters.h +++ /dev/null @@ -1,151 +0,0 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net - * Licence for this file: LGPL v2.1 See LICENCE for details. */ - -#include "half_coefs.h" - -#define FUNCTION h8 -#define CONVOLVE _ _ _ _ _ _ _ _ -#define h8_l 8 -#define COEFS half_fir_coefs_8 -#include "half-fir.h" - -#define FUNCTION h9 -#define CONVOLVE _ _ _ _ _ _ _ _ _ -#define h9_l 9 -#define COEFS half_fir_coefs_9 -#include "half-fir.h" - -#define FUNCTION h10 -#define CONVOLVE _ _ _ _ _ _ _ _ _ _ -#define h10_l 10 -#define COEFS half_fir_coefs_10 -#include "half-fir.h" - -#define FUNCTION h11 -#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ -#define h11_l 11 -#define COEFS half_fir_coefs_11 -#include "half-fir.h" - -#define FUNCTION h12 -#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ _ -#define h12_l 12 -#define COEFS half_fir_coefs_12 -#include "half-fir.h" - -#define FUNCTION h13 -#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ _ _ -#define h13_l 13 -#define COEFS half_fir_coefs_13 -#include "half-fir.h" - -static struct {int num_coefs; stage_fn_t fn; float att;} const half_firs[] = { - { 8, h8 , 136.51f}, - { 9, h9 , 152.32f}, - {10, h10, 168.07f}, - {11, h11, 183.78f}, - {12, h12, 
199.44f}, - {13, h13, 212.75f}, -}; - -#define HI_PREC_CLOCK - -#define VAR_LENGTH p->n -#define VAR_CONVOLVE while (j < FIR_LENGTH) _ -#define VAR_POLY_PHASE_BITS p->phase_bits - -#define FUNCTION vpoly0 -#define FIR_LENGTH VAR_LENGTH -#define CONVOLVE VAR_CONVOLVE -#include "poly-fir0.h" - -#define FUNCTION vpoly1 -#define COEF_INTERP 1 -#define PHASE_BITS VAR_POLY_PHASE_BITS -#define FIR_LENGTH VAR_LENGTH -#define CONVOLVE VAR_CONVOLVE -#include "poly-fir.h" - -#define FUNCTION vpoly2 -#define COEF_INTERP 2 -#define PHASE_BITS VAR_POLY_PHASE_BITS -#define FIR_LENGTH VAR_LENGTH -#define CONVOLVE VAR_CONVOLVE -#include "poly-fir.h" - -#define FUNCTION vpoly3 -#define COEF_INTERP 3 -#define PHASE_BITS VAR_POLY_PHASE_BITS -#define FIR_LENGTH VAR_LENGTH -#define CONVOLVE VAR_CONVOLVE -#include "poly-fir.h" - -#undef HI_PREC_CLOCK - -#define U100_l 42 -#if RATE_SIMD_POLY - #define U100_l_EXTRA _ _ - #define u100_l_EXTRA _ - #define U100_l_EXTRA_LENGTH 2 - #define u100_l_EXTRA_LENGTH 1 -#else - #define U100_l_EXTRA - #define u100_l_EXTRA - #define U100_l_EXTRA_LENGTH 0 - #define u100_l_EXTRA_LENGTH 0 -#endif -#define poly_fir_convolve_U100 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ U100_l_EXTRA -#define FUNCTION U100_0 -#define FIR_LENGTH (U100_l + U100_l_EXTRA_LENGTH) -#define CONVOLVE poly_fir_convolve_U100 -#include "poly-fir0.h" - -#define u100_l 11 -#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _ u100_l_EXTRA -#define FUNCTION u100_0 -#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH) -#define CONVOLVE poly_fir_convolve_u100 -#include "poly-fir0.h" - -#define FUNCTION u100_1 -#define COEF_INTERP 1 -#define PHASE_BITS 8 -#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH) -#define CONVOLVE poly_fir_convolve_u100 -#include "poly-fir.h" -#define u100_1_b 8 - -#define FUNCTION u100_2 -#define COEF_INTERP 2 -#define PHASE_BITS 6 -#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH) -#define CONVOLVE poly_fir_convolve_u100 
-#include "poly-fir.h" -#define u100_2_b 6 - -typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t; -typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t; - -static poly_fir_t const poly_firs[] = { - {-1, {{0, vpoly0}, { 7.2f, vpoly1}, {5.0f, vpoly2}}}, - {-1, {{0, vpoly0}, { 9.4f, vpoly1}, {6.7f, vpoly2}}}, - {-1, {{0, vpoly0}, {12.4f, vpoly1}, {7.8f, vpoly2}}}, - {-1, {{0, vpoly0}, {13.6f, vpoly1}, {9.3f, vpoly2}}}, - {-1, {{0, vpoly0}, {10.5f, vpoly2}, {8.4f, vpoly3}}}, - {-1, {{0, vpoly0}, {11.85f,vpoly2}, {9.0f, vpoly3}}}, - - {-1, {{0, vpoly0}, { 8.0f, vpoly1}, {5.3f, vpoly2}}}, - {-1, {{0, vpoly0}, { 8.6f, vpoly1}, {5.7f, vpoly2}}}, - {-1, {{0, vpoly0}, {10.6f, vpoly1}, {6.75f,vpoly2}}}, - {-1, {{0, vpoly0}, {12.6f, vpoly1}, {8.6f, vpoly2}}}, - {-1, {{0, vpoly0}, { 9.6f, vpoly2}, {7.6f, vpoly3}}}, - {-1, {{0, vpoly0}, {11.4f, vpoly2}, {8.65f,vpoly3}}}, - - {10.62f, {{U100_l, U100_0}, {0, 0}, {0, 0}}}, - {11.28f, {{u100_l, u100_0}, {u100_1_b, u100_1}, {u100_2_b, u100_2}}}, - {-1, {{0, vpoly0}, { 9, vpoly1}, { 6, vpoly2}}}, - {-1, {{0, vpoly0}, { 11, vpoly1}, { 7, vpoly2}}}, - {-1, {{0, vpoly0}, { 13, vpoly1}, { 8, vpoly2}}}, - {-1, {{0, vpoly0}, { 10, vpoly2}, { 8, vpoly3}}}, - {-1, {{0, vpoly0}, { 12, vpoly2}, { 9, vpoly3}}}, -}; diff --git a/soxr/src/half-coefs.h b/soxr/src/half-coefs.h new file mode 100644 index 0000000..a5a0882 --- /dev/null +++ b/soxr/src/half-coefs.h @@ -0,0 +1,75 @@ +/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ + +#if defined __GNUC__ + #pragma GCC system_header +#elif defined __SUNPRO_C + #pragma disable_warn +#elif defined _MSC_VER + #pragma warning(push, 1) +#endif + +#if CORE_TYPE & CORE_SIMD_HALF + #define VALIGN vAlign +#else + #define VALIGN +#endif + +#if !(CORE_TYPE & CORE_SIMD_HALF) +static VALIGN const sample_t half_fir_coefs_7[] = { + 3.1062656496657370e-01, -8.4998810699955796e-02, 3.4007044621123500e-02, +-1.2839903789829387e-02, 3.9899380181723145e-03, -8.9355202017945374e-04, + 1.0918292424806546e-04, +}; +#endif + +static VALIGN const sample_t half_fir_coefs_8[] = { + 3.1154652365332069e-01, -8.7344917685739543e-02, 3.6814458353637280e-02, +-1.5189204581464479e-02, 5.4540855610738801e-03, -1.5643862626630416e-03, + 3.1816575906323303e-04, -3.4799449225005688e-05, +}; + +static VALIGN const sample_t half_fir_coefs_9[] = { + 3.1227034755311189e-01, -8.9221517147969526e-02, 3.9139704015071934e-02, +-1.7250558515852023e-02, 6.8589440230476112e-03, -2.3045049636430419e-03, + 6.0963740543348963e-04, -1.1323803957431231e-04, 1.1197769991000046e-05, +}; + +#if CORE_TYPE & CORE_DBL +static VALIGN const sample_t half_fir_coefs_10[] = { + 3.1285456012000523e-01, -9.0756740799292787e-02, 4.1096398104193160e-02, +-1.9066319572525220e-02, 8.1840569787684902e-03, -3.0766876176359834e-03, + 9.6396524429277980e-04, -2.3585679989922018e-04, 4.0252189026627833e-05, +-3.6298196342497932e-06, +}; + +static VALIGN const sample_t half_fir_coefs_11[] = { + 3.1333588822574199e-01, -9.2035898673019811e-02, 4.2765169698406408e-02, +-2.0673580894964429e-02, 9.4225426824512421e-03, -3.8563379950013192e-03, + 1.3634742159642453e-03, -3.9874150714431009e-04, 9.0586723632664806e-05, +-1.4285617244076783e-05, 1.1834642946400529e-06, +}; + +static VALIGN const sample_t half_fir_coefs_12[] = { + 3.1373928463345568e-01, -9.3118180335301962e-02, 4.4205005881659098e-02, +-2.2103860986973051e-02, 1.0574689371162864e-02, -4.6276428065385065e-03, + 1.7936153397572132e-03, 
-5.9617527051353237e-04, 1.6314517495669067e-04, +-3.4555126770115446e-05, 5.0617615610782593e-06, -3.8768958592971409e-07, +}; + +static VALIGN const sample_t half_fir_coefs_13[] = { + 3.1408224847888910e-01, -9.4045836332667387e-02, 4.5459878763259978e-02, +-2.3383369012219993e-02, 1.1644273044890753e-02, -5.3806714579057013e-03, + 2.2429072878264022e-03, -8.2204347506606424e-04, 2.5724946477840893e-04, +-6.6072709864248668e-05, 1.3099163296288644e-05, -1.7907147069136000e-06, + 1.2750825595240592e-07, +}; +#endif + +#undef VALIGN + +#if defined __SUNPRO_C + #pragma enable_warn +#elif defined _MSC_VER + #pragma warning(pop) +#endif diff --git a/soxr/src/half-fir.h b/soxr/src/half-fir.h index 0a8ee97..782be1b 100644 --- a/soxr/src/half-fir.h +++ b/soxr/src/half-fir.h @@ -1,25 +1,61 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ -/* Down-sample by a factor of 2 using a FIR with odd length (LEN).*/ +/* Decimate by 2 using a FIR with odd length (LEN). */ /* Input must be preceded and followed by LEN >> 1 samples. */ -#define _ sum += (input[-(2*j +1)] + input[(2*j +1)]) * COEFS[j], ++j; -static void FUNCTION(stage_t * p, fifo_t * output_fifo) +#define COEFS ((sample_t const *)p->coefs) + +#if SIMD_SSE + #define BEGINNING v4_t sum, q1, q2, t + #define ____ \ + q1 = _mm_shuffle_ps(t=vLdu(input+2*j),vLdu(input+2*j+4),_MM_SHUFFLE(3,1,3,1)); \ + q2 = _mm_shuffle_ps(vLdu(input-2*j-4),vLdu(input-2*j-8),_MM_SHUFFLE(1,3,1,3)); \ + sum = vAdd(j? 
sum : vMul(vSet1(.5), t), vMul(vAdd(q1, q2), vLd(COEFS+j))); \ + j += 4; + #define __ \ + q1 = _mm_shuffle_ps(vLdu(input+2*j), vLdu(input-2*j-4), _MM_SHUFFLE(1,3,3,1)); \ + q2 = _mm_loadl_pi(q2, (__m64*)(COEFS+j)), q2 = _mm_movelh_ps(q2, q2); \ + sum = vAdd(sum, vMul(q1, q2)); \ + j += 2; + #define _ \ + q1 = _mm_add_ss(_mm_load_ss(input+2*j+1), _mm_load_ss(input-2*j-1)); \ + sum = _mm_add_ss(sum, _mm_mul_ss(q1, _mm_load_ss(COEFS+j))); \ + ++j; + #define END vStorSum(output+i, sum) +/* #elif SIMD_AVX; No good solution found. */ +/* #elif SIMD_NEON; No need: gcc -O3 does a good job by itself. */ +#else + #define BEGINNING sample_t sum = input[0] * .5f + #define ____ __ __ + #define __ _ _ + #define _ sum += (input[-(2*j +1)] + input[(2*j +1)]) * COEFS[j], ++j; + #define END output[i] = sum +#endif + + + +static void FUNCTION_H(stage_t * p, fifo_t * output_fifo) { - sample_t const * input = stage_read_p(p); - int i, num_out = (stage_occupancy(p) + 1) / 2; - sample_t * output = fifo_reserve(output_fifo, num_out); + sample_t const * __restrict input = stage_read_p(p); + int num_in = min(stage_occupancy(p), p->input_size); + int i, num_out = (num_in + 1) >> 1; + sample_t * __restrict output = fifo_reserve(output_fifo, num_out); for (i = 0; i < num_out; ++i, input += 2) { int j = 0; - sample_t sum = input[0] * .5f; - CONVOLVE - output[i] = sum; + BEGINNING; CONVOLVE; END; } fifo_read(&p->fifo, 2 * num_out, NULL); } + + + #undef _ +#undef __ +#undef ____ +#undef BEGINNING +#undef END #undef COEFS #undef CONVOLVE -#undef FUNCTION +#undef FUNCTION_H diff --git a/soxr/src/half_coefs.h b/soxr/src/half_coefs.h deleted file mode 100644 index aac7769..0000000 --- a/soxr/src/half_coefs.h +++ /dev/null @@ -1,57 +0,0 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net - * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ - -#if defined __GNUC__ - #pragma GCC system_header -#elif defined __SUNPRO_C - #pragma disable_warn -#elif defined _MSC_VER - #pragma warning(push, 1) -#endif - -static const sample_t half_fir_coefs_8[] = { - 0.3115465451887802, -0.08734497241282892, 0.03681452335604365, - -0.01518925831569441, 0.005454118437408876, -0.001564400922162005, - 0.0003181701445034203, -3.48001341225749e-5, -}; - -static const sample_t half_fir_coefs_9[] = { - 0.3122703613711853, -0.08922155288172305, 0.03913974805854332, - -0.01725059723447163, 0.006858970092378141, -0.002304518467568703, - 0.0006096426006051062, -0.0001132393923815236, 1.119795386287666e-5, -}; - -static const sample_t half_fir_coefs_10[] = { - 0.3128545521327376, -0.09075671986104322, 0.04109637155154835, - -0.01906629512749895, 0.008184039342054333, -0.0030766775017262, - 0.0009639607022414314, -0.0002358552746579827, 4.025184282444155e-5, - -3.629779111541012e-6, -}; - -static const sample_t half_fir_coefs_11[] = { - 0.3133358837508807, -0.09203588680609488, 0.04276515428384758, - -0.02067356614745591, 0.00942253142371517, -0.003856330993895144, - 0.001363470684892284, -0.0003987400965541919, 9.058629923971627e-5, - -1.428553070915318e-5, 1.183455238783835e-6, -}; - -static const sample_t half_fir_coefs_12[] = { - 0.3137392991811407, -0.0931182192961332, 0.0442050575271454, - -0.02210391200618091, 0.01057473015666001, -0.00462766983973885, - 0.001793630226239453, -0.0005961819959665878, 0.0001631475979359577, - -3.45557865639653e-5, 5.06188341942088e-6, -3.877010943315563e-7, -}; - -static const sample_t half_fir_coefs_13[] = { - 0.3140822554324578, -0.0940458550886253, 0.04545990399121566, - -0.02338339450796002, 0.01164429409071052, -0.005380686021429845, - 0.002242915773871009, -0.000822047600000082, 0.0002572510962395222, - -6.607320708956279e-5, 1.309926399120154e-5, -1.790719575255006e-6, - 1.27504961098836e-7, -}; - -#if defined __SUNPRO_C - #pragma enable_warn -#elif defined _MSC_VER - #pragma 
warning(pop) -#endif diff --git a/soxr/src/internal.h b/soxr/src/internal.h index 5d8d44e..08924d5 100644 --- a/soxr/src/internal.h +++ b/soxr/src/internal.h @@ -1,46 +1,84 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ #if !defined soxr_internal_included #define soxr_internal_included -#include "soxr-config.h" +#include "std-types.h" + + #undef min #undef max #define min(a, b) ((a) <= (b) ? (a) : (b)) #define max(a, b) ((a) >= (b) ? (a) : (b)) + + #define range_limit(x, lower, upper) (min(max(x, lower), upper)) #define linear_to_dB(x) (log10(x) * 20) #define array_length(a) (sizeof(a)/sizeof(a[0])) +#if !defined AL #define AL(a) array_length(a) +#endif #define iAL(a) (int)AL(a) #define sqr(a) ((a) * (a)) -#ifdef __GNUC__ + + +#if defined __GNUC__ #define UNUSED __attribute__ ((unused)) #else #define UNUSED #endif -#if defined NDEBUG || SOXR_SILENT + + +#if !WITH_DEV_TRACE #ifdef __GNUC__ void lsx_dummy(char const *, ...); #else static __inline void lsx_dummy(char const * x, ...) {} #endif #define lsx_debug if(0) lsx_dummy + #define lsx_debug_more lsx_debug #else - #include - #include - UNUSED static void lsx_debug(char const * fmt, ...) - { - va_list args; - va_start(args, fmt); - vfprintf(stderr, fmt, args); - fputc('\n', stderr); - va_end(args); - } + extern int _soxr_trace_level; + void _soxr_trace(char const * fmt, ...); + #define lsx_debug if (_soxr_trace_level > 0) _soxr_trace + #define lsx_debug_more if (_soxr_trace_level > 1) _soxr_trace #endif + + + +/* soxr_quality_spec_t.flags: */ + +#define SOXR_ROLLOFF_LSR2Q 3u /* Reserved for internal use. */ +#define SOXR_ROLLOFF_MASK 3u /* For masking these bits. */ +#define SOXR_MAINTAIN_3DB_PT 4u /* Reserved for internal use. */ +#define SOXR_PROMOTE_TO_LQ 64u /* Reserved for internal use. 
*/ + + + +/* soxr_runtime_spec_t.flags: */ + +#define SOXR_STRICT_BUFFERING 4u /* Reserved for future use. */ +#define SOXR_NOSMALLINTOPT 8u /* For test purposes only. */ + + + +/* soxr_quality_spec recipe: */ + +#define SOXR_PRECISIONQ 11 /* Quality specified by the precision parameter. */ + +#define SOXR_PHASE_MASK 0x30 /* For masking these bits. */ + + + +/* soxr_quality_spec flags: */ + +#define RESET_ON_CLEAR (1u<<31) + + + #endif diff --git a/soxr/src/libsoxr-dev.src.in b/soxr/src/libsoxr-dev.src.in deleted file mode 100644 index ce879f9..0000000 --- a/soxr/src/libsoxr-dev.src.in +++ /dev/null @@ -1,2 +0,0 @@ -set(TARGET_HEADERS "@TARGET_HEADERS@") -set(TARGET_PCS "@TARGET_PCS@") diff --git a/soxr/src/libsoxr.src.in b/soxr/src/libsoxr.src.in deleted file mode 100644 index 1c926ff..0000000 --- a/soxr/src/libsoxr.src.in +++ /dev/null @@ -1 +0,0 @@ -set(TARGET_LIBS "@TARGET_LIBS@") diff --git a/soxr/src/lsr.c b/soxr/src/lsr.c deleted file mode 100644 index 64b5798..0000000 --- a/soxr/src/lsr.c +++ /dev/null @@ -1,114 +0,0 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net - * Licence for this file: LGPL v2.1 See LICENCE for details. */ - -/* Wrapper mostly compatible with `libsamplerate'. 
*/ - -#include -#include -#include "soxr.h" - -/* Runtime casts: */ -typedef struct io_t { - float *in,*out; long ilen,olen,idone,odone; int eoi; double oi_ratio;} io_t; -#define SRC_DATA io_t -typedef struct soxr SRC_STATE; -#define src_callback_t soxr_input_fn_t -#define SRC_ERROR soxr_error_t -#define SRC_SRCTYPE unsigned - -#include "soxr-lsr.h" -#include "rint.h" - -soxr_error_t src_simple(io_t * p, unsigned id, int channels) -{ - size_t idone, odone; - soxr_error_t error; - soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + id, 0); - char const * e = getenv("SOXR_LSR_NUM_THREADS"); - soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1)); - assert (channels > 0); - assert (p->ilen >= 0); - assert (p->olen >= 0); - error = soxr_oneshot(1, p->oi_ratio, (unsigned)channels, - p->in, (size_t)p->ilen, &idone, p->out, (size_t)p->olen, &odone, - 0, &q_spec, &r_spec); - p->idone = (long)idone, p->odone = (long)odone; - return error; -} - -soxr_t src_callback_new(soxr_input_fn_t fn, unsigned id, int channels, SRC_ERROR * error0, void * p) -{ - soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + id, 0); - char const * e = getenv("SOXR_LSR_NUM_THREADS"); - soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1)); - soxr_error_t error; - soxr_t soxr = 0; - assert (channels > 0); - /* To minimise latency e.g. for real-time playback: - if (id == 2) - r_spec.log2_large_dft_size = r_spec.log2_min_dft_size = 8; - */ - soxr = soxr_create(0, 0, (unsigned)channels, &error, 0, &q_spec, &r_spec); - if (soxr) - error = soxr_set_input_fn(soxr, fn, p, 0); - if (error0) - *(int *)error0 = (int)(ptrdiff_t)error; - return soxr; -} - -soxr_error_t src_process(soxr_t p, io_t * io) -{ - if (!p || !io) return "null pointer"; - soxr_set_error(p, soxr_set_io_ratio(p, 1/io->oi_ratio, (size_t)io->olen)); - - { size_t idone , odone; - soxr_process(p, io->in, (size_t)(io->eoi? 
~io->ilen : io->ilen), /* hack */ - &idone, io->out, (size_t)io->olen, &odone); - io->idone = (long)idone, io->odone = (long)odone; - return soxr_error(p); } -} - -long src_callback_read(soxr_t p, double oi_ratio, long olen, float * obuf) -{ - if (!p || olen < 0) return -1; - soxr_set_error(p, soxr_set_io_ratio(p, 1/oi_ratio, (size_t)olen)); - return (long)soxr_output(p, obuf, (size_t)olen); -} - -void src_float_to_short_array(float const * src, short * dest, int len) -{ - double d, N = 1. + SHRT_MAX; - assert (src && dest); - while (len--) d = src[len] * N, dest[len] = (short)(d > N - 1? (short)(N - 1) : d < -N? (short)-N : rint16(d)); -} - -void src_short_to_float_array(short const * src, float * dest, int len) -{ - assert (src && dest); - while (len--) dest[len] = (float)(src[len] * (1 / (1. + SHRT_MAX))); -} - -void src_float_to_int_array(float const * src, int * dest, int len) -{ - double d, N = 32768. * 65536.; /* N.B. int32, not int! (Also next fn.) */ - assert (src && dest); - while (len--) d = src[len] * N, dest[len] = d >= N - 1? (int)(N - 1) : d < -N? (int)(-N) : rint32(d); -} - -void src_int_to_float_array(int const * src, float * dest, int len) -{ - assert (src && dest); - while (len--) dest[len] = (float)(src[len] * (1 / (32768. * 65536.))); -} - -static char const * const names[] = {"LSR best sinc", "LSR medium sinc", "LSR fastest sinc", "LSR ZOH", "LSR linear", "SoX VHQ"}; -char const * src_get_name(unsigned n) {return n < 5u + !getenv("SOXR_LSR_STRICT")? names[n] : 0;} -char const * src_get_description(unsigned id) {return src_get_name(id);} -char const * src_get_version(void) {return soxr_version();} -char const * src_strerror(soxr_error_t error) {return error == (soxr_error_t)1? "Placeholder." : sizeof(int) >= sizeof(char *) || !error ? soxr_strerror(error) : "soxr error";} -int src_is_valid_ratio(double oi_ratio) {return getenv("SOXR_LSR_STRICT")? 
oi_ratio >= 1./256 && oi_ratio <= 256 : oi_ratio > 0;} -soxr_error_t src_error(soxr_t p) {return soxr_error(p);} -soxr_error_t src_reset(soxr_t p) {return soxr_clear(p);} -soxr_t src_delete(soxr_t p) {soxr_delete(p); return 0;} -soxr_error_t src_set_ratio(soxr_t p, double oi_ratio) {return soxr_set_io_ratio(p, 1/oi_ratio, 0);} -soxr_t src_new(unsigned id, int channels, SRC_ERROR * error) {return src_callback_new(0, id, channels, error, 0);} diff --git a/soxr/src/math-wrap.h b/soxr/src/math-wrap.h new file mode 100644 index 0000000..8a526f1 --- /dev/null +++ b/soxr/src/math-wrap.h @@ -0,0 +1,31 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#if !defined soxr_math_wrap_included +#define soxr_math_wrap_included + +#include + +#if defined __STRICT_ANSI__ + #define sinf(x) (float)sin ((double)(x)) + #define cosf(x) (float)cos ((double)(x)) + #define atanf(x) (float)atan((double)(x)) +#endif + +#if !defined M_PI + #define M_PI 3.141592653589793238462643383279502884 +#endif + +#if !defined M_LN10 + #define M_LN10 2.302585092994045684017991454684364208 +#endif + +#if !defined M_SQRT2 + #define M_SQRT2 1.414213562373095048801688724209698079 +#endif + +#if !defined M_LN2 + #define M_LN2 0.693147180559945309417232121458176568 +#endif + +#endif diff --git a/soxr/src/pffft-avx.h b/soxr/src/pffft-avx.h new file mode 100644 index 0000000..ace19b5 --- /dev/null +++ b/soxr/src/pffft-avx.h @@ -0,0 +1,40 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ + +/* AVX support macros */ + +#if !defined soxr_avx_included +#define soxr_avx_included + +#include + +typedef __m256d v4sf; +#define VZERO() _mm256_setzero_pd() +#define VMUL(a,b) _mm256_mul_pd(a,b) +#define VADD(a,b) _mm256_add_pd(a,b) +#define VMADD(a,b,c) VADD(VMUL(a,b),c) /* Note: gcc -mfma will `fuse' these */ +#define VSUB(a,b) _mm256_sub_pd(a,b) +#define LD_PS1(p) _mm256_set1_pd(p) +#define INTERLEAVE2(in1, in2, out1, out2) {v4sf \ + t1 = _mm256_unpacklo_pd(in1, in2), \ + t2 = _mm256_unpackhi_pd(in1, in2); \ + out1 = _mm256_permute2f128_pd(t1,t2,0x20); \ + out2 = _mm256_permute2f128_pd(t1,t2,0x31); } +#define UNINTERLEAVE2(in1, in2, out1, out2) {v4sf \ + t1 = _mm256_permute2f128_pd(in1,in2,0x20), \ + t2 = _mm256_permute2f128_pd(in1,in2,0x31); \ + out1 = _mm256_unpacklo_pd(t1, t2); \ + out2 = _mm256_unpackhi_pd(t1, t2);} +#define VTRANSPOSE4(x0,x1,x2,x3) {v4sf \ + t0 = _mm256_shuffle_pd(x0,x1, 0x0), \ + t2 = _mm256_shuffle_pd(x0,x1, 0xf), \ + t1 = _mm256_shuffle_pd(x2,x3, 0x0), \ + t3 = _mm256_shuffle_pd(x2,x3, 0xf); \ + x0 = _mm256_permute2f128_pd(t0,t1, 0x20); \ + x1 = _mm256_permute2f128_pd(t2,t3, 0x20); \ + x2 = _mm256_permute2f128_pd(t0,t1, 0x31); \ + x3 = _mm256_permute2f128_pd(t2,t3, 0x31);} +#define VSWAPHL(a,b) _mm256_permute2f128_pd(b, a, 0x30) +#define VALIGNED(ptr) ((((long)(ptr)) & 0x1F) == 0) + +#endif diff --git a/soxr/src/pffft-wrap.c b/soxr/src/pffft-wrap.c new file mode 100644 index 0000000..c920f06 --- /dev/null +++ b/soxr/src/pffft-wrap.c @@ -0,0 +1,110 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ + +#if !defined PFFT_MACROS_ONLY + +#include "math-wrap.h" + +#if PFFFT_DOUBLE + #include "util64s.h" +#else + #include "util32s.h" + #define sin(x) sinf(x) + #define cos(x) cosf(x) +#endif + +#define pffft_aligned_free SIMD_ALIGNED_FREE +#define pffft_aligned_malloc SIMD_ALIGNED_MALLOC +#define pffft_aligned_calloc SIMD_ALIGNED_CALLOC + +#undef inline +#define inline __inline + +#endif + + + +#include "pffft.c" + + + +#if !defined PFFT_MACROS_ONLY + +#if !defined PFFFT_SIMD_DISABLE + +static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) { + int i, Ncvec = s->Ncvec; + const v4sf * /*RESTRICT*/ va = (const v4sf*)a; + const v4sf * RESTRICT vb = (const v4sf*)b; + v4sf * /*RESTRICT*/ vab = (v4sf*)ab; + + float ar, ai, br, bi; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(va+2); + __builtin_prefetch(vb+2); + __builtin_prefetch(va+4); + __builtin_prefetch(vb+4); + __builtin_prefetch(va+6); + __builtin_prefetch(vb+6); +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + ar = ((v4sf_union*)va)[0].f[0]; + ai = ((v4sf_union*)va)[1].f[0]; + br = ((v4sf_union*)vb)[0].f[0]; + bi = ((v4sf_union*)vb)[1].f[0]; + + for (i=0; i < Ncvec; i += 2) { + v4sf ar, ai, br, bi; + ar = va[2*i+0]; ai = va[2*i+1]; + br = vb[2*i+0]; bi = vb[2*i+1]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+0] = ar; + vab[2*i+1] = ai; + ar = va[2*i+2]; ai = va[2*i+3]; + br = vb[2*i+2]; bi = vb[2*i+3]; + VCPLXMUL(ar, ai, br, bi); + vab[2*i+2] = ar; + vab[2*i+3] = ai; + } + if (s->transform == PFFFT_REAL) { + ((v4sf_union*)vab)[0].f[0] = ar*br; + ((v4sf_union*)vab)[1].f[0] = ai*bi; + } +} + +#else + +static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) { + int i, Ncvec = s->Ncvec; + + if (s->transform == PFFFT_REAL) { + /* take care of the fftpack ordering */ + ab[0] = a[0]*b[0]; + ab[2*Ncvec-1] = a[2*Ncvec-1]*b[2*Ncvec-1]; + ++ab; ++a; ++b; --Ncvec; + } + for (i=0; i < Ncvec; ++i) { + float 
ar, ai, br, bi; + ar = a[2*i+0]; ai = a[2*i+1]; + br = b[2*i+0]; bi = b[2*i+1]; + VCPLXMUL(ar, ai, br, bi); + ab[2*i+0] = ar; + ab[2*i+1] = ai; + } +} + +#endif + +#include + +static void pffft_reorder_back(int length, void * setup, float * data, float * work) +{ + memcpy(work, data, (unsigned)length * sizeof(*work)); + pffft_zreorder(setup, work, data, PFFFT_BACKWARD); +} + +#endif diff --git a/soxr/src/pffft.c b/soxr/src/pffft.c index 957e604..46c841e 100644 --- a/soxr/src/pffft.c +++ b/soxr/src/pffft.c @@ -1,4 +1,7 @@ -/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com ) +/* https://bitbucket.org/jpommier/pffft/raw/483453d8f7661058e74aa4e7cf5c27bcd7887e7a/pffft.c + * with minor changes for libsoxr. */ + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) Based on original fortran 77 code from FFTPACKv4 from NETLIB (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber @@ -57,29 +60,12 @@ - 2011/10/02, version 1: This is the very first release of this file. 
*/ -#if !defined PFFT_MACROS_ONLY #include "pffft.h" -#include "simd.h" -#include #include +#include #include #include -#define pffft_aligned_free _soxr_simd_aligned_free -#define pffft_aligned_malloc _soxr_simd_aligned_malloc -#define pffft_aligned_calloc _soxr_simd_aligned_calloc -#endif - -/* - vector support macros: the rest of the code is independant of - SSE/Altivec/NEON -- adding support for other platforms with 4-element - vectors should be limited to these macros -*/ - - -/* define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code */ -/*#define PFFFT_SIMD_DISABLE */ - /* detect compiler flavour */ #if defined(_MSC_VER) # define COMPILER_MSVC @@ -91,14 +77,25 @@ # define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) # define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) # define RESTRICT __restrict -/*# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; */ +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; #elif defined(COMPILER_MSVC) # define ALWAYS_INLINE(return_type) __forceinline return_type # define NEVER_INLINE(return_type) __declspec(noinline) return_type # define RESTRICT __restrict -/*# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (v4sf*)_alloca(size__ * sizeof(type__)) */ +# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) #endif + +/* + vector support macros: the rest of the code is independant of + SSE/Altivec/NEON -- adding support for other platforms with 4-element + vectors should be limited to these macros +*/ + + +/* define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code */ +/*#define PFFFT_SIMD_DISABLE */ + /* Altivec support macros */ @@ -136,9 +133,11 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p */ #elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) 
|| defined(_M_X64) || defined(i386) || defined(_M_IX86)) +# define SIMD_SZ 4 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */ + +#if !PFFFT_DOUBLE #include typedef __m128 v4sf; -# define SIMD_SZ 4 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */ # define VZERO() _mm_setzero_ps() # define VMUL(a,b) _mm_mul_ps(a,b) # define VADD(a,b) _mm_add_ps(a,b) @@ -151,10 +150,14 @@ typedef __m128 v4sf; # define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) # define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0) +#else +#include "pffft-avx.h" +#endif + /* ARM NEON support macros */ -#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__arm64__) || defined(__aarch64__)) +#elif !defined(PFFFT_SIMD_DISABLE) && defined(__arm__) # include typedef float32x4_t v4sf; # define SIMD_SZ 4 @@ -166,7 +169,7 @@ typedef float32x4_t v4sf; # define LD_PS1(p) vld1q_dup_f32(&(p)) # define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } # define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } -# define VTRANSPOSE4_(x0,x1,x2,x3) { \ +# define VTRANSPOSE4(x0,x1,x2,x3) { \ float32x4x2_t t0_ = vzipq_f32(x0, x2); \ float32x4x2_t t1_ = vzipq_f32(x1, x3); \ float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ @@ -174,7 +177,7 @@ typedef float32x4_t v4sf; x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ } /* marginally faster version */ -# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } +/*# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n 
vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } */ # define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) # define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0) #else @@ -184,6 +187,10 @@ typedef float32x4_t v4sf; # endif #endif +#if PFFFT_DOUBLE +#define float double +#endif + /* fallback mode for situations where SSE/Altivec are not available, use scalar mode instead */ #ifdef PFFFT_SIMD_DISABLE typedef float v4sf; @@ -200,6 +207,12 @@ typedef float v4sf; /* shortcuts for complex multiplcations */ #define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } #define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } +#ifndef SVMUL +/* multiply a scalar with a vector */ +#define SVMUL(f,v) VMUL(LD_PS1(f),v) +#endif + +#if !defined PFFT_MACROS_ONLY #if !defined(PFFFT_SIMD_DISABLE) typedef union v4sf_union { @@ -213,7 +226,8 @@ typedef union v4sf_union { #define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) /* detect bugs with the vector support macros */ -void validate_pffft_simd() { +void validate_pffft_simd(void); +void validate_pffft_simd(void) { float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 }; v4sf_union a0, a1, a2, a3, t, u; memcpy(a0.f, f, 4*sizeof(float)); @@ -229,7 +243,6 @@ void validate_pffft_simd() { printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77); t.v = VMADD(a1.v, a2.v,a0.v); printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80); - INTERLEAVE2(a1.v,a2.v,t.v,u.v); printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11); @@ -252,20 +265,23 @@ 
void validate_pffft_simd() { #endif #endif /*!PFFFT_SIMD_DISABLE */ -#if !defined PFFT_MACROS_ONLY +#if 0 +/* SSE and co like 16-bytes aligned pointers */ +#define MALLOC_V4SF_ALIGNMENT 64 /* with a 64-byte alignment, we are even aligned on L2 cache lines... */ +void *pffft_aligned_malloc(size_t nb_bytes) { + void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); + if (!p0) return (void *) 0; + p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1)))); + *((void **) p - 1) = p0; + return p; +} +void pffft_aligned_free(void *p) { + if (p) free(*((void **) p - 1)); +} -#if defined (COMPILER_MSVC) - #define sin (float)sin - #define cos (float)cos -#else - #define sin sinf - #define cos cosf -#endif - -/* int pffft_simd_size() { return SIMD_SZ; } -*/ +#endif /* passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 @@ -299,6 +315,7 @@ static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c /* passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3 */ +#if 0 static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, const float *wa2, float fsign) { static const float taur = -0.5f; @@ -311,13 +328,13 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch, for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) { for (i=0; i 2); + for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) { + for (i = 0; i < ido-1; i += 2) { + ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5)); + ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5)); + ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4)); + ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4)); + tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5)); + tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4)); + tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4)); + ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3)); + ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3)); + cr2 = VADD(cc_ref(i-1, 1), 
VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3))); + ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3))); + cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3))); + ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3))); + cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); + ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); + cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); + ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + dr3 = VSUB(cr3, ci4); + dr4 = VADD(cr3, ci4); + di3 = VADD(ci3, cr4); + di4 = VSUB(ci3, cr4); + dr5 = VADD(cr2, ci5); + dr2 = VSUB(cr2, ci5); + di5 = VSUB(ci2, cr5); + di2 = VADD(ci2, cr5); + wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; + wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1]; + VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch_ref(i - 1, 2) = dr2; + ch_ref(i, 2) = di2; + VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch_ref(i - 1, 3) = dr3; + ch_ref(i, 3) = di3; + VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); + ch_ref(i - 1, 4) = dr4; + ch_ref(i, 4) = di4; + VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); + ch_ref(i - 1, 5) = dr5; + ch_ref(i, 5) = di5; + } + } +#undef ch_ref +#undef cc_ref +} +#endif + static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) { static const float minus_one = -1.f; int i, k, l1ido = l1*ido; @@ -425,7 +515,7 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4 if (ido % 2 == 1) return; } for (k=0; k < l1ido; k += ido) { - ch[2*k + ido] = VMUL(LD_PS1(minus_one), cc[ido-1 + k + l1ido]); + ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]); ch[2*k + ido-1] = cc[k + ido-1]; } } /* radf2 */ @@ -460,10 +550,11 @@ static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, co for (k = 0; k < l1ido; k += ido) { a = cc[2*k + ido-1]; b = cc[2*k + ido]; ch[k + ido-1] = VADD(a,a); - ch[k + ido-1 + l1ido] = VMUL(LD_PS1(minus_two), b); + ch[k + 
ido-1 + l1ido] = SVMUL(minus_two, b); } } /* radb2 */ +#if 0 static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1, const float *wa2) { static const float taur = -0.5f; @@ -473,8 +564,8 @@ static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT for (k=0; k= 32); } - if (transform == PFFFT_COMPLEX) { assert(N >= 16); } + int k, m; + if (!s) return s; + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); } + if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); } /*assert((N % 32) == 0); */ s->N = N; s->transform = transform; /* nb of complex simd vectors */ s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ; s->data = (v4sf*)pffft_aligned_malloc(2*(size_t)s->Ncvec * sizeof(v4sf)); - if (!s->data) { - free(s); - return 0; - } + if (!s->data) {free(s); return 0;} s->e = (float*)s->data; s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ); @@ -988,15 +1288,22 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) { } cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac); } + + /* check that N is decomposable with allowed prime factors */ + for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; } + if (m != N/SIMD_SZ) { + pffft_destroy_setup(s); s = 0; + } + return s; } -static void pffft_destroy_setup(PFFFT_Setup *s) { - if(s){ - pffft_aligned_free(s->data); - free(s); - } +static +void pffft_destroy_setup(PFFFT_Setup *s) { + if (!s) return; + pffft_aligned_free(s->data); + free(s); } #if !defined(PFFFT_SIMD_DISABLE) @@ -1035,7 +1342,8 @@ static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { UNINTERLEAVE2(h0, g1, out[0], out[1]); } -static void pffft_zreorder(PFFFT_Setup *setup, const float 
*in, float *out, pffft_direction_t direction) { +static +void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { int k, N = setup->N, Ncvec = setup->Ncvec; const v4sf *vin = (const v4sf*)in; v4sf *vout = (v4sf*)out; @@ -1072,7 +1380,8 @@ static void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pfff } } -static void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { +static +void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ v4sf r0, i0, r1, i1, r2, i2, r3, i3; v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; @@ -1116,7 +1425,8 @@ static void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf } } -static void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { +static +void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) { int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */ v4sf r0, i0, r1, i1, r2, i2, r3, i3; v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; @@ -1342,22 +1652,23 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf } -static void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, +static +void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch, pffft_direction_t direction, int ordered) { int k, Ncvec = setup->Ncvec; int nf_odd = (setup->ifac[1] & 1); +#if 0 /* temporary buffer is allocated on the stack if the scratch pointer is NULL */ - /*int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); */ - /*VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); */ + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); +#endif - int ib = (nf_odd ^ ordered ? 
1 : 0); const v4sf *vinput = (const v4sf*)finput; v4sf *voutput = (v4sf*)foutput; v4sf *buff[2]; - buff[0] = voutput, buff[1] = scratch /*? scratch : scratch_on_stack*/; - - /*if (scratch == 0) scratch = scratch_on_stack; */ + int ib = (nf_odd ^ ordered ? 1 : 0); + buff[0] = voutput; buff[1] = scratch; assert(VALIGNED(finput) && VALIGNED(foutput)); @@ -1415,8 +1726,8 @@ static void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, fl } #if 0 -static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { - int i, Ncvec = s->Ncvec; +void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { + int Ncvec = s->Ncvec; const v4sf * RESTRICT va = (const v4sf*)a; const v4sf * RESTRICT vb = (const v4sf*)b; v4sf * RESTRICT vab = (v4sf*)ab; @@ -1434,10 +1745,16 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo __builtin_prefetch(va+6); __builtin_prefetch(vb+6); __builtin_prefetch(vab+6); +# ifndef __clang__ +# define ZCONVOLVE_USING_INLINE_NEON_ASM +# endif #endif float ar, ai, br, bi, abr, abi; +#ifndef ZCONVOLVE_USING_INLINE_ASM v4sf vscal = LD_PS1(scaling); + int i; +#endif assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); ar = ((v4sf_union*)va)[0].f[0]; @@ -1447,8 +1764,7 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo abr = ((v4sf_union*)vab)[0].f[0]; abi = ((v4sf_union*)vab)[1].f[0]; -#ifdef __arm__ -# if 1 /* inline asm version */ +#ifdef ZCONVOLVE_USING_INLINE_ASM /* inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. 
so this will be restricted to gcc */ const float *a_ = a, *b_ = b; float *ab_ = ab; int N = Ncvec; asm volatile("mov r8, %2 \n" @@ -1484,49 +1800,7 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo "subs %3, #2 \n" "bne 1b \n" : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory"); - -# else /* neon instrinsics version, 30% slower that the asm one with gcc 4.6 */ - v4sf a1r, a1i, b1r, b1i; - v4sf a2r, a2i, b2r, b2i; - v4sf ab1r, ab1i, ab2r, ab2i; - for (i=0; i < Ncvec; i += 2) { - __builtin_prefetch(va+8); - __builtin_prefetch(va+10); - - a1r = *va++; a1i = *va++; - a2r = *va++; a2i = *va++; - b1r = *vb++; b1i = *vb++; - b2r = *vb++; b2i = *vb++; - ab1r = vab[0]; ab1i = vab[1]; - ab2r = vab[2]; ab2i = vab[3]; - - v4sf z1r = VMUL(a1r, b1r); - v4sf z2r = VMUL(a2r, b2r); - v4sf z1i = VMUL(a1r, b1i); - v4sf z2i = VMUL(a2r, b2i); - - __builtin_prefetch(vb+4); - __builtin_prefetch(vb+6); - - z1r = vmlsq_f32(z1r, a1i, b1i); - z2r = vmlsq_f32(z2r, a2i, b2i); - z1i = vmlaq_f32(z1i, a1i, b1r); - z2i = vmlaq_f32(z2i, a2i, b2r); - - __builtin_prefetch(vab+4); - __builtin_prefetch(vab+6); - - ab1r = vmlaq_f32(ab1r, z1r, vscal); - ab2r = vmlaq_f32(ab2r, z2r, vscal); - ab1i = vmlaq_f32(ab1i, z1i, vscal); - ab2i = vmlaq_f32(ab2i, z2i, vscal); - - *vab++ = ab1r; *vab++ = ab1i; - *vab++ = ab2r; *vab++ = ab2i; - } -# endif - -#else /* not ARM, no need to use a special routine */ +#else /* default routine, works fine for non-arm cpus with current compilers */ for (i=0; i < Ncvec; i += 2) { v4sf ar, ai, br, bi; ar = va[2*i+0]; ai = va[2*i+1]; @@ -1548,50 +1822,14 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo } #endif -static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) { - int i, Ncvec = s->Ncvec; - const v4sf * /*RESTRICT*/ va = (const v4sf*)a; - const v4sf * RESTRICT vb = (const 
v4sf*)b; - v4sf * /*RESTRICT*/ vab = (v4sf*)ab; - - float ar, ai, br, bi; - -#ifdef __arm__ -#error -#endif - assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); - ar = ((v4sf_union*)va)[0].f[0]; - ai = ((v4sf_union*)va)[1].f[0]; - br = ((v4sf_union*)vb)[0].f[0]; - bi = ((v4sf_union*)vb)[1].f[0]; - - for (i=0; i < Ncvec; i += 2) { - v4sf ar, ai, br, bi; - ar = va[2*i+0]; ai = va[2*i+1]; - br = vb[2*i+0]; bi = vb[2*i+1]; - VCPLXMUL(ar, ai, br, bi); - vab[2*i+0] = ar; - vab[2*i+1] = ai; - ar = va[2*i+2]; ai = va[2*i+3]; - br = vb[2*i+2]; bi = vb[2*i+3]; - VCPLXMUL(ar, ai, br, bi); - vab[2*i+2] = ar; - vab[2*i+3] = ai; - } - if (s->transform == PFFFT_REAL) { - ((v4sf_union*)vab)[0].f[0] = ar*br; - ((v4sf_union*)vab)[1].f[0] = ai*bi; - } -} - - #else /* defined(PFFFT_SIMD_DISABLE) */ /* standard routine using scalar floats, without SIMD stuff. */ #define pffft_zreorder_nosimd pffft_zreorder -static void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { +static +void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) { int k, N = setup->N; if (setup->transform == PFFFT_COMPLEX) { for (k=0; k < 2*N; ++k) out[k] = in[k]; @@ -1611,19 +1849,22 @@ static void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *ou } #define pffft_transform_internal_nosimd pffft_transform_internal -static void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, +static +void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch, pffft_direction_t direction, int ordered) { int Ncvec = setup->Ncvec; int nf_odd = (setup->ifac[1] & 1); +#if 0 /* temporary buffer is allocated on the stack if the scratch pointer is NULL */ - /*int stack_allocate = (scratch == 0 ? 
Ncvec*2 : 1); */ - /*VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); */ - /*if (scratch == 0) scratch = scratch_on_stack; */ - - int ib; + int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); +#endif float *buff[2]; - buff[0] = output, buff[1] = scratch; + int ib; + /* if (scratch == 0) scratch = scratch_on_stack; */ + buff[0] = output; buff[1] = scratch; + if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */ ib = (nf_odd ^ ordered ? 1 : 0); @@ -1669,7 +1910,7 @@ static void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *inp #if 0 #define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate -static void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, +void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { int i, Ncvec = s->Ncvec; @@ -1690,40 +1931,16 @@ static void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, co } #endif -#define pffft_zconvolve_nosimd pffft_zconvolve -static void pffft_zconvolve_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab) { - int i, Ncvec = s->Ncvec; - - if (s->transform == PFFFT_REAL) { - /* take care of the fftpack ordering */ - ab[0] = a[0]*b[0]; - ab[2*Ncvec-1] = a[2*Ncvec-1]*b[2*Ncvec-1]; - ++ab; ++a; ++b; --Ncvec; - } - for (i=0; i < Ncvec; ++i) { - float ar, ai, br, bi; - ar = a[2*i+0]; ai = a[2*i+1]; - br = b[2*i+0]; bi = b[2*i+1]; - VCPLXMUL(ar, ai, br, bi); - ab[2*i+0] = ar; - ab[2*i+1] = ai; - } -} - #endif /* defined(PFFFT_SIMD_DISABLE) */ -static void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { +static +void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0); } -static 
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { +static +void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) { pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1); } - -static void pffft_reorder_back(int length, void * setup, float * data, float * work) -{ - memcpy(work, data, (unsigned)length * sizeof(*work)); - pffft_zreorder(setup, work, data, PFFFT_BACKWARD); -} #endif diff --git a/soxr/src/pffft.h b/soxr/src/pffft.h index 78d936b..63522ca 100644 --- a/soxr/src/pffft.h +++ b/soxr/src/pffft.h @@ -1,4 +1,9 @@ -/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com ) +/* https://bitbucket.org/jpommier/pffft/raw/483453d8f7661058e74aa4e7cf5c27bcd7887e7a/pffft.h + * with minor changes for libsoxr. */ + +#if !defined PFFT_MACROS_ONLY + +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) Based on original fortran 77 code from FFTPACKv4 from NETLIB, authored by Dr Paul Swarztrauber of NCAR, in 1985. @@ -60,8 +65,9 @@ - 1D transforms only, with 32-bit single precision. - supports only transforms for inputs of length N of the form - N=(2^a)*(3^b), a >= 5 and b >=0 (32, 48, 64, 96, 128, 144 etc - are all acceptable lengths). Performance is best for 128<=N<=8192. + N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128, + 144, 160, etc are all acceptable lengths). Performance is best for + 128<=N<=8192. - all (float*) pointers in the functions below are expected to have an "simd-compatible" alignment, that is 16 bytes on x86 and @@ -80,6 +86,10 @@ #ifdef __cplusplus extern "C" { +#endif + +#if PFFFT_DOUBLE +#define float double #endif /* opaque struct holding internal stuff (precomputed twiddle factors) @@ -99,8 +109,10 @@ extern "C" { PFFFT_Setup structure is read-only so it can safely be shared by multiple concurrent threads. 
*/ - static PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); - static void pffft_destroy_setup(PFFFT_Setup *); + static + PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform); + static + void pffft_destroy_setup(PFFFT_Setup *); /* Perform a Fourier transform , The z-domain data is stored in the most efficient order for transforming it back, or using it for @@ -113,13 +125,14 @@ extern "C" { Typically you will want to scale the backward transform by 1/N. The 'work' pointer should point to an area of N (2*N for complex - fft) floats, properly aligned. [del]If 'work' is NULL, then stack will - be used instead (this is probably the beest strategy for small - FFTs, say for N < 16384).[/del] + fft) floats, properly aligned. If 'work' is NULL, then stack will + be used instead (this is probably the best strategy for small + FFTs, say for N < 16384). input and output may alias. */ - static void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); + static + void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); /* Similar to pffft_transform, but makes sure that the output is @@ -128,7 +141,8 @@ extern "C" { input and output may alias. */ - static void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); + static + void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction); /* call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(..., @@ -142,7 +156,8 @@ extern "C" { input and output should not alias. 
*/ - static void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); + static + void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction); /* Perform a multiplication of the frequency components of dft_a and @@ -155,23 +170,28 @@ extern "C" { the operation performed is: dft_ab += (dft_a * fdt_b)*scaling The dft_a, dft_b and dft_ab pointers may alias. - void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); */ + void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling); /* - the operation performed is: dft_ab = (dft_a * fdt_b) - - The dft_a, dft_b and dft_ab pointers may alias. + the float buffers must have the correct alignment (16-byte boundary + on intel and powerpc). This function may be used to obtain such + correctly aligned buffers. */ - static void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab); +#if 0 + void *pffft_aligned_malloc(size_t nb_bytes); + void pffft_aligned_free(void *); /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */ - int pffft_simd_size(void); + int pffft_simd_size(); +#endif - static void pffft_reorder_back(int length, void * setup, float * data, float * work); +#undef float #ifdef __cplusplus } #endif #endif + +#endif diff --git a/soxr/src/pffft32.c b/soxr/src/pffft32.c index 21bd845..c4c8e0a 100644 --- a/soxr/src/pffft32.c +++ b/soxr/src/pffft32.c @@ -1,11 +1,14 @@ /* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ -#define _soxr_simd_aligned_free free -#define _soxr_simd_aligned_malloc malloc +#define SIMD_ALIGNED_FREE free +#define SIMD_ALIGNED_MALLOC malloc #define PFFFT_SIMD_DISABLE -#include "pffft.c" +#define PFFFT_DOUBLE 0 +#include "pffft-wrap.c" + #include "filter.h" +#include "rdft_t.h" static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);} static void delete_setup(void * setup) {pffft_destroy_setup(setup);} @@ -15,18 +18,22 @@ static void backward (int length, void * setup, float * H, float * scratch) {pff static void obackward(int length, void * setup, float * H, float * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;} static void convolve(int length, void * setup, float * H, float const * with) { pffft_zconvolve(setup, H, with, H); (void)length;} static int multiplier(void) {return 1;} +static int flags(void) {return RDFT_NEEDS_SCRATCH;} -typedef void (* fn_t)(void); -fn_t _soxr_rdft32_cb[] = { - (fn_t)setup, - (fn_t)setup, - (fn_t)delete_setup, - (fn_t)forward, - (fn_t)oforward, - (fn_t)backward, - (fn_t)obackward, - (fn_t)convolve, - (fn_t)_soxr_ordered_partial_convolve_f, - (fn_t)multiplier, - (fn_t)pffft_reorder_back, +rdft_cb_table _soxr_rdft32_cb = { + setup, + setup, + delete_setup, + forward, + oforward, + backward, + obackward, + convolve, + _soxr_ordered_partial_convolve_f, + multiplier, + pffft_reorder_back, + malloc, + calloc, + free, + flags, }; diff --git a/soxr/src/pffft32s.c b/soxr/src/pffft32s.c index d049990..06f8fd5 100644 --- a/soxr/src/pffft32s.c +++ b/soxr/src/pffft32s.c @@ -1,27 +1,34 @@ /* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ -#include "pffft.c" +#define PFFFT_DOUBLE 0 +#include "pffft-wrap.c" + +#include "rdft_t.h" static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);} static void forward (int length, void * setup, float * h, float * scratch) {pffft_transform (setup, h, h, scratch, PFFFT_FORWARD); (void)length;} static void oforward (int length, void * setup, float * h, float * scratch) {pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD); (void)length;} static void backward (int length, void * setup, float * H, float * scratch) {pffft_transform (setup, H, H, scratch, PFFFT_BACKWARD);(void)length;} static void obackward(int length, void * setup, float * H, float * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;} -static void convolve(int length, void * setup, float * H, float const * with) { pffft_zconvolve(setup, H, with, H); (void)length;} +static void convolve(int length, void * setup, float * H, float const * with) {pffft_zconvolve(setup, H, with, H); (void)length;} static int multiplier(void) {return 1;} +static int flags(void) {return RDFT_IS_SIMD | RDFT_NEEDS_SCRATCH;} -typedef void (* fn_t)(void); -fn_t _soxr_rdft32s_cb[] = { - (fn_t)setup, - (fn_t)setup, - (fn_t)pffft_destroy_setup, - (fn_t)forward, - (fn_t)oforward, - (fn_t)backward, - (fn_t)obackward, - (fn_t)convolve, - (fn_t)_soxr_ordered_partial_convolve_simd, - (fn_t)multiplier, - (fn_t)pffft_reorder_back, +rdft_cb_table _soxr_rdft32s_cb = { + setup, + setup, + pffft_destroy_setup, + forward, + oforward, + backward, + obackward, + convolve, + ORDERED_PARTIAL_CONVOLVE_SIMD, + multiplier, + pffft_reorder_back, + SIMD_ALIGNED_MALLOC, + SIMD_ALIGNED_CALLOC, + SIMD_ALIGNED_FREE, + flags, }; diff --git a/soxr/src/pffft64s.c b/soxr/src/pffft64s.c new file mode 100644 index 0000000..82f6504 --- /dev/null +++ b/soxr/src/pffft64s.c @@ -0,0 +1,34 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See 
LICENCE for details. */ + +#define PFFFT_DOUBLE 1 +#include "pffft-wrap.c" + +#include "rdft_t.h" + +static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);} +static void forward (int length, void * setup, double * h, double * scratch) {pffft_transform (setup, h, h, scratch, PFFFT_FORWARD); (void)length;} +static void oforward (int length, void * setup, double * h, double * scratch) {pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD); (void)length;} +static void backward (int length, void * setup, double * H, double * scratch) {pffft_transform (setup, H, H, scratch, PFFFT_BACKWARD);(void)length;} +static void obackward(int length, void * setup, double * H, double * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;} +static void convolve(int length, void * setup, double * H, double const * with) {pffft_zconvolve(setup, H, with, H); (void)length;} +static int multiplier(void) {return 1;} +static int flags(void) {return RDFT_IS_SIMD | RDFT_NEEDS_SCRATCH;} + +rdft_cb_table _soxr_rdft64s_cb = { + setup, + setup, + pffft_destroy_setup, + forward, + oforward, + backward, + obackward, + convolve, + ORDERED_PARTIAL_CONVOLVE_SIMD, + multiplier, + pffft_reorder_back, + SIMD_ALIGNED_MALLOC, + SIMD_ALIGNED_CALLOC, + SIMD_ALIGNED_FREE, + flags, +}; diff --git a/soxr/src/poly-fir.h b/soxr/src/poly-fir.h index f7b4261..d138e03 100644 --- a/soxr/src/poly-fir.h +++ b/soxr/src/poly-fir.h @@ -1,97 +1,149 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ -/* Resample using an interpolated poly-phase FIR with length LEN.*/ -/* Input must be followed by LEN-1 samples. */ +/* Resample using an interpolated poly-phase FIR with length LEN. */ +/* Input must be followed by FIR_LENGTH-1 samples. 
*/ -#define a (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 0,j)) -#define b (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 1,j)) -#define c (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 2,j)) -#define d (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 3,j)) -#if COEF_INTERP == 0 - #define _ sum += a *in[j], ++j; -#elif COEF_INTERP == 1 - #define _ sum += (b *x + a)*in[j], ++j; -#elif COEF_INTERP == 2 - #define _ sum += ((c *x + b)*x + a)*in[j], ++j; -#elif COEF_INTERP == 3 - #define _ sum += (((d*x + c)*x + b)*x + a)*in[j], ++j; -#else +#if COEF_INTERP != 1 && COEF_INTERP != 2 && COEF_INTERP != 3 #error COEF_INTERP #endif +#if SIMD_AVX || SIMD_SSE || SIMD_NEON + #define N (FIR_LENGTH>>2) + + #if COEF_INTERP == 1 + #define _ sum=vMac(vMac(b,X,a),vLdu(in+j*4),sum), ++j; + #elif COEF_INTERP == 2 + #define _ sum=vMac(vMac(vMac(c,X,b),X,a),vLdu(in+j*4),sum), ++j; + #else + #define _ sum=vMac(vMac(vMac(vMac(d,X,c),X,b),X,a),vLdu(in+j*4),sum), ++j; + #endif + + #define a coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-0)] + #define b coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-1)] + #define c coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-2)] + #define d coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-3)] + + #define BEGINNING v4_t X = vLds(x), sum = vZero(); \ + v4_t const * const __restrict coefs = (v4_t *)COEFS + #define END vStorSum(output+i, sum) + #define cc(n) case n: core(n); break + #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);} +#else + #define N FIR_LENGTH + + #if COEF_INTERP == 1 + #define _ sum += (b*x + a)*in[j], ++j; + #elif COEF_INTERP == 2 + #define _ sum += ((c*x + b)*x + a)*in[j], ++j; + #else + #define _ sum += (((d*x + c)*x + b)*x + a)*in[j], ++j; + #endif + + #define a (coef(COEFS, COEF_INTERP, N, phase, 0,j)) + #define b (coef(COEFS, COEF_INTERP, N, phase, 1,j)) + #define c (coef(COEFS, COEF_INTERP, N, phase, 2,j)) + #define d (coef(COEFS, 
COEF_INTERP, N, phase, 3,j)) + + #define BEGINNING sample_t sum = 0 + #define END output[i] = sum + #define CORE(n) core(n) +#endif + + + +#define floatPrecCore(n) { \ + float_step_t at = p->at.flt; \ + for (i = 0; (int)at < num_in; ++i, at += p->step.flt) { \ + sample_t const * const __restrict in = input + (int)at; \ + float_step_t frac = at - (int)at; \ + int phase = (int)(frac * (1 << PHASE_BITS)); \ + sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase); \ + int j = 0; \ + BEGINNING; CONVOLVE(n); END; \ + } \ + fifo_read(&p->fifo, (int)at, NULL); \ + p->at.flt = at - (int)at; } /* Could round to 1 in some cirmcumstances. */ + + + +#define highPrecCore(n) { \ + step_t at; at.fix = p->at.fix; \ + for (i = 0; at.integer < num_in; ++i, \ + at.fix.ls.all += p->step.fix.ls.all, \ + at.whole += p->step.whole + (at.fix.ls.all < p->step.fix.ls.all)) { \ + sample_t const * const __restrict in = input + at.integer; \ + uint32_t frac = at.fraction; \ + int phase = (int)(frac >> (32 - PHASE_BITS)); /* High-order bits */ \ + /* Low-order bits, scaled to [0,1): */ \ + sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \ + int j = 0; \ + BEGINNING; CONVOLVE(n); END; \ + } \ + fifo_read(&p->fifo, at.integer, NULL); \ + p->at.whole = at.fraction; \ + p->at.fix.ls = at.fix.ls; } + + + +#define stdPrecCore(n) { \ + int64p_t at; at.all = p->at.whole; \ + for (i = 0; at.parts.ms < num_in; ++i, at.all += p->step.whole) { \ + sample_t const * const __restrict in = input + at.parts.ms; \ + uint32_t const frac = at.parts.ls; \ + int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ \ + /* Low-order bits, scaled to [0,1): */ \ + sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \ + int j = 0; \ + BEGINNING; CONVOLVE(n); END; \ + } \ + fifo_read(&p->fifo, at.parts.ms, NULL); \ + p->at.whole = at.parts.ls; } + + + +#if WITH_FLOAT_STD_PREC_CLOCK + #define SPCORE floatPrecCore +#else + #define SPCORE stdPrecCore +#endif + + + +#if WITH_HI_PREC_CLOCK 
+ #define core(n) if (p->use_hi_prec_clock) highPrecCore(n) else SPCORE(n) +#else + #define core(n) SPCORE(n) +#endif + + + static void FUNCTION(stage_t * p, fifo_t * output_fifo) { sample_t const * input = stage_read_p(p); - int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio); - sample_t * output = fifo_reserve(output_fifo, max_num_out); + int num_in = min(stage_occupancy(p), p->input_size); + int i, max_num_out = 1 + (int)(num_in * p->out_in_ratio); + sample_t * const __restrict output = fifo_reserve(output_fifo, max_num_out); -#if defined HI_PREC_CLOCK -#if FLOAT_HI_PREC_CLOCK - if (p->use_hi_prec_clock) { - float_step_t at = p->at.flt; - for (i = 0; (int)at < num_in; ++i, at += p->step.flt) { - sample_t const * in = input + (int)at; - float_step_t frac = at - (int)at; - int phase = (int)(frac * (1 << PHASE_BITS)); -#if COEF_INTERP > 0 - sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase); -#endif - sample_t sum = 0; - int j = 0; - CONVOLVE - output[i] = sum; - } - fifo_read(&p->fifo, (int)at, NULL); - p->at.flt = at - (int)at; - } else -#else - if (p->use_hi_prec_clock) { - for (i = 0; p->at.integer < num_in; ++i, - p->at.fix.ls.all += p->step.fix.ls.all, - p->at.whole += p->step.whole + (p->at.fix.ls.all < p->step.fix.ls.all)) { - sample_t const * in = input + p->at.integer; - uint32_t frac = p->at.fraction; - int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ -#if COEF_INTERP > 0 /* low-order bits, scaled to [0,1) */ - sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); -#endif - sample_t sum = 0; - int j = 0; - CONVOLVE - output[i] = sum; - } - fifo_read(&p->fifo, p->at.integer, NULL); - p->at.integer = 0; - } else -#endif -#endif - { - for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) { - sample_t const * in = input + p->at.integer; - uint32_t frac = p->at.fraction; - int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ -#if COEF_INTERP > 0 /* low-order 
bits, scaled to [0,1) */ - sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); -#endif - sample_t sum = 0; - int j = 0; - CONVOLVE - output[i] = sum; - } - fifo_read(&p->fifo, p->at.integer, NULL); - p->at.integer = 0; - } + CORE(N); assert(max_num_out - i >= 0); fifo_trim_by(output_fifo, max_num_out - i); } + + #undef _ #undef a #undef b #undef c #undef d +#undef CORE +#undef cc +#undef core #undef COEF_INTERP +#undef N +#undef BEGINNING +#undef END #undef CONVOLVE #undef FIR_LENGTH #undef FUNCTION diff --git a/soxr/src/poly-fir0.h b/soxr/src/poly-fir0.h index 52d85b3..76fca2d 100644 --- a/soxr/src/poly-fir0.h +++ b/soxr/src/poly-fir0.h @@ -1,32 +1,56 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ -/* Resample using a non-interpolated poly-phase FIR with length LEN.*/ -/* Input must be followed by LEN-1 samples. */ +/* Resample using a non-interpolated poly-phase FIR with length LEN. */ +/* Input must be followed by FIR_LENGTH-1 samples. 
*/ -#define _ sum += (coef(p->shared->poly_fir_coefs, 0, FIR_LENGTH, rem, 0, j)) *at[j], ++j; +#if SIMD_AVX || SIMD_SSE || SIMD_NEON + #define N (FIR_LENGTH>>2) + #define BEGINNING v4_t sum = vZero(); \ + v4_t const * const __restrict coefs = (v4_t *)COEFS + N * rem; + #define _ sum = vMac(vLdu(at+j*4), coefs[j], sum), ++j; + #define END vStorSum(output+i, sum) + #define cc(n) case n: core(n); break + #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);} +#else + #define N FIR_LENGTH + #define BEGINNING sample_t sum = 0; \ + sample_t const * const __restrict coefs = (sample_t *)COEFS + N * rem; + #define _ sum += coefs[j]*at[j], ++j; + #define END output[i] = sum + #define CORE(n) core(n) +#endif + +#define core(n) \ + for (i = 0; at < num_in * p->L; ++i, at += step) { \ + int const div = at / p->L, rem = at % p->L; \ + sample_t const * const __restrict at = input + div; \ + int j = 0; BEGINNING; CONVOLVE(n); END;} static void FUNCTION(stage_t * p, fifo_t * output_fifo) { - sample_t const * input = stage_read_p(p); - int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio); - sample_t * output = fifo_reserve(output_fifo, max_num_out); + int num_in = min(stage_occupancy(p), p->input_size); + if (num_in) { + sample_t const * input = stage_read_p(p); + int at = p->at.integer, step = p->step.integer; + int i, num_out = (num_in * p->L - at + step - 1) / step; + sample_t * __restrict output = fifo_reserve(output_fifo, num_out); - for (i = 0; p->at.integer < num_in * p->L; ++i, p->at.integer += p->step.integer) { - int div = p->at.integer / p->L, rem = p->at.integer % p->L; - sample_t const * at = input + div; - sample_t sum = 0; - int j = 0; - CONVOLVE - output[i] = sum; + CORE(N); + assert(i == num_out); + fifo_read(&p->fifo, at / p->L, NULL); + p->at.integer = at % p->L; } - assert(max_num_out - i >= 0); - fifo_trim_by(output_fifo, max_num_out - i); - fifo_read(&p->fifo, p->at.integer / p->L, NULL); - p->at.integer 
= p->at.integer % p->L; } #undef _ +#undef CORE +#undef cc +#undef core +#undef N +#undef BEGINNING +#undef MIDDLE +#undef END #undef CONVOLVE #undef FIR_LENGTH #undef FUNCTION diff --git a/soxr/src/rate.h b/soxr/src/rate.h deleted file mode 100644 index f6d055a..0000000 --- a/soxr/src/rate.h +++ /dev/null @@ -1,726 +0,0 @@ -/* SoX Resampler Library Copyright (c) 2007-14 robs@users.sourceforge.net - * Licence for this file: LGPL v2.1 See LICENCE for details. */ - -#include -#include -#include -#include - -#include "filter.h" - -#if defined SOXR_LIB -#include "internal.h" - -typedef void (* fn_t)(void); -extern fn_t RDFT_CB[11]; - -#define rdft_forward_setup (*(void * (*)(int))RDFT_CB[0]) -#define rdft_backward_setup (*(void * (*)(int))RDFT_CB[1]) -#define rdft_delete_setup (*(void (*)(void *))RDFT_CB[2]) -#define rdft_forward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[3]) -#define rdft_oforward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[4]) -#define rdft_backward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[5]) -#define rdft_obackward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[6]) -#define rdft_convolve (*(void (*)(int, void *, sample_t *, sample_t const *))RDFT_CB[7]) -#define rdft_convolve_portion (*(void (*)(int, sample_t *, sample_t const *))RDFT_CB[8]) -#define rdft_multiplier (*(int (*)(void))RDFT_CB[9]) -#define rdft_reorder_back (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[10]) - -#endif - -#if RATE_SIMD /* Align for SIMD: */ - #include "simd.h" -#if 0 /* Not using this yet. 
*/ - #define RATE_SIMD_POLY 1 - #define num_coefs4 ((num_coefs + 3) & ~3) - #define coefs4_check(i) ((i) < num_coefs) -#else - #define RATE_SIMD_POLY 0 - #define num_coefs4 num_coefs - #define coefs4_check(i) 1 -#endif - - #define aligned_free _soxr_simd_aligned_free - #define aligned_malloc _soxr_simd_aligned_malloc - #define aligned_calloc _soxr_simd_aligned_calloc -#if 0 - #define FIFO_REALLOC aligned_realloc - #define FIFO_MALLOC aligned_malloc - #define FIFO_FREE aligned_free - - static void * aligned_realloc(void * q, size_t nb_bytes, size_t copy_bytes) { - void * p = aligned_malloc(nb_bytes); - if (p) memcpy(p, q, copy_bytes); - aligned_free(q); - return p; - } -#endif -#else - #define RATE_SIMD_POLY 0 - #define num_coefs4 num_coefs - #define coefs4_check(i) 1 - - #define aligned_free free - #define aligned_malloc malloc - #define aligned_calloc calloc -#endif - -#define FIFO_SIZE_T int -#include "fifo.h" - -typedef union { /* Int64 in parts */ - #if WORDS_BIGENDIAN - struct {int32_t ms; uint32_t ls;} parts; - #else - struct {uint32_t ls; int32_t ms;} parts; - #endif - int64_t all; -} int64p_t; - -typedef union { /* Uint64 in parts */ - #if WORDS_BIGENDIAN - struct {uint32_t ms, ls;} parts; - #else - struct {uint32_t ls, ms;} parts; - #endif - uint64_t all; -} uint64p_t; - -#define FLOAT_HI_PREC_CLOCK 0 /* Non-float hi-prec has ~96 bits. 
*/ -#define float_step_t long double /* __float128 is also a (slow) option */ - -#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) coef_p[(fir_len) * ((interp_order) + 1) * (phase_num) + ((interp_order) + 1) * (fir_coef_num) + (interp_order - coef_interp_num)] - -#define raw_coef_t double - -static sample_t * prepare_coefs(raw_coef_t const * coefs, int num_coefs, - int num_phases, int interp_order, double multiplier) -{ - int i, j, length = num_coefs4 * num_phases; - sample_t * result = malloc((size_t)(length * (interp_order + 1)) * sizeof(*result)); - double fm1 = coefs[0], f1 = 0, f2 = 0; - - for (i = num_coefs4 - 1; i >= 0; --i) - for (j = num_phases - 1; j >= 0; --j) { - double f0 = fm1, b = 0, c = 0, d = 0; /* = 0 to kill compiler warning */ - int pos = i * num_phases + j - 1; - fm1 = coefs4_check(i) && pos > 0 ? coefs[pos - 1] * multiplier : 0; - switch (interp_order) { - case 1: b = f1 - f0; break; - case 2: b = f1 - (.5 * (f2+f0) - f1) - f0; c = .5 * (f2+f0) - f1; break; - case 3: c=.5*(f1+fm1)-f0;d=(1/6.)*(f2-f1+fm1-f0-4*c);b=f1-f0-d-c; break; - default: if (interp_order) assert(0); - } - #define coef_coef(x) \ - coef(result, interp_order, num_coefs4, j, x, num_coefs4 - 1 - i) - coef_coef(0) = (sample_t)f0; - if (interp_order > 0) coef_coef(1) = (sample_t)b; - if (interp_order > 1) coef_coef(2) = (sample_t)c; - if (interp_order > 2) coef_coef(3) = (sample_t)d; - #undef coef_coef - f2 = f1, f1 = f0; - } - return result; -} - -typedef struct { - int dft_length, num_taps, post_peak; - void * dft_forward_setup, * dft_backward_setup; - sample_t * coefs; -} dft_filter_t; - -typedef struct { /* So generated filter coefs may be shared between channels */ - sample_t * poly_fir_coefs; - dft_filter_t dft_filter[2]; -} rate_shared_t; - -typedef enum { - irrational_stage = 1, - cubic_stage, - dft_stage, - half_stage, - rational_stage -} stage_type_t; - -struct stage; -typedef void (* stage_fn_t)(struct stage * input, fifo_t * 
output); -#define MULT32 (65536. * 65536.) - -typedef union { /* Fixed point arithmetic */ - struct {uint64p_t ls; int64p_t ms;} fix; - float_step_t flt; -} step_t; - -typedef struct stage { - /* Common to all stage types: */ - stage_type_t type; - stage_fn_t fn; - fifo_t fifo; - int pre; /* Number of past samples to store */ - int pre_post; /* pre + number of future samples to store */ - int preload; /* Number of zero samples to pre-load the fifo */ - double out_in_ratio; /* For buffer management. */ - - /* For a stage with variable (run-time generated) filter coefs: */ - rate_shared_t * shared; - unsigned dft_filter_num; /* Which, if any, of the 2 DFT filters to use */ - sample_t * dft_scratch, * dft_out; - - /* For a stage with variable L/M: */ - step_t at, step; - bool use_hi_prec_clock; - int L, remM; - int n, phase_bits, block_len; - double mult, phase0; -} stage_t; - -#define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post) -#define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre) - -static void cubic_stage_fn(stage_t * p, fifo_t * output_fifo) -{ - int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio); - sample_t const * input = stage_read_p(p); - sample_t * output = fifo_reserve(output_fifo, max_num_out); - -#define integer fix.ms.parts.ms -#define fraction fix.ms.parts.ls -#define whole fix.ms.all - for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) { - sample_t const * s = input + p->at.integer; - double x = p->at.fraction * (1 / MULT32); - double b = .5*(s[1]+s[-1])-*s, a = (1/6.)*(s[2]-s[1]+s[-1]-*s-4*b); - double c = s[1]-*s-a-b; - output[i] = (sample_t)(p->mult * (((a*x + b)*x + c)*x + *s)); - } - assert(max_num_out - i >= 0); - fifo_trim_by(output_fifo, max_num_out - i); - fifo_read(&p->fifo, p->at.integer, NULL); - p->at.integer = 0; -} - -#if RATE_SIMD - #define dft_out p->dft_out -#else - #define dft_out output -#endif - -static void dft_stage_fn(stage_t * p, 
fifo_t * output_fifo) -{ - sample_t * output; - int i, j, num_in = max(0, fifo_occupancy(&p->fifo)); - rate_shared_t const * s = p->shared; - dft_filter_t const * f = &s->dft_filter[p->dft_filter_num]; - int const overlap = f->num_taps - 1; - - while (p->at.integer + p->L * num_in >= f->dft_length) { - div_t divd = div(f->dft_length - overlap - p->at.integer + p->L - 1, p->L); - sample_t const * input = fifo_read_ptr(&p->fifo); - fifo_read(&p->fifo, divd.quot, NULL); - num_in -= divd.quot; - - output = fifo_reserve(output_fifo, f->dft_length); - - if (lsx_is_power_of_2(p->L)) { /* F-domain */ - int portion = f->dft_length / p->L; - memcpy(dft_out, input, (unsigned)portion * sizeof(*dft_out)); - rdft_oforward(portion, f->dft_forward_setup, dft_out, p->dft_scratch); - for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */ - dft_out[i] = dft_out[(portion << 1) - i], - dft_out[i+1] = -dft_out[(portion << 1) - i + 1]; - dft_out[portion] = dft_out[1]; - dft_out[portion + 1] = 0; - dft_out[1] = dft_out[0]; - - for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) { - memcpy(dft_out + i, dft_out, (size_t)portion * sizeof(*dft_out)); - dft_out[i + 1] = 0; - } - if (p->step.integer > 0) - rdft_reorder_back(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch); - } else { - if (p->L == 1) - memcpy(dft_out, input, (size_t)f->dft_length * sizeof(*dft_out)); - else { - memset(dft_out, 0, (size_t)f->dft_length * sizeof(*dft_out)); - for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L) - dft_out[i] = input[j]; - p->at.integer = p->L - 1 - divd.rem; - } - if (p->step.integer > 0) - rdft_forward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch); - else - rdft_oforward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch); - } - - if (p->step.integer > 0) { - rdft_convolve(f->dft_length, f->dft_backward_setup, dft_out, f->coefs); - rdft_backward(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch); -#if 
RATE_SIMD - if (p->step.integer == 1) - memcpy(output, dft_out, (size_t)f->dft_length * sizeof(sample_t)); -#endif - if (p->step.integer != 1) { - for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j, - i += p->step.integer) - output[j] = dft_out[i]; - p->remM = i - (f->dft_length - overlap); - fifo_trim_by(output_fifo, f->dft_length - j); - } - else fifo_trim_by(output_fifo, overlap); - } - else { /* F-domain */ - int m = -p->step.integer; - rdft_convolve_portion(f->dft_length >> m, dft_out, f->coefs); - rdft_obackward(f->dft_length >> m, f->dft_backward_setup, dft_out, p->dft_scratch); -#if RATE_SIMD - memcpy(output, dft_out, (size_t)(f->dft_length >> m) * sizeof(sample_t)); -#endif - fifo_trim_by(output_fifo, (((1 << m) - 1) * f->dft_length + overlap) >>m); - } - } -} - -#undef dft_out - -/* Set to 4 x nearest power of 2 */ -/* or half of that if danger of causing too many cache misses. */ -static int set_dft_length(int num_taps, int min, int large) -{ - double d = log((double)num_taps) / log(2.); - return 1 << range_limit((int)(d + 2.77), min, max((int)(d + 1.77), large)); -} - -static void dft_stage_init( - unsigned instance, double Fp, double Fs, double Fn, double att, - double phase, stage_t * p, int L, int M, double * multiplier, - int min_dft_size, int large_dft_size) -{ - dft_filter_t * f = &p->shared->dft_filter[instance]; - int num_taps = 0, dft_length = f->dft_length, i; - bool f_domain_m = abs(3-M) == 1 && Fs <= 1; - - if (!dft_length) { - int k = phase == 50 && lsx_is_power_of_2(L) && Fn == L? L << 1 : 4; - double * h = lsx_design_lpf(Fp, Fs, Fn, att, &num_taps, -k, -1.); - - if (phase != 50) - lsx_fir_to_phase(&h, &num_taps, &f->post_peak, phase); - else f->post_peak = num_taps / 2; - - dft_length = set_dft_length(num_taps, min_dft_size, large_dft_size); - f->coefs = aligned_calloc((size_t)dft_length, sizeof(*f->coefs)); - for (i = 0; i < num_taps; ++i) - f->coefs[(i + dft_length - num_taps + 1) & (dft_length - 1)] - = (sample_t)(h[i] * ((1. 
/ dft_length) * rdft_multiplier() * L * *multiplier)); - free(h); - } - -#if RATE_SIMD - p->dft_out = aligned_malloc(sizeof(sample_t) * (size_t)dft_length); -#endif -#if 1 /* In fact, currently, only pffft needs this. */ - p->dft_scratch = aligned_malloc(2 * sizeof(sample_t) * (size_t)dft_length); -#endif - - if (!f->dft_length) { - void * coef_setup = rdft_forward_setup(dft_length); - int Lp = lsx_is_power_of_2(L)? L : 1; - int Mp = f_domain_m? M : 1; - f->dft_forward_setup = rdft_forward_setup(dft_length / Lp); - f->dft_backward_setup = rdft_backward_setup(dft_length / Mp); - if (Mp == 1) - rdft_forward(dft_length, coef_setup, f->coefs, p->dft_scratch); - else - rdft_oforward(dft_length, coef_setup, f->coefs, p->dft_scratch); - rdft_delete_setup(coef_setup); - f->num_taps = num_taps; - f->dft_length = dft_length; - lsx_debug("fir_len=%i dft_length=%i Fp=%g Fs=%g Fn=%g att=%g %i/%i", - num_taps, dft_length, Fp, Fs, Fn, att, L, M); - } - *multiplier = 1; - p->out_in_ratio = (double)L / M; - p->type = dft_stage; - p->fn = dft_stage_fn; - p->preload = f->post_peak / L; - p->at.integer = f->post_peak % L; - p->L = L; - p->step.integer = f_domain_m? -M/2 : M; - p->dft_filter_num = instance; - p->block_len = f->dft_length - (f->num_taps - 1); - p->phase0 = p->at.integer / p->L; -} - -#include "filters.h" - -typedef struct { - double factor; - uint64_t samples_in, samples_out; - int num_stages; - stage_t * stages; -} rate_t; - -#define pre_stage p->stages[shift] -#define arb_stage p->stages[shift + have_pre_stage] -#define post_stage p->stages[shift + have_pre_stage + have_arb_stage] -#define have_pre_stage (preM * preL != 1) -#define have_arb_stage (arbM * arbL != 1) -#define have_post_stage (postM * postL != 1) - -#define TO_3dB(a) ((1.6e-6*a-7.5e-4)*a+.646) -#define LOW_Q_BW0 (1385 / 2048.) /* 0.67625 rounded to be a FP exact. 
*/ - -typedef enum { - rolloff_none, rolloff_small /* <= 0.01 dB */, rolloff_medium /* <= 0.35 dB */ -} rolloff_t; - - -static char const * rate_init( - /* Private work areas (to be supplied by the client): */ - rate_t * p, /* Per audio channel. */ - rate_shared_t * shared, /* Between channels (undergoing same rate change)*/ - - /* Public parameters: Typically */ - double factor, /* Input rate divided by output rate. */ - double bits, /* Required bit-accuracy (pass + stop) 16|20|28 */ - double phase, /* Linear/minimum etc. filter phase. 50 */ - double passband_end, /* 0dB pt. bandwidth to preserve; nyquist=1 0.913*/ - double stopband_begin, /* Aliasing/imaging control; > passband_end 1 */ - rolloff_t rolloff, /* Pass-band roll-off small */ - bool maintain_3dB_pt, /* true */ - double multiplier, /* Linear gain to apply during conversion. 1 */ - - /* Primarily for test/development purposes: */ - bool use_hi_prec_clock, /* Increase irrational ratio accuracy. false */ - int interpolator, /* Force a particular coef interpolator. -1 */ - size_t max_coefs_size, /* k bytes of coefs to try to keep below. 400 */ - bool noSmallIntOpt, /* Disable small integer optimisations. false */ - int log2_min_dft_size, - int log2_large_dft_size) -{ - double att = (bits + 1) * linear_to_dB(2.), attArb = att; /* pass + stop */ - double tbw0 = 1 - passband_end, Fs_a = stopband_begin; - double arbM = factor, tbw_tighten = 1; - int n = 0, i, preL = 1, preM = 1, shift = 0, arbL = 1, postL = 1, postM = 1; - bool upsample = false, rational = false, iOpt = !noSmallIntOpt; - int mode = rolloff > rolloff_small? 
factor > 1 || passband_end > LOW_Q_BW0: - (int)ceil(2 + (bits - 17) / 4); - stage_t * s; - - assert(factor > 0); - assert(!bits || (15 <= bits && bits <= 33)); - assert(0 <= phase && phase <= 100); - assert(.53 <= passband_end); - assert(stopband_begin <= 1.2); - assert(passband_end + .005 < stopband_begin); - - p->factor = factor; - if (bits) while (!n++) { /* Determine stages: */ - int try, L, M, x, maxL = interpolator > 0? 1 : mode? 2048 : - (int)ceil((double)max_coefs_size * 1000. / (U100_l * sizeof(sample_t))); - double d, epsilon = 0, frac; - upsample = arbM < 1; - for (i = (int)(arbM * .5), shift = 0; i >>= 1; arbM *= .5, ++shift); - preM = upsample || (arbM > 1.5 && arbM < 2); - postM = 1 + (arbM > 1 && preM), arbM /= postM; - preL = 1 + (!preM && arbM < 2) + (upsample && mode), arbM *= preL; - if ((frac = arbM - (int)arbM)) - epsilon = fabs((uint32_t)(frac * MULT32 + .5) / (frac * MULT32) - 1); - for (i = 1, rational = !frac; i <= maxL && !rational; ++i) { - d = frac * i, try = (int)(d + .5); - if ((rational = fabs(try / d - 1) <= epsilon)) { /* No long doubles! */ - if (try == i) - arbM = ceil(arbM), shift += arbM > 2, arbM /= 1 + (arbM > 2); - else arbM = i * (int)arbM + try, arbL = i; - } - } - L = preL * arbL, M = (int)(arbM * postM), x = (L|M)&1, L >>= !x, M >>= !x; - if (iOpt && postL == 1 && (d = preL * arbL / arbM) > 4 && d != 5) { - for (postL = 4, i = (int)(d / 16); (i >>= 1) && postL < 256; postL <<= 1); - arbM = arbM * postL / arbL / preL, arbL = 1, n = 0; - } else if (rational && (max(L, M) < 3 + 2 * iOpt || L * M < 6 * iOpt)) - preL = L, preM = M, arbM = arbL = postM = 1; - if (!mode && (!rational || !n)) - ++mode, n = 0; - } - - p->num_stages = shift + have_pre_stage + have_arb_stage + have_post_stage; - if (!p->num_stages && multiplier != 1) { - bits = arbL = 0; /* Use cubic_stage in this case. 
*/ - ++p->num_stages; - } - p->stages = calloc((size_t)p->num_stages + 1, sizeof(*p->stages)); - for (i = 0; i < p->num_stages; ++i) - p->stages[i].shared = shared; - - if ((n = p->num_stages) > 1) { /* Att. budget: */ - if (have_arb_stage) - att += linear_to_dB(2.), attArb = att, --n; - att += linear_to_dB((double)n); - } - - for (n = 0; (size_t)n + 1 < array_length(half_firs) && att > half_firs[n].att; ++n); - for (i = 0, s = p->stages; i < shift; ++i, ++s) { - s->type = half_stage; - s->fn = half_firs[n].fn; - s->pre_post = 4 * half_firs[n].num_coefs; - s->preload = s->pre = s->pre_post >> 1; - } - - if (have_pre_stage) { - if (maintain_3dB_pt && have_post_stage) { /* Trans. bands overlapping. */ - double tbw3 = tbw0 * TO_3dB(att); /* FFS: consider Fs_a. */ - double x = ((2.1429e-4 - 5.2083e-7 * att) * att - .015863) * att + 3.95; - x = att * pow((tbw0 - tbw3) / (postM / (factor * postL) - 1 + tbw0), x); - if (x > .035) { - tbw_tighten = ((4.3074e-3 - 3.9121e-4 * x) * x - .040009) * x + 1.0014; - lsx_debug("x=%g tbw_tighten=%g", x, tbw_tighten); - } - } - dft_stage_init(0, 1 - tbw0 * tbw_tighten, Fs_a, preM? max(preL, preM) : - arbM / arbL, att, phase, &pre_stage, preL, max(preM, 1), &multiplier, - log2_min_dft_size, log2_large_dft_size); - } - - if (!bits && have_arb_stage) { /* `Quick' cubic arb stage: */ - arb_stage.type = cubic_stage; - arb_stage.fn = cubic_stage_fn; - arb_stage.mult = multiplier, multiplier = 1; - arb_stage.step.whole = (int64_t)(arbM * MULT32 + .5); - arb_stage.pre_post = max(3, arb_stage.step.integer); - arb_stage.preload = arb_stage.pre = 1; - arb_stage.out_in_ratio = MULT32 / (double)arb_stage.step.whole; - } - else if (have_arb_stage) { /* Higher quality arb stage: */ - poly_fir_t const * f = &poly_firs[6*(upsample + !!preM) + mode - !upsample]; - int order, num_coefs = (int)f->interp[0].scalar, phase_bits, phases; - size_t coefs_size; - double x = .5, at, Fp, Fs, Fn, mult = upsample? 
1 : arbL / arbM; - poly_fir1_t const * f1; - - Fn = !upsample && preM? x = arbM / arbL : 1; - Fp = !preM? mult : mode? .5 : 1; - Fs = 2 - Fp; /* Ignore Fs_a; it would have little benefit here. */ - Fp *= 1 - tbw0; - if (rolloff > rolloff_small && mode) - Fp = !preM? mult * .5 - .125 : mult * .05 + .1; - else if (rolloff == rolloff_small) - Fp = Fs - (Fs - .148 * x - Fp * .852) * (.00813 * bits + .973); - - i = (interpolator < 0? !rational : max(interpolator, !rational)) - 1; - do { - f1 = &f->interp[++i]; - assert(f1->fn); - if (i) - arbM /= arbL, arbL = 1, rational = false; - phase_bits = (int)ceil(f1->scalar + log(mult)/log(2.)); - phases = !rational? (1 << phase_bits) : arbL; - if (!f->interp[0].scalar) { - int phases0 = max(phases, 19), n0 = 0; - lsx_design_lpf(Fp, Fs, -Fn, attArb, &n0, phases0, f->beta); - num_coefs = n0 / phases0 + 1, num_coefs += num_coefs & !preM; - } - if ((num_coefs & 1) && rational && (arbL & 1)) - phases <<= 1, arbL <<= 1, arbM *= 2; - at = arbL * (arb_stage.phase0 = .5 * (num_coefs & 1)); - order = i + (i && mode > 4); - coefs_size = (size_t)(num_coefs4 * phases * (order + 1)) * sizeof(sample_t); - } while (interpolator < 0 && i < 2 && f->interp[i+1].fn && - coefs_size / 1000 > max_coefs_size); - - if (!arb_stage.shared->poly_fir_coefs) { - int num_taps = num_coefs * phases - 1; - raw_coef_t * coefs = lsx_design_lpf( - Fp, Fs, Fn, attArb, &num_taps, phases, f->beta); - arb_stage.shared->poly_fir_coefs = prepare_coefs( - coefs, num_coefs, phases, order, multiplier); - lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%.3gk", - num_coefs, phases, order, (double)coefs_size / 1000.); - free(coefs); - } - multiplier = 1; - arb_stage.type = rational? 
rational_stage : irrational_stage; - arb_stage.fn = f1->fn; - arb_stage.pre_post = num_coefs4 - 1; - arb_stage.preload = ((num_coefs - 1) >> 1) + (num_coefs4 - num_coefs); - arb_stage.n = num_coefs4; - arb_stage.phase_bits = phase_bits; - arb_stage.L = arbL; - arb_stage.use_hi_prec_clock = mode > 1 && use_hi_prec_clock && !rational; -#if FLOAT_HI_PREC_CLOCK - if (arb_stage.use_hi_prec_clock) { - arb_stage.at.flt = at; - arb_stage.step.flt = arbM; - arb_stage.out_in_ratio = (double)(arbL / arb_stage.step.flt); - } else -#endif - { - arb_stage.at.whole = (int64_t)(at * MULT32 + .5); -#if !FLOAT_HI_PREC_CLOCK - if (arb_stage.use_hi_prec_clock) { - arb_stage.at.fix.ls.parts.ms = 0x80000000ul; - arbM *= MULT32; - arb_stage.step.whole = (int64_t)arbM; - arbM -= (double)arb_stage.step.whole; - arbM *= MULT32 * MULT32; - arb_stage.step.fix.ls.all = (uint64_t)arbM; - } else -#endif - arb_stage.step.whole = (int64_t)(arbM * MULT32 + .5); - arb_stage.out_in_ratio = MULT32 * arbL / (double)arb_stage.step.whole; - } - } - - if (have_post_stage) - dft_stage_init(1, 1 - (1 - (1 - tbw0) * - (upsample? 
factor * postL / postM : 1)) * tbw_tighten, Fs_a, - (double)max(postL, postM), att, phase, &post_stage, postL, postM, - &multiplier, log2_min_dft_size, log2_large_dft_size); - - - lsx_debug("%g: »%i⋅%i/%i⋅%i/%g⋅%i/%i", - 1/factor, shift, preL, preM, arbL, arbM, postL, postM); - for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) { - fifo_create(&s->fifo, (int)sizeof(sample_t)); - memset(fifo_reserve(&s->fifo, s->preload), 0, sizeof(sample_t) * (size_t)s->preload); - lsx_debug("%5i|%-5i preload=%i remL=%i o/i=%g", - s->pre, s->pre_post - s->pre, s->preload, s->at.integer, s->out_in_ratio); - } - fifo_create(&s->fifo, (int)sizeof(sample_t)); - return 0; -} - -static void rate_process(rate_t * p) -{ - stage_t * stage = p->stages; - int i; - for (i = 0; i < p->num_stages; ++i, ++stage) - stage->fn(stage, &(stage+1)->fifo); -} - -static sample_t * rate_input(rate_t * p, sample_t const * samples, size_t n) -{ - p->samples_in += n; - return fifo_write(&p->stages[0].fifo, (int)n, samples); -} - -static sample_t const * rate_output(rate_t * p, sample_t * samples, size_t * n) -{ - fifo_t * fifo = &p->stages[p->num_stages].fifo; - p->samples_out += *n = min(*n, (size_t)fifo_occupancy(fifo)); - return fifo_read(fifo, (int)*n, samples); -} - -static void rate_flush(rate_t * p) -{ - fifo_t * fifo = &p->stages[p->num_stages].fifo; -#if defined _MSC_VER && _MSC_VER == 1200 - uint64_t samples_out = (uint64_t)(int64_t)((double)(int64_t)p->samples_in / p->factor + .5); -#else - uint64_t samples_out = (uint64_t)((double)p->samples_in / p->factor + .5); -#endif - size_t remaining = (size_t)(samples_out - p->samples_out); - - if ((size_t)fifo_occupancy(fifo) < remaining) { - uint64_t samples_in = p->samples_in; - sample_t * buff = calloc(1024, sizeof(*buff)); - - while ((size_t)fifo_occupancy(fifo) < remaining) { - rate_input(p, buff, 1024); - rate_process(p); - } - fifo_trim_to(fifo, (int)remaining); - p->samples_in = samples_in; - free(buff); - } -} - -static void 
rate_close(rate_t * p) -{ - rate_shared_t * shared = p->stages[0].shared; - int i; - - for (i = 0; i <= p->num_stages; ++i) { - stage_t * s = &p->stages[i]; - aligned_free(s->dft_scratch); - aligned_free(s->dft_out); - fifo_delete(&s->fifo); - } - if (shared) { - for (i = 0; i < 2; ++i) { - dft_filter_t * f= &shared->dft_filter[i]; - aligned_free(f->coefs); - rdft_delete_setup(f->dft_forward_setup); - rdft_delete_setup(f->dft_backward_setup); - } - free(shared->poly_fir_coefs); - memset(shared, 0, sizeof(*shared)); - } - free(p->stages); -} - -#if defined SOXR_LIB -static double rate_delay(rate_t * p) -{ -#if defined _MSC_VER && _MSC_VER == 1200 - double samples_out = (double)(int64_t)p->samples_in / p->factor; - return max(0, samples_out - (double)(int64_t)p->samples_out); -#else - double samples_out = (double)p->samples_in / p->factor; - return max(0, samples_out - (double)p->samples_out); -#endif -} - -static void rate_sizes(size_t * shared, size_t * channel) -{ - *shared = sizeof(rate_shared_t); - *channel = sizeof(rate_t); -} - -#include "soxr.h" - -static char const * rate_create( - void * channel, - void * shared, - double io_ratio, - soxr_quality_spec_t * q_spec, - soxr_runtime_spec_t * r_spec, - double scale) -{ - return rate_init( - channel, shared, - io_ratio, - q_spec->precision, - q_spec->phase_response, - q_spec->passband_end, - q_spec->stopband_begin, - "\1\2\0"[q_spec->flags & 3], - !!(q_spec->flags & SOXR_MAINTAIN_3DB_PT), - scale, - !!(q_spec->flags & SOXR_HI_PREC_CLOCK), - (int)(r_spec->flags & 3) - 1, - r_spec->coef_size_kbytes, - !!(r_spec->flags & SOXR_NOSMALLINTOPT), - (int)r_spec->log2_min_dft_size, - (int)r_spec->log2_large_dft_size); -} - -static char const * id(void) -{ - return RATE_ID; -} - -fn_t RATE_CB[] = { - (fn_t)rate_input, - (fn_t)rate_process, - (fn_t)rate_output, - (fn_t)rate_flush, - (fn_t)rate_close, - (fn_t)rate_delay, - (fn_t)rate_sizes, - (fn_t)rate_create, - (fn_t)0, - (fn_t)id, -}; -#endif diff --git a/soxr/src/rate32.c 
b/soxr/src/rate32.c deleted file mode 100644 index d6dd3b9..0000000 --- a/soxr/src/rate32.c +++ /dev/null @@ -1,9 +0,0 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net - * Licence for this file: LGPL v2.1 See LICENCE for details. */ - -#define sample_t float -#define RATE_SIMD 0 -#define RDFT_CB _soxr_rdft32_cb -#define RATE_CB _soxr_rate32_cb -#define RATE_ID "single-precision" -#include "rate.h" diff --git a/soxr/src/rate32s.c b/soxr/src/rate32s.c deleted file mode 100644 index 26a371a..0000000 --- a/soxr/src/rate32s.c +++ /dev/null @@ -1,9 +0,0 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net - * Licence for this file: LGPL v2.1 See LICENCE for details. */ - -#define sample_t float -#define RATE_SIMD 1 -#define RDFT_CB _soxr_rdft32s_cb -#define RATE_CB _soxr_rate32s_cb -#define RATE_ID "single-precision-SIMD" -#include "rate.h" diff --git a/soxr/src/rate64.c b/soxr/src/rate64.c deleted file mode 100644 index 6289911..0000000 --- a/soxr/src/rate64.c +++ /dev/null @@ -1,9 +0,0 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net - * Licence for this file: LGPL v2.1 See LICENCE for details. */ - -#define sample_t double -#define RATE_SIMD 0 -#define RDFT_CB _soxr_rdft64_cb -#define RATE_CB _soxr_rate64_cb -#define RATE_ID "double-precision" -#include "rate.h" diff --git a/soxr/src/rdft.h b/soxr/src/rdft.h index 59ba174..4ecd247 100644 --- a/soxr/src/rdft.h +++ b/soxr/src/rdft.h @@ -1,9 +1,11 @@ /* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ -void ORDERED_CONVOLVE(int n, void * not_used, DFT_FLOAT * a, const DFT_FLOAT * b) +void ORDERED_CONVOLVE(int n, void * not_used, void * A, const void * B) { int i; + DFT_FLOAT* a = A; + const DFT_FLOAT* b = B; a[0] *= b[0]; a[1] *= b[1]; for (i = 2; i < n; i += 2) { @@ -14,9 +16,11 @@ void ORDERED_CONVOLVE(int n, void * not_used, DFT_FLOAT * a, const DFT_FLOAT * b (void)not_used; } -void ORDERED_PARTIAL_CONVOLVE(int n, DFT_FLOAT * a, const DFT_FLOAT * b) +void ORDERED_PARTIAL_CONVOLVE(int n, void * A, const void * B) { int i; + DFT_FLOAT* a = A; + const DFT_FLOAT* b = B; a[0] *= b[0]; for (i = 2; i < n; i += 2) { DFT_FLOAT tmp = a[i]; diff --git a/soxr/src/rdft_t.h b/soxr/src/rdft_t.h new file mode 100644 index 0000000..7e44134 --- /dev/null +++ b/soxr/src/rdft_t.h @@ -0,0 +1,40 @@ +/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +typedef struct { + void * (* forward_setup)(int); + void * (* backward_setup)(int); + void (* delete_setup)(void *); + void (* forward)(int, void *, void *, void *); + void (* oforward)(int, void *, void *, void *); + void (* backward)(int, void *, void *, void *); + void (* obackward)(int, void *, void *, void *); + void (* convolve)(int, void *, void *, void const *); + void (* convolve_portion)(int, void *, void const *); + int (* multiplier)(void); + void (* reorder_back)(int, void *, void *, void *); + void * (* malloc)(size_t); + void * (* calloc)(size_t, size_t); + void (* free)(void *); + int (* flags)(void); +} rdft_cb_table; + +#define rdft_forward_setup RDFT_CB->forward_setup +#define rdft_backward_setup RDFT_CB->backward_setup +#define rdft_delete_setup RDFT_CB->delete_setup +#define rdft_forward RDFT_CB->forward +#define rdft_oforward RDFT_CB->oforward +#define rdft_backward RDFT_CB->backward +#define rdft_obackward RDFT_CB->obackward +#define rdft_convolve RDFT_CB->convolve +#define rdft_convolve_portion RDFT_CB->convolve_portion 
+#define rdft_multiplier RDFT_CB->multiplier +#define rdft_reorder_back RDFT_CB->reorder_back +#define rdft_malloc RDFT_CB->malloc +#define rdft_calloc RDFT_CB->calloc +#define rdft_free RDFT_CB->free +#define rdft_flags RDFT_CB->flags + +/* Flag templates: */ +#define RDFT_IS_SIMD 1 +#define RDFT_NEEDS_SCRATCH 2 diff --git a/soxr/src/rint-clip.h b/soxr/src/rint-clip.h index 06764a8..bfb6458 100644 --- a/soxr/src/rint-clip.h +++ b/soxr/src/rint-clip.h @@ -1,9 +1,9 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ #if defined DITHER -#define DITHERING (1./32)*(int)(((ran1>>=3)&31)-((ran2>>=3)&31)) +#define DITHERING + (1./32)*(int)(((ran1>>=3)&31)-((ran2>>=3)&31)) #define DITHER_RAND (seed = 1664525UL * seed + 1013904223UL) >> 3 #define DITHER_VARS unsigned long ran1 = DITHER_RAND, ran2 = DITHER_RAND #define SEED_ARG , unsigned long * seed0 @@ -12,10 +12,11 @@ #define COPY_SEED1 unsigned long seed1 = seed #define PASS_SEED1 , &seed1 #define PASS_SEED , &seed +#define FLOATD double #else -#define DITHERING 0 +#define DITHERING #define DITHER_VARS #define SEED_ARG #define SAVE_SEED @@ -23,9 +24,12 @@ #define COPY_SEED1 #define PASS_SEED1 #define PASS_SEED +#define FLOATD FLOATX #endif +#define DO_16 _;_;_;_;_;_;_;_;_;_;_;_;_;_;_;_ + #if defined FE_INVALID && defined FPU_RINT @@ -35,8 +39,8 @@ static void RINT_CLIP(RINT_T * const dest, FLOATX const * const src, COPY_SEED DITHER_VARS; for (; i < n; ++i) { - double d = src[i] + DITHERING; - dest[stride * i] = RINT(d); + FLOATD const d = src[i] DITHERING; + RINT(dest[stride * i], d); if (fe_test_invalid()) { fe_clear_invalid(); dest[stride * i] = d > 0? 
RINT_MAX : -RINT_MAX - 1; @@ -56,29 +60,29 @@ static size_t LSX_RINT_CLIP(void * * const dest0, FLOATX const * const src, RINT_T * dest = *dest0; COPY_SEED #if defined FE_INVALID && defined FPU_RINT -#define _ dest[i] = RINT(src[i] + DITHERING), ++i, - fe_clear_invalid(); - for (i = 0; i < (n & ~7u);) { +#define _ RINT(dest[i], src[i] DITHERING); ++i + for (i = 0; i < (n & ~15u);) { COPY_SEED1; DITHER_VARS; - _ _ _ _ _ _ _ _ (void)0; + DO_16; if (fe_test_invalid()) { fe_clear_invalid(); - RINT_CLIP(dest, src, 1, i - 8, i, &clips PASS_SEED1); + RINT_CLIP(dest, src, 1, i - 16, i, &clips PASS_SEED1); } } RINT_CLIP(dest, src, 1, i, n, &clips PASS_SEED); #else -#define _ d = src[i] + DITHERING, dest[i++] = (RINT_T)(d > 0? d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5), +#define _ d = src[i] DITHERING, dest[i++] = (RINT_T)(d > 0? \ + d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5) const double N = 1. + RINT_MAX; double d; - for (i = 0; i < (n & ~7u);) { + for (i = 0; i < (n & ~15u);) { DITHER_VARS; - _ _ _ _ _ _ _ _ (void)0; + DO_16; } { DITHER_VARS; - for (; i < n; _ (void)0); + for (; i < n; _); } #endif SAVE_SEED; @@ -97,34 +101,34 @@ static size_t LSX_RINT_CLIP_2(void * * dest0, FLOATX const * const * srcs, RINT_T * dest = *dest0; COPY_SEED #if defined FE_INVALID && defined FPU_RINT -#define _ dest[stride * i] = RINT(src[i] + DITHERING), ++i, - fe_clear_invalid(); +#define _ RINT(dest[stride * i], src[i] DITHERING); ++i for (j = 0; j < stride; ++j, ++dest) { FLOATX const * const src = srcs[j]; - for (i = 0; i < (n & ~7u);) { + for (i = 0; i < (n & ~15u);) { COPY_SEED1; DITHER_VARS; - _ _ _ _ _ _ _ _ (void)0; + DO_16; if (fe_test_invalid()) { fe_clear_invalid(); - RINT_CLIP(dest, src, stride, i - 8, i, &clips PASS_SEED1); + RINT_CLIP(dest, src, stride, i - 16, i, &clips PASS_SEED1); } } RINT_CLIP(dest, src, stride, i, n, &clips PASS_SEED); } #else -#define _ d = src[i] + DITHERING, dest[stride * i++] = (RINT_T)(d > 0? d+.5 >= N? 
++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5), +#define _ d = src[i] DITHERING, dest[stride * i++] = (RINT_T)(d > 0? \ + d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5) const double N = 1. + RINT_MAX; double d; for (j = 0; j < stride; ++j, ++dest) { FLOATX const * const src = srcs[j]; - for (i = 0; i < (n & ~7u);) { + for (i = 0; i < (n & ~15u);) { DITHER_VARS; - _ _ _ _ _ _ _ _ (void)0; + DO_16; } { DITHER_VARS; - for (; i < n; _ (void)0); + for (; i < n; _); } } #endif @@ -134,6 +138,7 @@ static size_t LSX_RINT_CLIP_2(void * * dest0, FLOATX const * const * srcs, } #undef _ +#undef FLOATD #undef PASS_SEED #undef PASS_SEED1 #undef COPY_SEED1 diff --git a/soxr/src/rint.h b/soxr/src/rint.h index 529e4bb..2f1dfbe 100644 --- a/soxr/src/rint.h +++ b/soxr/src/rint.h @@ -1,68 +1,102 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ #if !defined soxr_rint_included #define soxr_rint_included -#include "soxr-config.h" - - - -#if HAVE_LRINT && LONG_MAX == 2147483647L - #include - #define FPU_RINT32 - #define rint32 lrint -#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__) - #define FPU_RINT32 - static __inline int32_t rint32(double input) { - int32_t result; - __asm__ __volatile__("fistpl %0": "=m"(result): "t"(input): "st"); - return result; - } -#elif defined __GNUC__ && defined __arm__ - #define FPU_RINT32 - static __inline int32_t rint32(double input) { - register int32_t result; - __asm__ __volatile__ ("ftosid %0, %P1": "=w"(result): "w"(input)); - return result; - } -#elif defined _MSC_VER && defined _M_IX86 /* FIXME need solution for MSVC x64 */ - #define FPU_RINT32 - static __inline int32_t rint32(double input) { - int32_t result; - _asm { - fld input - fistp result - } - return result; - } -#else - #define rint32(x) (int32_t)((x) < 0? 
x - .5 : x + .5) -#endif - +#include "std-types.h" +/* For x86, compiler-supplied versions of these functions (where available) + * can have poor performance (e.g. mingw32), so prefer these asm versions: */ #if defined __GNUC__ && (defined __i386__ || defined __x86_64__) + #define FPU_RINT32 #define FPU_RINT16 - static __inline int16_t rint16(double input) { - int16_t result; - __asm__ __volatile__("fistps %0": "=m"(result): "t"(input): "st"); - return result; + #define rint32D(a,b) __asm__ __volatile__("fistpl %0": "=m"(a): "t"(b): "st") + #define rint16D(a,b) __asm__ __volatile__("fistps %0": "=m"(a): "t"(b): "st") + #define rint32F rint32D + #define rint16F rint16D + #define FE_INVALID 1 + static __inline int fe_test_invalid(void) { + int status_word; + __asm__ __volatile__("fnstsw %%ax": "=a"(status_word)); + return status_word & FE_INVALID; } -#elif defined _MSC_VER && defined _M_IX86 /* FIXME need solution for MSVC x64 */ + static __inline int fe_clear_invalid(void) { + int32_t status[7]; + __asm__ __volatile__("fnstenv %0": "=m"(status)); + status[1] &= ~FE_INVALID; + __asm__ __volatile__("fldenv %0": : "m"(*status)); + return 0; + } +#elif defined _MSC_VER && defined _M_IX86 + #define FPU_RINT32 #define FPU_RINT16 - static __inline int16_t rint16(double input) { - int16_t result; - _asm { - fld input - fistp result - } - return result; + #define rint_fn(N,Y,X) \ + static __inline void N(Y *y, X x) {Y t; {__asm fld x __asm fistp t} *y=t;} + rint_fn(rint32d, int32_t, double) + rint_fn(rint32f, int32_t, float ) + rint_fn(rint16d, int16_t, double) + rint_fn(rint16f, int16_t, float ) + #define rint32D(y,x) rint32d(&(y),x) + #define rint32F(y,x) rint32f(&(y),x) + #define rint16D(y,x) rint16d(&(y),x) + #define rint16F(y,x) rint16f(&(y),x) + #define FE_INVALID 1 + static __inline int fe_test_invalid(void) { + short status_word; + __asm fnstsw status_word + return status_word & FE_INVALID; } -#else - #define rint16(x) (int16_t)((x) < 0? 
x - .5 : x + .5) + static __inline int fe_clear_invalid(void) { + int32_t status[7]; + __asm fnstenv status + status[1] &= ~FE_INVALID; + __asm fldenv status + return 0; + } +#elif defined _MSC_VER && defined _M_X64 + #include + #include + #define FPU_RINT32 + #define FPU_RINT16 + static __inline void rint32d(int32_t *y, double x) { + *y = _mm_cvtsd_si32(_mm_load_sd(&x));} + static __inline void rint32f(int32_t *y, float x) { + *y = _mm_cvtss_si32(_mm_load_ss(&x));} + static __inline void rint16d(int16_t *y, double x) { + x = x*65536+32738; *y = (int16_t)(_mm_cvtsd_si32(_mm_load_sd(&x)) >> 16);} + #define rint32D(y,x) rint32d(&(y),x) + #define rint32F(y,x) rint32f(&(y),x) + #define rint16D(y,x) rint16d(&(y),x) + #define rint16F(y,x) rint16d(&(y),(double)(x)) + #define FE_INVALID 1 + #define fe_test_invalid() (_statusfp() & _SW_INVALID) + #define fe_clear_invalid _clearfp /* Note: clears all. */ +#elif HAVE_LRINT && LONG_MAX == 2147483647L && HAVE_FENV_H + #include + #include + #define FPU_RINT32 + #define rint32D(y,x) ((y)=lrint(x)) + #define rint32F(y,x) ((y)=lrintf(x)) + #define fe_test_invalid() fetestexcept(FE_INVALID) + #define fe_clear_invalid() feclearexcept(FE_INVALID) #endif +#if !defined FPU_RINT32 + #define rint32D(y,x) ((y)=(int32_t)((x) < 0? x - .5 : x + .5)) + #define rint32F(y,x) rint32D(y,(double)(x)) +#endif +#if !defined FPU_RINT16 + #define rint16D(y,x) ((y)=(int16_t)((x) < 0? 
x - .5 : x + .5)) + #define rint16F(y,x) rint16D(y,(double)(x)) +#endif + +static __inline int32_t rint32(double input) { + int32_t result; rint32D(result, input); return result;} + +static __inline int16_t rint16(double input) { + int16_t result; rint16D(result, input); return result;} #endif diff --git a/soxr/src/simd-dev.h b/soxr/src/simd-dev.h deleted file mode 100644 index 019325c..0000000 --- a/soxr/src/simd-dev.h +++ /dev/null @@ -1,5 +0,0 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net - * Licence for this file: LGPL v2.1 See LICENCE for details. */ - -#define PFFT_MACROS_ONLY -#include "pffft.c" diff --git a/soxr/src/simd.h b/soxr/src/simd.h deleted file mode 100644 index 71eefc6..0000000 --- a/soxr/src/simd.h +++ /dev/null @@ -1,16 +0,0 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net - * Licence for this file: LGPL v2.1 See LICENCE for details. */ - -#if !defined simd_included -#define simd_included - -#include - -void * _soxr_simd_aligned_malloc(size_t); -void * _soxr_simd_aligned_calloc(size_t, size_t); -void _soxr_simd_aligned_free(void *); - -void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float * b); -void _soxr_ordered_partial_convolve_simd(int n, float * a, const float * b); - -#endif diff --git a/soxr/src/soxr-lsr.c b/soxr/src/soxr-lsr.c new file mode 100644 index 0000000..58ab50a --- /dev/null +++ b/soxr/src/soxr-lsr.c @@ -0,0 +1,198 @@ +/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +/* Wrapper mostly compatible with `libsamplerate'. 
*/ + +#include +#include +#include "soxr.h" +#include "soxr-lsr.h" +#include "rint.h" + + + +SRC_STATE *src_new(SRC_SRCTYPE id, int channels, SRC_ERROR * error) +{ + return src_callback_new(0, id, channels, error, 0); +} + + + +SRC_ERROR src_process(SRC_STATE *p, SRC_DATA * io) +{ + size_t idone , odone; + + if (!p || !io) return -1; + + soxr_set_error( + p, soxr_set_io_ratio(p, 1/io->src_ratio, (size_t)io->output_frames)); + + soxr_process(p, io->data_in, /* hack: */ + (size_t)(io->end_of_input? ~io->input_frames : io->input_frames), + &idone, io->data_out, (size_t)io->output_frames, &odone); + + io->input_frames_used = (long)idone, io->output_frames_gen = (long)odone; + return -!!soxr_error(p); +} + + + +SRC_ERROR src_set_ratio(SRC_STATE * p, double oi_ratio) +{ + return -!!soxr_set_io_ratio(p, 1/oi_ratio, 0); +} + + + +SRC_ERROR src_reset(SRC_STATE * p) +{ + return -!!soxr_clear(p); +} + + + +SRC_ERROR src_error(SRC_STATE * p) +{ + return -!!soxr_error(p); +} + + + +SRC_STATE * src_delete(SRC_STATE * p) +{ + soxr_delete(p); + return 0; +} + + + +SRC_STATE *src_callback_new(src_callback_t fn, + SRC_SRCTYPE id, int channels, SRC_ERROR * error0, void * p) +{ + soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + (unsigned)id, 0); + char const * e = getenv("SOXR_LSR_NUM_THREADS"); + soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1)); + soxr_error_t error; + soxr_t soxr = 0; + + assert (channels > 0); + soxr = soxr_create(0, 0, (unsigned)channels, &error, 0, &q_spec, &r_spec); + + if (soxr) + error = soxr_set_input_fn(soxr, (soxr_input_fn_t)fn, p, 0); + + if (error0) + *error0 = -!!error; + + return soxr; +} + + + +long src_callback_read(SRC_STATE *p, double oi_ratio, long olen, float * obuf) +{ + if (!p || olen < 0) return -1; + + soxr_set_error(p, soxr_set_io_ratio(p, 1/oi_ratio, (size_t)olen)); + return (long)soxr_output(p, obuf, (size_t)olen); +} + + + +SRC_ERROR src_simple(SRC_DATA * io, SRC_SRCTYPE id, int channels) +{ + size_t idone, 
odone; + soxr_error_t error; + soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + (unsigned)id, 0); + char const * e = getenv("SOXR_LSR_NUM_THREADS"); + soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1)); + + if (!io || channels<=0 || io->input_frames<0 || io->output_frames<0) return-1; + + error = soxr_oneshot(1, io->src_ratio, (unsigned)channels, io->data_in, + (size_t)io->input_frames, &idone, io->data_out, (size_t)io->output_frames, + &odone, 0, &q_spec, &r_spec); + + io->input_frames_used = (long)idone, io->output_frames_gen = (long)odone; + + return -!!error; +} + + + +char const * src_get_name(SRC_SRCTYPE id) +{ + static char const * const names[] = { + "LSR best sinc", "LSR medium sinc", "LSR fastest sinc", + "LSR ZOH", "LSR linear", "SoX VHQ"}; + + return (unsigned)id < 5u + !getenv("SOXR_LSR_STRICT")? names[id] : 0; +} + + + +char const * src_get_description(SRC_SRCTYPE id) +{ + return src_get_name(id); +} + + + +char const * src_get_version(void) +{ + return soxr_version(); +} + + + +char const * src_strerror(SRC_ERROR error) +{ + return error == 1? "Placeholder." : error ? "soxr error" : soxr_strerror(0); +} + + + +int src_is_valid_ratio(double oi_ratio) +{ + return getenv("SOXR_LSR_STRICT")? + oi_ratio >= 1./256 && oi_ratio <= 256 : oi_ratio > 0; +} + + + +void src_short_to_float_array(short const * src, float * dest, int len) +{ + assert (src && dest); + + while (len--) dest[len] = (float)(src[len] * (1 / (1. + SHRT_MAX))); +} + + + +void src_float_to_short_array(float const * src, short * dest, int len) +{ + double d, N = 1. + SHRT_MAX; + assert (src && dest); + + while (len--) d = src[len] * N, dest[len] = + (short)(d > N - 1? (short)(N - 1) : d < -N? (short)-N : rint16(d)); +} + + + +void src_int_to_float_array(int const * src, float * dest, int len) +{ + assert (src && dest); + while (len--) dest[len] = (float)(src[len] * (1 / (32768. 
* 65536.))); +} + + + +void src_float_to_int_array(float const * src, int * dest, int len) +{ + double d, N = 32768. * 65536.; /* N.B. int32, not int! (Also above fn.) */ + assert (src && dest); + + while (len--) d = src[len] * N, dest[len] = + d >= N - 1? (int)(N - 1) : d < -N? (int)(-N) : rint32(d); +} diff --git a/soxr/src/soxr-lsr.h b/soxr/src/soxr-lsr.h index c0923aa..b1cc247 100644 --- a/soxr/src/soxr-lsr.h +++ b/soxr/src/soxr-lsr.h @@ -1,4 +1,4 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by @@ -37,13 +37,12 @@ #endif typedef float SRC_SAMPLE; -#if !defined SOXR_LIB enum SRC_SRCTYPE_e {SRC_SINC_BEST_QUALITY, SRC_SINC_MEDIUM_QUALITY, SRC_SINC_FASTEST, SRC_ZERO_ORDER_HOLD, SRC_LINEAR}; typedef int SRC_SRCTYPE; typedef int SRC_ERROR; typedef long (* src_callback_t)(void *, SRC_SAMPLE * *); -typedef struct SRC_STATE SRC_STATE; +typedef struct soxr SRC_STATE; typedef struct SRC_DATA { SRC_SAMPLE * data_in, * data_out; long input_frames, output_frames; @@ -51,7 +50,6 @@ typedef struct SRC_DATA { int end_of_input; double src_ratio; } SRC_DATA; -#endif SOXR SRC_STATE * src_new(SRC_SRCTYPE, int num_channels, SRC_ERROR *); SOXR SRC_ERROR src_process (SRC_STATE *, SRC_DATA *); SOXR SRC_ERROR src_set_ratio(SRC_STATE *, double); diff --git a/soxr/src/soxr.c b/soxr/src/soxr.c index 5acace1..cdbfb9a 100644 --- a/soxr/src/soxr.c +++ b/soxr/src/soxr.c @@ -1,4 +1,4 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ #include @@ -10,6 +10,30 @@ #include "data-io.h" #include "internal.h" +#if AVUTIL_FOUND + #include +#endif + + + +#if WITH_DEV_TRACE + +#include +#include + +int _soxr_trace_level; + +void _soxr_trace(char const * fmt, ...) +{ + va_list args; + va_start(args, fmt); + vfprintf(stderr, fmt, args); + fputc('\n', stderr); + va_end(args); +} + +#endif + char const * soxr_version(void) @@ -19,21 +43,9 @@ char const * soxr_version(void) +#include "cb_t.h" + typedef void sample_t; /* float or double */ -typedef void (* fn_t)(void); -typedef fn_t control_block_t[10]; - -#define resampler_input (*(sample_t * (*)(void *, sample_t * samples, size_t n))p->control_block[0]) -#define resampler_process (*(void (*)(void *, size_t))p->control_block[1]) -#define resampler_output (*(sample_t const * (*)(void *, sample_t * samples, size_t * n))p->control_block[2]) -#define resampler_flush (*(void (*)(void *))p->control_block[3]) -#define resampler_close (*(void (*)(void *))p->control_block[4]) -#define resampler_delay (*(double (*)(void *))p->control_block[5]) -#define resampler_sizes (*(void (*)(size_t * shared, size_t * channel))p->control_block[6]) -#define resampler_create (*(char const * (*)(void * channel, void * shared, double io_ratio, soxr_quality_spec_t * q_spec, soxr_runtime_spec_t * r_spec, double scale))p->control_block[7]) -#define resampler_set_io_ratio (*(void (*)(void *, double io_ratio, size_t len))p->control_block[8]) -#define resampler_id (*(char const * (*)(void))p->control_block[9]) - typedef void * resampler_t; /* For one channel. */ typedef void * resampler_shared_t; /* Between channels. */ typedef void (* deinterleave_t)(sample_t * * dest, @@ -67,45 +79,52 @@ struct soxr { -#define RESET_ON_CLEAR (1u<<31) +#if WITH_CR32 || WITH_CR32S || WITH_CR64 || WITH_CR64S + #include "filter.h" +#else + #define lsx_to_3dB(x) ((x)/(x)) +#endif + -/* TODO: these should not be here. */ -#define TO_3dB(a) ((1.6e-6*a-7.5e-4)*a+.646) -#define LOW_Q_BW0 (1385 / 2048.) 
/* 0.67625 rounded to be a FP exact. */ soxr_quality_spec_t soxr_quality_spec(unsigned long recipe, unsigned long flags) { soxr_quality_spec_t spec, * p = &spec; - unsigned quality = recipe & 0xf; + unsigned q = recipe & 0xf; /* TODO: move to soxr-lsr.c: */ + unsigned quality = q > SOXR_LSR2Q+2? SOXR_VHQ : q > SOXR_LSR2Q? SOXR_QQ : q; double rej; memset(p, 0, sizeof(*p)); - if (quality > 13) { + if (quality > SOXR_PRECISIONQ) { p->e = "invalid quality type"; return spec; } - flags |= quality < SOXR_LSR0Q? RESET_ON_CLEAR : 0; - if (quality == 13) - quality = 6; - else if (quality > 10) - quality = 0; - p->phase_response = "\62\31\144"[(recipe & 0x30) >> 4]; + flags |= quality < SOXR_LSR0Q ? RESET_ON_CLEAR : 0; + p->phase_response = "\62\31\144"[(recipe & 0x30)>>4]; p->stopband_begin = 1; - p->precision = !quality? 0: quality < 3? 16 : quality < 8? 4 + quality * 4 : 55 - quality * 4; + p->precision = + quality == SOXR_QQ ? 0 : + quality <= SOXR_16_BITQ ? 16 : + quality <= SOXR_32_BITQ ? 4 + quality * 4 : + quality <= SOXR_LSR2Q ? 55 - quality * 4 : /* TODO: move to soxr-lsr.c */ + 0; rej = p->precision * linear_to_dB(2.); p->flags = flags; - if (quality < 8) { - p->passband_end = quality == 1? LOW_Q_BW0 : 1 - .05 / TO_3dB(rej); + if (quality <= SOXR_32_BITQ || quality == SOXR_PRECISIONQ) { + #define LOW_Q_BW0 (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */ + p->passband_end = quality == 1? 
LOW_Q_BW0 : 1 - .05 / lsx_to_3dB(rej); if (quality <= 2) p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM; } - else { + else { /* TODO: move to soxr-lsr.c */ static float const bw[] = {.931f, .832f, .663f}; - p->passband_end = bw[quality - 8]; - if (quality - 8 == 2) - p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM; + p->passband_end = bw[quality - SOXR_LSR0Q]; + if (quality == SOXR_LSR2Q) { + p->flags &= ~SOXR_ROLLOFF_NONE; + p->flags |= SOXR_ROLLOFF_LSR2Q | SOXR_PROMOTE_TO_LQ; + } } if (recipe & SOXR_STEEP_FILTER) - p->passband_end = 1 - .01 / TO_3dB(rej); + p->passband_end = 1 - .01 / lsx_to_3dB(rej); return spec; } @@ -163,39 +182,165 @@ soxr_io_spec_t soxr_io_spec( -#if HAVE_SIMD -static bool cpu_has_simd(void) -{ -#if defined __x86_64__ || defined _M_X64 - return true; -#elif defined __GNUC__ && defined i386 - uint32_t eax, ebx, ecx, edx; - __asm__ __volatile__ ( - "pushl %%ebx \n\t" - "cpuid \n\t" - "movl %%ebx, %1\n\t" - "popl %%ebx \n\t" - : "=a"(eax), "=r"(ebx), "=c"(ecx), "=d"(edx) - : "a"(1) - : "cc" ); - return !!(edx & 0x06000000); -#elif defined _MSC_VER && defined _M_IX86 - uint32_t d; - __asm { - xor eax, eax - inc eax - push ebx - cpuid - pop ebx - mov d, edx - } - return !!(d & 0x06000000); -#endif - return false; -} +#if (WITH_CR32S && WITH_CR32) || (WITH_CR64S && WITH_CR64) + #if defined __GNUC__ && defined __x86_64__ + #define CPUID(type, eax_, ebx_, ecx_, edx_) \ + __asm__ __volatile__ ( \ + "cpuid \n\t" \ + : "=a" (eax_), "=b" (ebx_), "=c" (ecx_), "=d" (edx_) \ + : "a" (type), "c" (0)); + #elif defined __GNUC__ && defined __i386__ + #define CPUID(type, eax_, ebx_, ecx_, edx_) \ + __asm__ __volatile__ ( \ + "mov %%ebx, %%edi \n\t" \ + "cpuid \n\t" \ + "xchg %%edi, %%ebx \n\t" \ + : "=a" (eax_), "=D" (ebx_), "=c" (ecx_), "=d" (edx_) \ + : "a" (type), "c" (0)); + #elif defined _M_X64 && defined _MSC_VER && _MSC_VER > 1500 + void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue); + #pragma 
intrinsic(__cpuidex) + #define CPUID(type, eax_, ebx_, ecx_, edx_) do { \ + int regs[4]; \ + __cpuidex(regs, type, 0); \ + eax_ = regs[0], ebx_ = regs[1], ecx_ = regs[2], edx_ = regs[3]; \ + } while(0) + #elif defined _M_X64 && defined _MSC_VER + void __cpuidex(int CPUInfo[4], int info_type); + #pragma intrinsic(__cpuidex) + #define CPUID(type, eax_, ebx_, ecx_, edx_) do { \ + int regs[4]; \ + __cpuidex(regs, type); \ + eax_ = regs[0], ebx_ = regs[1], ecx_ = regs[2], edx_ = regs[3]; \ + } while(0) + #elif defined _M_IX86 && defined _MSC_VER + #define CPUID(type, eax_, ebx_, ecx_, edx_) \ + __asm pushad \ + __asm mov eax, type \ + __asm xor ecx, ecx \ + __asm cpuid \ + __asm mov eax_, eax \ + __asm mov ebx_, ebx \ + __asm mov ecx_, ecx \ + __asm mov edx_, edx \ + __asm popad + #endif #endif -extern control_block_t _soxr_rate32s_cb, _soxr_rate32_cb, _soxr_rate64_cb, _soxr_vr32_cb; + + +#if WITH_CR32S && WITH_CR32 + static bool cpu_has_simd32(void) + { + #if defined __x86_64__ || defined _M_X64 + return true; + #elif defined __i386__ || defined _M_IX86 + enum {SSE = 1 << 25, SSE2 = 1 << 26}; + unsigned eax_, ebx_, ecx_, edx_; + CPUID(1, eax_, ebx_, ecx_, edx_); + return (edx_ & (SSE|SSE2)) != 0; + #elif defined AV_CPU_FLAG_NEON + return !!(av_get_cpu_flags() & AV_CPU_FLAG_NEON); + #else + return false; + #endif + } + + static bool should_use_simd32(void) + { + char const * e; + return ((e = getenv("SOXR_USE_SIMD" )))? !!atoi(e) : + ((e = getenv("SOXR_USE_SIMD32")))? 
!!atoi(e) : cpu_has_simd32(); + } +#else + #define should_use_simd32() true +#endif + + + +#if WITH_CR64S && WITH_CR64 + #if defined __GNUC__ + #define XGETBV(type, eax_, edx_) \ + __asm__ __volatile__ ( \ + ".byte 0x0f, 0x01, 0xd0\n" \ + : "=a"(eax_), "=d"(edx_) : "c" (type)); + #elif defined _M_X64 && defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219 + #include + #define XGETBV(type, eax_, edx_) do { \ + union {uint64_t x; uint32_t y[2];} a = {_xgetbv(0)}; \ + eax_ = a.y[0], edx_ = a.y[1]; \ + } while(0) + #elif defined _M_IX86 && defined _MSC_VER + #define XGETBV(type, eax_, edx_) \ + __asm pushad \ + __asm mov ecx, type \ + __asm _emit 0x0f \ + __asm _emit 0x01 \ + __asm _emit 0xd0 \ + __asm mov eax_, eax \ + __asm mov edx_, edx \ + __asm popad + #else + #define XGETBV(type, eax_, edx_) eax_ = edx_ = 0 + #endif + + static bool cpu_has_simd64(void) + { + enum {OSXSAVE = 1 << 27, AVX = 1 << 28}; + unsigned eax_, ebx_, ecx_, edx_; + CPUID(1, eax_, ebx_, ecx_, edx_); + if ((ecx_ & (OSXSAVE|AVX)) == (OSXSAVE|AVX)) { + XGETBV(0, eax_, edx_); + return (eax_ & 6) == 6; + } + return false; + } + + static bool should_use_simd64(void) + { + char const * e; + return ((e = getenv("SOXR_USE_SIMD" )))? !!atoi(e) : + ((e = getenv("SOXR_USE_SIMD64")))? 
!!atoi(e) : cpu_has_simd64(); + } +#else + #define should_use_simd64() true +#endif + + + +extern control_block_t + _soxr_rate32_cb, + _soxr_rate32s_cb, + _soxr_rate64_cb, + _soxr_rate64s_cb, + _soxr_vr32_cb; + + + +static void runtime_num(char const * env_name, + int min, int max, unsigned * field) +{ + char const * e = getenv(env_name); + if (e) { + int i = atoi(e); + if (i >= min && i <= max) + *field = (unsigned)i; + } +} + + + +static void runtime_flag(char const * env_name, + unsigned n_bits, unsigned n_shift, unsigned long * flags) +{ + char const * e = getenv(env_name); + if (e) { + int i = atoi(e); + unsigned long mask = (1UL << n_bits) - 1; + if (i >= 0 && i <= (int)mask) + *flags &= ~(mask << n_shift), *flags |= ((unsigned long)i << n_shift); + } +} @@ -207,11 +352,30 @@ soxr_t soxr_create( soxr_quality_spec_t const * q_spec, soxr_runtime_spec_t const * runtime_spec) { - double io_ratio = output_rate? input_rate? input_rate / output_rate : -1 : input_rate? -1 : 0; + double io_ratio = output_rate!=0? input_rate!=0? + input_rate / output_rate : -1 : input_rate!=0? -1 : 0; static const float datatype_full_scale[] = {1, 1, 65536.*32768, 32768}; soxr_t p = 0; soxr_error_t error = 0; +#if WITH_DEV_TRACE +#define _(x) (char)(sizeof(x)>=10? 'a'+(char)(sizeof(x)-10):'0'+(char)sizeof(x)) + char const * e = getenv("SOXR_TRACE"); + _soxr_trace_level = e? atoi(e) : 0; + { + static char const arch[] = {_(char), _(short), _(int), _(long), _(long long) + , ' ', _(float), _(double), _(long double) + , ' ', _(int *), _(int (*)(int)) + , ' ', HAVE_BIGENDIAN ? 
'B' : 'L' +#if defined _OPENMP + , ' ', 'O', 'M', 'P' +#endif + , 0}; +#undef _ + lsx_debug("arch: %s", arch); + } +#endif + if (q_spec && q_spec->e) error = q_spec->e; else if (io_spec && (io_spec->itype | io_spec->otype) >= SOXR_SPLIT * 2) error = "invalid io datatype(s)"; @@ -219,6 +383,8 @@ soxr_t soxr_create( if (!error && !(p = calloc(sizeof(*p), 1))) error = "malloc failed"; if (p) { + control_block_t * control_block; + p->q_spec = q_spec? *q_spec : soxr_quality_spec(SOXR_HQ, 0); if (q_spec) { /* Backwards compatibility with original API: */ @@ -236,35 +402,59 @@ soxr_t soxr_create( p->io_spec.scale = 1; p->runtime_spec = runtime_spec? *runtime_spec : soxr_runtime_spec(1); + + runtime_num("SOXR_MIN_DFT_SIZE", 8, 15, &p->runtime_spec.log2_min_dft_size); + runtime_num("SOXR_LARGE_DFT_SIZE", 8, 20, &p->runtime_spec.log2_large_dft_size); + runtime_num("SOXR_COEFS_SIZE", 100, 800, &p->runtime_spec.coef_size_kbytes); + runtime_num("SOXR_NUM_THREADS", 0, 64, &p->runtime_spec.num_threads); + runtime_flag("SOXR_COEF_INTERP", 2, 0, &p->runtime_spec.flags); + + runtime_flag("SOXR_STRICT_BUF", 1, 2, &p->runtime_spec.flags); + runtime_flag("SOXR_NOSMALLINTOPT", 1, 3, &p->runtime_spec.flags); + p->io_spec.scale *= datatype_full_scale[p->io_spec.otype & 3] / datatype_full_scale[p->io_spec.itype & 3]; + p->seed = (unsigned long)time(0) ^ (unsigned long)(size_t)p; -#if HAVE_SINGLE_PRECISION - if (!HAVE_DOUBLE_PRECISION || (p->q_spec.precision <= 20 && !(p->q_spec.flags & SOXR_DOUBLE_PRECISION)) - || (p->q_spec.flags & SOXR_VR)) { +#if WITH_CR32 || WITH_CR32S || WITH_VR32 + if (0 +#if WITH_VR32 + || ((!WITH_CR32 && !WITH_CR32S) || (p->q_spec.flags & SOXR_VR)) +#endif +#if WITH_CR32 || WITH_CR32S + || !(WITH_CR64 || WITH_CR64S) || (p->q_spec.precision <= 20 && !(p->q_spec.flags & SOXR_DOUBLE_PRECISION)) +#endif + ) { p->deinterleave = (deinterleave_t)_soxr_deinterleave_f; p->interleave = (interleave_t)_soxr_interleave_f; - memcpy(&p->control_block, - (p->q_spec.flags & 
SOXR_VR)? &_soxr_vr32_cb : -#if HAVE_SIMD - cpu_has_simd()? &_soxr_rate32s_cb : + control_block = +#if WITH_VR32 + ((!WITH_CR32 && !WITH_CR32S) || (p->q_spec.flags & SOXR_VR))? &_soxr_vr32_cb : #endif - &_soxr_rate32_cb, sizeof(p->control_block)); +#if WITH_CR32S + !WITH_CR32 || should_use_simd32()? &_soxr_rate32s_cb : +#endif + &_soxr_rate32_cb; } -#if HAVE_DOUBLE_PRECISION +#if WITH_CR64 || WITH_CR64S else #endif #endif -#if HAVE_DOUBLE_PRECISION +#if WITH_CR64 || WITH_CR64S { p->deinterleave = (deinterleave_t)_soxr_deinterleave; p->interleave = (interleave_t)_soxr_interleave; - memcpy(&p->control_block, &_soxr_rate64_cb, sizeof(p->control_block)); + control_block = +#if WITH_CR64S + !WITH_CR64 || should_use_simd64()? &_soxr_rate64s_cb : +#endif + &_soxr_rate64_cb; } #endif + memcpy(&p->control_block, control_block, sizeof(p->control_block)); - if (p->num_channels && io_ratio) + if (p->num_channels && io_ratio!=0) error = soxr_set_io_ratio(p, io_ratio, 0); } if (error) @@ -307,7 +497,8 @@ static void soxr_delete0(soxr_t p) double soxr_delay(soxr_t p) { - return (p && !p->error && p->resamplers)? resampler_delay(p->resamplers[0]) : 0; + return + (p && !p->error && p->resamplers)? resampler_delay(p->resamplers[0]) : 0; } @@ -375,13 +566,13 @@ soxr_error_t soxr_set_io_ratio(soxr_t p, double io_ratio, size_t slew_len) p->io_ratio = io_ratio; return initialise(p); } - if (p->control_block[8]) { + if (resampler_set_io_ratio) { for (i = 0; !error && i < p->num_channels; ++i) resampler_set_io_ratio(p->resamplers[i], io_ratio, slew_len); return error; } return fabs(p->io_ratio - io_ratio) < 1e-15? 0 : - "Varying O/I ratio is not supported with this quality level"; + "varying O/I ratio is not supported with this quality level"; } @@ -406,7 +597,7 @@ soxr_error_t soxr_clear(soxr_t p) /* TODO: this, properly. 
*/ p->io_spec = tmp.io_spec; p->num_channels = tmp.num_channels; p->input_fn_state = tmp.input_fn_state; - memcpy(p->control_block, tmp.control_block, sizeof(p->control_block)); + memcpy(&p->control_block, &tmp.control_block, sizeof(p->control_block)); p->deinterleave = tmp.deinterleave; p->interleave = tmp.interleave; return (p->q_spec.flags & RESET_ON_CLEAR)? @@ -481,13 +672,8 @@ static size_t soxr_output_no_callback(soxr_t p, soxr_buf_t out, size_t len) done = done1; } else #endif - { - if (p->num_channels > 1) { - for (u = 0; u < p->num_channels; ++u) - done = soxr_output_1ch(p, u, ((soxr_bufs_t)out)[u], len, separated); - } else - done = soxr_output_1ch(p, 0, out, len, separated); - } + for (u = 0; u < p->num_channels; ++u) + done = soxr_output_1ch(p, u, ((soxr_bufs_t)out)[u], len, separated); if (!separated) p->clips += (p->interleave)(p->io_spec.otype, &out, (sample_t const * const *)p->channel_ptrs, @@ -616,7 +802,7 @@ soxr_error_t soxr_oneshot( soxr_quality_spec_t const * q_spec, soxr_runtime_spec_t const * runtime_spec) { - soxr_t resampler = NULL; + soxr_t resampler; soxr_error_t error = q_spec? q_spec->e : 0; if (!error) { soxr_quality_spec_t q_spec1; diff --git a/soxr/src/soxr.h b/soxr/src/soxr.h index 8d9622d..09ec7c4 100644 --- a/soxr/src/soxr.h +++ b/soxr/src/soxr.h @@ -1,4 +1,4 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net * * This library is free software; you can redistribute it and/or modify it * under the terms of the GNU Lesser General Public License as published by @@ -65,8 +65,8 @@ input or output (e.g. ilen, olen). */ /* E.g. #if SOXR_THIS_VERSION >= SOXR_VERSION(0,1,1) ... 
*/ #define SOXR_VERSION(x,y,z) (((x)<<16)|((y)<<8)|(z)) -#define SOXR_THIS_VERSION SOXR_VERSION(0,1,2) -#define SOXR_THIS_VERSION_STR "0.1.2" +#define SOXR_THIS_VERSION SOXR_VERSION(0,1,3) +#define SOXR_THIS_VERSION_STR "0.1.3" @@ -173,7 +173,7 @@ SOXR size_t /*odone*/ soxr_output(/* Resample and output a block of data.*/ SOXR soxr_error_t soxr_error(soxr_t); /* Query error status. */ SOXR size_t * soxr_num_clips(soxr_t); /* Query int. clip counter (for R/W). */ SOXR double soxr_delay(soxr_t); /* Query current delay in output samples.*/ -SOXR char const * soxr_engine(soxr_t p); /* Query resampling engine name. */ +SOXR char const * soxr_engine(soxr_t); /* Query resampling engine name. */ SOXR soxr_error_t soxr_clear(soxr_t); /* Ready for fresh signal, same config. */ SOXR void soxr_delete(soxr_t); /* Free resources. */ @@ -249,7 +249,6 @@ struct soxr_quality_spec { /* Typically */ #define SOXR_ROLLOFF_MEDIUM 1u /* <= 0.35 dB */ #define SOXR_ROLLOFF_NONE 2u /* For Chebyshev bandwidth. */ -#define SOXR_MAINTAIN_3DB_PT 4u /* Reserved for internal use. */ #define SOXR_HI_PREC_CLOCK 8u /* Increase `irrational' ratio accuracy. */ #define SOXR_DOUBLE_PRECISION 16u /* Use D.P. calcs even if precision <= 20. */ #define SOXR_VR 32u /* Variable-rate resampling. */ @@ -257,21 +256,18 @@ struct soxr_quality_spec { /* Typically */ struct soxr_runtime_spec { /* Typically */ - unsigned log2_min_dft_size;/* For DFT efficiency. [8,15] 10 */ - unsigned log2_large_dft_size;/* For DFT efficiency. [16,20] 17 */ - unsigned coef_size_kbytes; /* For SOXR_COEF_INTERP_AUTO (below). 400 */ - unsigned num_threads; /* If built so. 0 means `automatic'. 1 */ - void * e; /* Reserved for internal use. 0 */ - unsigned long flags; /* Per the following #defines. 0 */ + unsigned log2_min_dft_size; /* For DFT efficiency. [8,15] 10 */ + unsigned log2_large_dft_size; /* For DFT efficiency. [8,20] 17 */ + unsigned coef_size_kbytes; /* For SOXR_COEF_INTERP_AUTO (below). 
400 */ + unsigned num_threads; /* 0: per OMP_NUM_THREADS; 1: 1 thread. 1 */ + void * e; /* Reserved for internal use. 0 */ + unsigned long flags; /* Per the following #defines. 0 */ }; /* For `irrational' ratios only: */ #define SOXR_COEF_INTERP_AUTO 0u /* Auto select coef. interpolation. */ #define SOXR_COEF_INTERP_LOW 2u /* Man. select: less CPU, more memory. */ #define SOXR_COEF_INTERP_HIGH 3u /* Man. select: more CPU, less memory. */ -#define SOXR_STRICT_BUFFERING 4u /* Reserved for future use. */ -#define SOXR_NOSMALLINTOPT 8u /* For test purposes only. */ - /* -------------------------- API type constructors ------------------------- */ @@ -296,7 +292,7 @@ SOXR soxr_quality_spec_t soxr_quality_spec( #define SOXR_24_BITQ 5 #define SOXR_28_BITQ 6 #define SOXR_32_BITQ 7 - /* Libsamplerate equivalent qualities: */ + /* Reserved for internal use (to be removed): */ #define SOXR_LSR0Q 8 /* 'Best sinc'. */ #define SOXR_LSR1Q 9 /* 'Medium sinc'. */ #define SOXR_LSR2Q 10 /* 'Fast sinc'. */ @@ -304,8 +300,8 @@ SOXR soxr_quality_spec_t soxr_quality_spec( #define SOXR_LINEAR_PHASE 0x00 #define SOXR_INTERMEDIATE_PHASE 0x10 #define SOXR_MINIMUM_PHASE 0x30 + #define SOXR_STEEP_FILTER 0x40 -#define SOXR_ALLOW_ALIASING 0x80 /* Reserved for future use. */ diff --git a/soxr/src/sse2neon.h b/soxr/src/sse2neon.h deleted file mode 100644 index 65efed3..0000000 --- a/soxr/src/sse2neon.h +++ /dev/null @@ -1,6292 +0,0 @@ -#ifndef SSE2NEON_H -#define SSE2NEON_H - -// This header file provides a simple API translation layer -// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions -// -// This header file does not yet translate all of the SSE intrinsics. -// -// Contributors to this work are: -// John W. 
Ratcliff -// Brandon Rowlett -// Ken Fast -// Eric van Beurden -// Alexander Potylitsin -// Hasindu Gamaarachchi -// Jim Huang -// Mark Cheng -// Malcolm James MacLeod -// Devin Hussey (easyaspi314) -// Sebastian Pop -// Developer Ecosystem Engineering -// Danila Kutenin -// François Turban (JishinMaster) -// Pei-Hsuan Hung -// Yang-Hao Yuan - -/* - * sse2neon is freely redistributable under the MIT License. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -/* Tunable configurations */ - -/* Enable precise implementation of _mm_min_ps and _mm_max_ps - * This would slow down the computation a bit, but gives consistent result with - * x86 SSE2. (e.g. 
would solve a hole or NaN pixel in the rendering result) - */ -#ifndef SSE2NEON_PRECISE_MINMAX -#define SSE2NEON_PRECISE_MINMAX (0) -#endif - -#if defined(__GNUC__) || defined(__clang__) -#pragma push_macro("FORCE_INLINE") -#pragma push_macro("ALIGN_STRUCT") -#define FORCE_INLINE static inline __attribute__((always_inline)) -#define ALIGN_STRUCT(x) __attribute__((aligned(x))) -#else -#error "Macro name collisions may happen with unsupported compiler." -#ifdef FORCE_INLINE -#undef FORCE_INLINE -#endif -#define FORCE_INLINE static inline -#ifndef ALIGN_STRUCT -#define ALIGN_STRUCT(x) __declspec(align(x)) -#endif -#endif - -#include -#include - -/* Architecture-specific build options */ -/* FIXME: #pragma GCC push_options is only available on GCC */ -#if defined(__GNUC__) -#if defined(__arm__) && __ARM_ARCH == 7 -/* According to ARM C Language Extensions Architecture specification, - * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) - * architecture supported. - */ -#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) -#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." -#endif -#pragma GCC push_options -#pragma GCC target("fpu=neon") -#elif defined(__aarch64__) -#pragma GCC push_options -#pragma GCC target("+simd") -#else -#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." -#endif -#endif - -#include - -/* Rounding functions require either Aarch64 instructions or libm failback */ -#if !defined(__aarch64__) -#include -#endif - -/* "__has_builtin" can be used to query support for built-in functions - * provided by gcc/clang and other compilers that support it. - */ -#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ -/* Compatibility with gcc <= 9 */ -#if __GNUC__ <= 9 -#define __has_builtin(x) HAS##x -#define HAS__builtin_popcount 1 -#define HAS__builtin_popcountll 1 -#else -#define __has_builtin(x) 0 -#endif -#endif - -/** - * MACRO for shuffle parameter for _mm_shuffle_ps(). 
- * Argument fp3 is a digit[0123] that represents the fp from argument "b" - * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same - * for fp2 in result. fp1 is a digit[0123] that represents the fp from - * argument "a" of mm_shuffle_ps that will be places in fp1 of result. - * fp0 is the same for fp0 of result. - */ -#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ - (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) - -/* Rounding mode macros. */ -#define _MM_FROUND_TO_NEAREST_INT 0x00 -#define _MM_FROUND_TO_NEG_INF 0x01 -#define _MM_FROUND_TO_POS_INF 0x02 -#define _MM_FROUND_TO_ZERO 0x03 -#define _MM_FROUND_CUR_DIRECTION 0x04 -#define _MM_FROUND_NO_EXC 0x08 - -/* indicate immediate constant argument in a given range */ -#define __constrange(a, b) const - -/* A few intrinsics accept traditional data types like ints or floats, but - * most operate on data types that are specific to SSE. - * If a vector type ends in d, it contains doubles, and if it does not have - * a suffix, it contains floats. An integer vector type can contain any type - * of integer, from chars to shorts to unsigned long longs. - */ -typedef int64x1_t __m64; -typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ -// On ARM 32-bit architecture, the float64x2_t is not supported. -// The data type __m128d should be represented in a different way for related -// intrinsic conversion. 
-#if defined(__aarch64__) -typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ -#else -typedef float32x4_t __m128d; -#endif -typedef int64x2_t __m128i; /* 128-bit vector containing integers */ - -/* type-safe casting between types */ - -#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) -#define vreinterpretq_m128_f32(x) (x) -#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) - -#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) -#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) -#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) -#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) - -#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) -#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) -#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) -#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) - -#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) -#define vreinterpretq_f32_m128(x) (x) -#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) - -#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) -#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) -#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) -#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) - -#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) -#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) -#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) -#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) - -#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) -#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) -#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) -#define vreinterpretq_m128i_s64(x) (x) - -#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) -#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) -#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) -#define vreinterpretq_m128i_u64(x) 
vreinterpretq_s64_u64(x) - -#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) -#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) -#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) -#define vreinterpretq_s64_m128i(x) (x) - -#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) -#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) -#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) -#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) - -#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) -#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) -#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) -#define vreinterpret_m64_s64(x) (x) - -#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) -#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) -#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) -#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) - -#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) -#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) -#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) - -#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) -#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) -#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) -#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) - -#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) -#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) -#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) -#define vreinterpret_s64_m64(x) (x) - -#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) - -#if defined(__aarch64__) -#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) -#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) - -#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) -#define vreinterpretq_m128d_f64(x) (x) - -#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) - -#define vreinterpretq_f64_m128d(x) (x) -#define vreinterpretq_f32_m128d(x) 
vreinterpretq_f32_f64(x) -#else -#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) -#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) -#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) - -#define vreinterpretq_m128d_f32(x) (x) - -#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) - -#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) - -#define vreinterpretq_f32_m128d(x) (x) -#endif - -// A struct is defined in this header file called 'SIMDVec' which can be used -// by applications which attempt to access the contents of an _m128 struct -// directly. It is important to note that accessing the __m128 struct directly -// is bad coding practice by Microsoft: @see: -// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx -// -// However, some legacy source code may try to access the contents of an __m128 -// struct directly so the developer can use the SIMDVec as an alias for it. Any -// casting must be done manually by the developer, as you cannot cast or -// otherwise alias the base NEON data type for intrinsic operations. -// -// union intended to allow direct access to an __m128 variable using the names -// that the MSVC compiler provides. This union should really only be used when -// trying to access the members of the vector as integer values. GCC/clang -// allow native access to the float members through a simple array access -// operator (in C since 4.6, in C++ since 4.8). -// -// Ideally direct accesses to SIMD vectors should not be used since it can cause -// a performance hit. If it really is needed however, the original __m128 -// variable can be aliased with a pointer to this union and used to access -// individual components. The use of this union should be hidden behind a macro -// that is used throughout the codebase to access the members instead of always -// declaring this type of variable. -typedef union ALIGN_STRUCT(16) SIMDVec { - float m128_f32[4]; // as floats - DON'T USE. Added for convenience. 
- int8_t m128_i8[16]; // as signed 8-bit integers. - int16_t m128_i16[8]; // as signed 16-bit integers. - int32_t m128_i32[4]; // as signed 32-bit integers. - int64_t m128_i64[2]; // as signed 64-bit integers. - uint8_t m128_u8[16]; // as unsigned 8-bit integers. - uint16_t m128_u16[8]; // as unsigned 16-bit integers. - uint32_t m128_u32[4]; // as unsigned 32-bit integers. - uint64_t m128_u64[2]; // as unsigned 64-bit integers. -} SIMDVec; - -// casting using SIMDVec -#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) -#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) -#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) - -/* Backwards compatibility for compilers with lack of specific type support */ - -// Older gcc does not define vld1q_u8_x4 type -#if defined(__GNUC__) && !defined(__clang__) -#if __GNUC__ <= 9 -FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p) -{ - uint8x16x4_t ret; - ret.val[0] = vld1q_u8(p + 0); - ret.val[1] = vld1q_u8(p + 16); - ret.val[2] = vld1q_u8(p + 32); - ret.val[3] = vld1q_u8(p + 48); - return ret; -} -#endif -#endif - -/* Function Naming Conventions - * The naming convention of SSE intrinsics is straightforward. A generic SSE - * intrinsic function is given as follows: - * _mm__ - * - * The parts of this format are given as follows: - * 1. describes the operation performed by the intrinsic - * 2. identifies the data type of the function's primary arguments - * - * This last part, , is a little complicated. 
It identifies the - * content of the input values, and can be set to any of the following values: - * + ps - vectors contain floats (ps stands for packed single-precision) - * + pd - vectors cantain doubles (pd stands for packed double-precision) - * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit - * signed integers - * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit - * unsigned integers - * + si128 - unspecified 128-bit vector or 256-bit vector - * + m128/m128i/m128d - identifies input vector types when they are different - * than the type of the returned vector - * - * For example, _mm_setzero_ps. The _mm implies that the function returns - * a 128-bit vector. The _ps at the end implies that the argument vectors - * contain floats. - * - * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) - * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits - * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); - * // Set packed 8-bit integers - * // 128 bits, 16 chars, per 8 bits - * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, - * 4, 5, 12, 13, 6, 7, 14, 15); - * // Shuffle packed 8-bit integers - * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb - * - * Data (Number, Binary, Byte Index): - +------+------+-------------+------+------+-------------+ - | 1 | 2 | 3 | 4 | Number - +------+------+------+------+------+------+------+------+ - | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary - +------+------+------+------+------+------+------+------+ - | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index - +------+------+------+------+------+------+------+------+ - - +------+------+------+------+------+------+------+------+ - | 5 | 6 | 7 | 8 | Number - +------+------+------+------+------+------+------+------+ - | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary - +------+------+------+------+------+------+------+------+ - | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index - 
+------+------+------+------+------+------+------+------+ - * Index (Byte Index): - +------+------+------+------+------+------+------+------+ - | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | - +------+------+------+------+------+------+------+------+ - - +------+------+------+------+------+------+------+------+ - | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | - +------+------+------+------+------+------+------+------+ - * Result: - +------+------+------+------+------+------+------+------+ - | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index - +------+------+------+------+------+------+------+------+ - | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary - +------+------+------+------+------+------+------+------+ - | 256 | 2 | 5 | 6 | Number - +------+------+------+------+------+------+------+------+ - - +------+------+------+------+------+------+------+------+ - | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index - +------+------+------+------+------+------+------+------+ - | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary - +------+------+------+------+------+------+------+------+ - | 3 | 7 | 4 | 8 | Number - +------+------+------+------+------+------+-------------+ - */ - -/* Set/get methods */ - -/* Constants for use with _mm_prefetch. */ -enum _mm_hint { - _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ - _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ - _MM_HINT_T1 = 2, /* load data to L2 cache only */ - _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ - _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ - _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ - _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ - _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ -}; - -// Loads one cache line of data from address p to a location closer to the -// processor. 
https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx -FORCE_INLINE void _mm_prefetch(const void *p, int i) -{ - (void) i; - __builtin_prefetch(p); -} - -// Copy the lower single-precision (32-bit) floating-point element of a to dst. -// -// dst[31:0] := a[31:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 -FORCE_INLINE float _mm_cvtss_f32(__m128 a) -{ - return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); -} - -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 32-bit integer, and store the result in dst. -// -// dst[31:0] := Convert_FP32_To_Int32(a[31:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 -#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) - -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 64-bit integer, and store the result in dst. -// -// dst[63:0] := Convert_FP32_To_Int64(a[31:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 -FORCE_INLINE int _mm_cvtss_si64(__m128 a) -{ -#if defined(__aarch64__) - return vgetq_lane_s64( - vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0); -#else - float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32_t diff = data - floor(data); - if (diff > 0.5) - return (int64_t) ceil(data); - if (diff == 0.5) { - int64_t f = (int64_t) floor(data); - int64_t c = (int64_t) ceil(data); - return c & 1 ? f : c; - } - return (int64_t) floor(data); -#endif -} - -// Convert packed single-precision (32-bit) floating-point elements in a to -// packed 32-bit integers with truncation, and store the results in dst. 
-// -// FOR j := 0 to 1 -// i := 32*j -// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi -FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) -{ - return vreinterpret_m64_s32( - vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); -} - -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 32-bit integer with truncation, and store the result in dst. -// -// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si -FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) -{ - return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); -} - -// Convert packed single-precision (32-bit) floating-point elements in a to -// packed 32-bit integers with truncation, and store the results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 -#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) - -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 32-bit integer with truncation, and store the result in dst. -// -// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 -#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) - -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 64-bit integer with truncation, and store the result in dst. 
-// -// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 -FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) -{ - return vgetq_lane_s64( - vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0); -} - -// Sets the 128-bit value to zero -// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx -FORCE_INLINE __m128i _mm_setzero_si128(void) -{ - return vreinterpretq_m128i_s32(vdupq_n_s32(0)); -} - -// Clears the four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setzero_ps(void) -{ - return vreinterpretq_m128_f32(vdupq_n_f32(0)); -} - -// Return vector of type __m128d with all elements set to zero. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd -FORCE_INLINE __m128d _mm_setzero_pd(void) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vdupq_n_f64(0)); -#else - return vreinterpretq_m128d_f32(vdupq_n_f32(0)); -#endif -} - -// Sets the four single-precision, floating-point values to w. -// -// r0 := r1 := r2 := r3 := w -// -// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set1_ps(float _w) -{ - return vreinterpretq_m128_f32(vdupq_n_f32(_w)); -} - -// Sets the four single-precision, floating-point values to w. -// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set_ps1(float _w) -{ - return vreinterpretq_m128_f32(vdupq_n_f32(_w)); -} - -// Sets the four single-precision, floating-point values to the four inputs. 
-// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx -FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) -{ - float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -} - -// Copy single-precision (32-bit) floating-point element a to the lower element -// of dst, and zero the upper 3 elements. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss -FORCE_INLINE __m128 _mm_set_ss(float a) -{ - float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -} - -// Sets the four single-precision, floating-point values to the four inputs in -// reverse order. -// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx -FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) -{ - float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -} - -// Sets the 8 signed 16-bit integer values in reverse order. -// -// Return Value -// r0 := w0 -// r1 := w1 -// ... -// r7 := w7 -FORCE_INLINE __m128i _mm_setr_epi16(short w0, - short w1, - short w2, - short w3, - short w4, - short w5, - short w6, - short w7) -{ - int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; - return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); -} - -// Sets the 4 signed 32-bit integer values in reverse order -// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx -FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) -{ - int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; - return vreinterpretq_m128i_s32(vld1q_s32(data)); -} - -// Set packed 64-bit integers in dst with the supplied values in reverse order. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 -FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) -{ - return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); -} - -// Sets the 16 signed 8-bit integer values to b. -// -// r0 := b -// r1 := b -// ... -// r15 := b -// -// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set1_epi8(signed char w) -{ - return vreinterpretq_m128i_s8(vdupq_n_s8(w)); -} - -// Broadcast double-precision (64-bit) floating-point value a to all elements of -// dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd -FORCE_INLINE __m128d _mm_set1_pd(double d) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vdupq_n_f64(d)); -#else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); -#endif -} - -// Sets the 8 signed 16-bit integer values to w. -// -// r0 := w -// r1 := w -// ... -// r7 := w -// -// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx -FORCE_INLINE __m128i _mm_set1_epi16(short w) -{ - return vreinterpretq_m128i_s16(vdupq_n_s16(w)); -} - -// Sets the 16 signed 8-bit integer values. -// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx -FORCE_INLINE __m128i _mm_set_epi8(signed char b15, - signed char b14, - signed char b13, - signed char b12, - signed char b11, - signed char b10, - signed char b9, - signed char b8, - signed char b7, - signed char b6, - signed char b5, - signed char b4, - signed char b3, - signed char b2, - signed char b1, - signed char b0) -{ - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; - return (__m128i) vld1q_s8(data); -} - -// Sets the 8 signed 16-bit integer values. 
-// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx -FORCE_INLINE __m128i _mm_set_epi16(short i7, - short i6, - short i5, - short i4, - short i3, - short i2, - short i1, - short i0) -{ - int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; - return vreinterpretq_m128i_s16(vld1q_s16(data)); -} - -// Sets the 16 signed 8-bit integer values in reverse order. -// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx -FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, - signed char b1, - signed char b2, - signed char b3, - signed char b4, - signed char b5, - signed char b6, - signed char b7, - signed char b8, - signed char b9, - signed char b10, - signed char b11, - signed char b12, - signed char b13, - signed char b14, - signed char b15) -{ - int8_t ALIGN_STRUCT(16) - data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, - (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, - (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, - (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; - return (__m128i) vld1q_s8(data); -} - -// Sets the 4 signed 32-bit integer values to i. -// -// r0 := i -// r1 := i -// r2 := i -// r3 := I -// -// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set1_epi32(int _i) -{ - return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); -} - -// Sets the 2 signed 64-bit integer values to i. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) -FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) -{ - return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); -} - -// Sets the 2 signed 64-bit integer values to i. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x -FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) -{ - return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); -} - -// Sets the 4 signed 32-bit integer values. 
-// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx -FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) -{ - int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; - return vreinterpretq_m128i_s32(vld1q_s32(data)); -} - -// Returns the __m128i structure with its two 64-bit integer values -// initialized to the values of the two 64-bit integers passed in. -// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx -FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) -{ - int64_t ALIGN_STRUCT(16) data[2] = {i2, i1}; - return vreinterpretq_m128i_s64(vld1q_s64(data)); -} - -// Returns the __m128i structure with its two 64-bit integer values -// initialized to the values of the two 64-bit integers passed in. -// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx -FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) -{ - return _mm_set_epi64x((int64_t) i1, (int64_t) i2); -} - -// Set packed double-precision (64-bit) floating-point elements in dst with the -// supplied values. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd -FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) -{ - double ALIGN_STRUCT(16) data[2] = {e0, e1}; -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); -#else - return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); -#endif -} - -// Stores four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx -FORCE_INLINE void _mm_store_ps(float *p, __m128 a) -{ - vst1q_f32(p, vreinterpretq_f32_m128(a)); -} - -// Stores four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx -FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) -{ - vst1q_f32(p, vreinterpretq_f32_m128(a)); -} - -// Stores four 32-bit integer values as (as a __m128i value) at the address p. 
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx -FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) -{ - vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); -} - -// Stores four 32-bit integer values as (as a __m128i value) at the address p. -// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx -FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) -{ - vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); -} - -// Stores the lower single - precision, floating - point value. -// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx -FORCE_INLINE void _mm_store_ss(float *p, __m128 a) -{ - vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); -} - -// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point -// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary -// or a general-protection exception may be generated. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd -FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) -{ -#if defined(__aarch64__) - vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); -#else - vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); -#endif -} - -// Store the lower double-precision (64-bit) floating-point element from a into -// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte -// boundary or a general-protection exception may be generated. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 -FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) -{ -#if defined(__aarch64__) - float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); - vst1q_f64((float64_t *) mem_addr, - vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); -#else - float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); - vst1q_f32((float32_t *) mem_addr, - vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); -#endif -} - -// Store the lower double-precision (64-bit) floating-point element from a into -// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte -// boundary or a general-protection exception may be generated. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd -#define _mm_store1_pd _mm_store_pd1 - -// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point -// elements) from a into memory. mem_addr does not need to be aligned on any -// particular boundary. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd -FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) -{ - _mm_store_pd(mem_addr, a); -} - -// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. -// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx -FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) -{ - uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); - uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); - *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); -} - -// Stores the lower two single-precision floating point values of a to the -// address p. 
-// -// *p0 := a0 -// *p1 := a1 -// -// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx -FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) -{ - *p = vreinterpret_m64_f32(vget_low_f32(a)); -} - -// Stores the upper two single-precision, floating-point values of a to the -// address p. -// -// *p0 := a2 -// *p1 := a3 -// -// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx -FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) -{ - *p = vreinterpret_m64_f32(vget_high_f32(a)); -} - -// Loads a single single-precision, floating-point value, copying it into all -// four words -// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx -FORCE_INLINE __m128 _mm_load1_ps(const float *p) -{ - return vreinterpretq_m128_f32(vld1q_dup_f32(p)); -} - -// Load a single-precision (32-bit) floating-point element from memory into all -// elements of dst. -// -// dst[31:0] := MEM[mem_addr+31:mem_addr] -// dst[63:32] := MEM[mem_addr+31:mem_addr] -// dst[95:64] := MEM[mem_addr+31:mem_addr] -// dst[127:96] := MEM[mem_addr+31:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 -#define _mm_load_ps1 _mm_load1_ps - -// Sets the lower two single-precision, floating-point values with 64 -// bits of data loaded from the address p; the upper two values are passed -// through from a. -// -// Return Value -// r0 := *p0 -// r1 := *p1 -// r2 := a2 -// r3 := a3 -// -// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx -FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) -{ - return vreinterpretq_m128_f32( - vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); -} - -// Load 4 single-precision (32-bit) floating-point elements from memory into dst -// in reverse order. mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. 
-// -// dst[31:0] := MEM[mem_addr+127:mem_addr+96] -// dst[63:32] := MEM[mem_addr+95:mem_addr+64] -// dst[95:64] := MEM[mem_addr+63:mem_addr+32] -// dst[127:96] := MEM[mem_addr+31:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps -FORCE_INLINE __m128 _mm_loadr_ps(const float *p) -{ - float32x4_t v = vrev64q_f32(vld1q_f32(p)); - return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); -} - -// Sets the upper two single-precision, floating-point values with 64 -// bits of data loaded from the address p; the lower two values are passed -// through from a. -// -// r0 := a0 -// r1 := a1 -// r2 := *p0 -// r3 := *p1 -// -// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx -FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) -{ - return vreinterpretq_m128_f32( - vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); -} - -// Loads four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx -FORCE_INLINE __m128 _mm_load_ps(const float *p) -{ - return vreinterpretq_m128_f32(vld1q_f32(p)); -} - -// Loads four single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_loadu_ps(const float *p) -{ - // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are - // equivalent for neon - return vreinterpretq_m128_f32(vld1q_f32(p)); -} - -// Load unaligned 16-bit integer from memory into the first element of dst. -// -// dst[15:0] := MEM[mem_addr+15:mem_addr] -// dst[MAX:16] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 -FORCE_INLINE __m128i _mm_loadu_si16(const void *p) -{ - return vreinterpretq_m128i_s16( - vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); -} - -// Load unaligned 64-bit integer from memory into the first element of dst. 
-// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[MAX:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 -FORCE_INLINE __m128i _mm_loadu_si64(const void *p) -{ - return vreinterpretq_m128i_s64( - vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); -} - -// Load a double-precision (64-bit) floating-point element from memory into the -// lower of dst, and zero the upper element. mem_addr does not need to be -// aligned on any particular boundary. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd -FORCE_INLINE __m128d _mm_load_sd(const double *p) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); -#else - const float *fp = (const float *) p; - float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; - return vreinterpretq_m128d_f32(vld1q_f32(data)); -#endif -} - -// Loads two double-precision from 16-byte aligned memory, floating-point -// values. -// -// dst[127:0] := MEM[mem_addr+127:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd -FORCE_INLINE __m128d _mm_load_pd(const double *p) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vld1q_f64(p)); -#else - const float *fp = (const float *) p; - float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; - return vreinterpretq_m128d_f32(vld1q_f32(data)); -#endif -} - -// Loads two double-precision from unaligned memory, floating-point values. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd -FORCE_INLINE __m128d _mm_loadu_pd(const double *p) -{ - return _mm_load_pd(p); -} - -// Loads an single - precision, floating - point value into the low word and -// clears the upper three words. 
-// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_load_ss(const float *p) -{ - return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); -} - -FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) -{ - /* Load the lower 64 bits of the value pointed to by p into the - * lower 64 bits of the result, zeroing the upper 64 bits of the result. - */ - return vreinterpretq_m128i_s32( - vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); -} - -// Load a double-precision (64-bit) floating-point element from memory into the -// lower element of dst, and copy the upper element from a to dst. mem_addr does -// not need to be aligned on any particular boundary. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := a[127:64] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd -FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); -#else - return vreinterpretq_m128d_f32( - vcombine_f32(vld1_f32((const float *) p), - vget_high_f32(vreinterpretq_f32_m128d(a)))); -#endif -} - -// Load 2 double-precision (64-bit) floating-point elements from memory into dst -// in reverse order. mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. 
-// -// dst[63:0] := MEM[mem_addr+127:mem_addr+64] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd -FORCE_INLINE __m128d _mm_loadr_pd(const double *p) -{ -#if defined(__aarch64__) - float64x2_t v = vld1q_f64(p); - return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); -#else - int64x2_t v = vld1q_s64((const int64_t *) p); - return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); -#endif -} - -// Sets the low word to the single-precision, floating-point value of b -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) -FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) -{ - return vreinterpretq_m128_f32( - vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), - vreinterpretq_f32_m128(a), 0)); -} - -// Move the lower double-precision (64-bit) floating-point element from b to the -// lower element of dst, and copy the upper element from a to the upper element -// of dst. -// -// dst[63:0] := b[63:0] -// dst[127:64] := a[127:64] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd -FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) -{ - return vreinterpretq_m128d_f32( - vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), - vget_high_f32(vreinterpretq_f32_m128d(a)))); -} - -// Copy the lower 64-bit integer in a to the lower element of dst, and zero the -// upper element. -// -// dst[63:0] := a[63:0] -// dst[127:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 -FORCE_INLINE __m128i _mm_move_epi64(__m128i a) -{ - return vreinterpretq_m128i_s64( - vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); -} - -/* Logic/Binary operations */ - -// Computes the bitwise AND-NOT of the four single-precision, floating-point -// values of a and b. 
-// -// r0 := ~a0 & b0 -// r1 := ~a1 & b1 -// r2 := ~a2 & b2 -// r3 := ~a3 & b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx -FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_s32( - vbicq_s32(vreinterpretq_s32_m128(b), - vreinterpretq_s32_m128(a))); // *NOTE* argument swap -} - -// Compute the bitwise NOT of packed double-precision (64-bit) floating-point -// elements in a and then AND with b, and store the results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd -FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) -{ - // *NOTE* argument swap - return vreinterpretq_m128d_s64( - vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); -} - -// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the -// 128-bit value in a. -// -// r := (~a) & b -// -// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vbicq_s32(vreinterpretq_s32_m128i(b), - vreinterpretq_s32_m128i(a))); // *NOTE* argument swap -} - -// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in -// b. -// -// r := a & b -// -// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Computes the bitwise AND of the four single-precision, floating-point values -// of a and b. 
-// -// r0 := a0 & b0 -// r1 := a1 & b1 -// r2 := a2 & b2 -// r3 := a3 & b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_s32( - vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); -} - -// Compute the bitwise AND of packed double-precision (64-bit) floating-point -// elements in a and b, and store the results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := a[i+63:i] AND b[i+63:i] -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd -FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) -{ - return vreinterpretq_m128d_s64( - vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); -} - -// Computes the bitwise OR of the four single-precision, floating-point values -// of a and b. -// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx -FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_s32( - vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); -} - -// Computes bitwise EXOR (exclusive-or) of the four single-precision, -// floating-point values of a and b. -// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx -FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_s32( - veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); -} - -// Compute the bitwise XOR of packed double-precision (64-bit) floating-point -// elements in a and b, and store the results in dst. 
-// -// FOR j := 0 to 1 -// i := j*64 -// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd -FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) -{ - return vreinterpretq_m128d_s64( - veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); -} - -// Compute the bitwise OR of packed double-precision (64-bit) floating-point -// elements in a and b, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd -FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) -{ - return vreinterpretq_m128d_s64( - vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); -} - -// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. -// -// r := a | b -// -// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx -FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in -// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx -FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Duplicate odd-indexed single-precision (32-bit) floating-point elements -// from a, and store the results in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps -FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) -{ -#if __has_builtin(__builtin_shufflevector) - return vreinterpretq_m128_f32(__builtin_shufflevector( - vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); -#else - float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); - float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); - float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -#endif -} - -// Duplicate even-indexed single-precision (32-bit) floating-point elements -// from a, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps -FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) -{ -#if __has_builtin(__builtin_shufflevector) - return vreinterpretq_m128_f32(__builtin_shufflevector( - vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); -#else - float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); - float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; - return vreinterpretq_m128_f32(vld1q_f32(data)); -#endif -} - -// Moves the upper two values of B into the lower two values of A. -// -// r3 := a3 -// r2 := a2 -// r1 := b3 -// r0 := b2 -FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) -{ - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); - return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); -} - -// Moves the lower two values of B into the upper two values of A. 
-// -// r3 := b1 -// r2 := b0 -// r1 := a1 -// r0 := a0 -FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) -{ - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); - return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); -} - -// Compute the absolute value of packed signed 32-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// dst[i+31:i] := ABS(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 -FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) -{ - return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); -} - -// Compute the absolute value of packed signed 16-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// dst[i+15:i] := ABS(a[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 -FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) -{ - return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); -} - -// Compute the absolute value of packed signed 8-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 15 -// i := j*8 -// dst[i+7:i] := ABS(a[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 -FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) -{ - return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); -} - -// Compute the absolute value of packed signed 32-bit integers in a, and store -// the unsigned results in dst. 
-// -// FOR j := 0 to 1 -// i := j*32 -// dst[i+31:i] := ABS(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 -FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) -{ - return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); -} - -// Compute the absolute value of packed signed 16-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := ABS(a[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 -FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) -{ - return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); -} - -// Compute the absolute value of packed signed 8-bit integers in a, and store -// the unsigned results in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := ABS(a[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 -FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) -{ - return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); -} - -// Takes the upper 64 bits of a and places it in the low end of the result -// Takes the lower 64 bits of b and places it into the high end of the result. -FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) -{ - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); -} - -// takes the lower two 32-bit values from a and swaps them and places in high -// end of result takes the higher two 32 bit values from b and swaps them and -// places in low end of result. 
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); - return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) -{ - float32x2_t a21 = vget_high_f32( - vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); - float32x2_t b03 = vget_low_f32( - vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); - return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) -{ - float32x2_t a03 = vget_low_f32( - vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); - float32x2_t b21 = vget_high_f32( - vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); - return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) -{ - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); - return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); -} - -// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the -// high -FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) -{ - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t b32 = 
vget_high_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) -{ - float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) -{ - float32x2_t a22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) -{ - float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t b22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); - return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) -{ - float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32x2_t a22 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); - float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) -{ - float32x2_t a33 = - vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); - float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); - return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) -{ - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - float32x2_t b20 = vset_lane_f32(b2, b00, 1); - return 
vreinterpretq_m128_f32(vcombine_f32(a10, b20)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) -{ - float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - float32x2_t b20 = vset_lane_f32(b2, b00, 1); - return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); -} - -FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) -{ - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32_t b2 = vgetq_lane_f32(b, 2); - float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); - float32x2_t b20 = vset_lane_f32(b2, b00, 1); - return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); -} - -// NEON does not support a general purpose permute intrinsic -// Selects four specific single-precision, floating-point values from a and b, -// based on the mask i. -// -// C equivalent: -// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, -// __constrange(0, 255) int imm) { -// __m128 ret; -// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; -// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; -// return ret; -// } -// -// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx -#define _mm_shuffle_ps_default(a, b, imm) \ - __extension__({ \ - float32x4_t ret; \ - ret = vmovq_n_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ - ret, 1); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ - ret, 2); \ - ret = vsetq_lane_f32( \ - vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ - ret, 3); \ - vreinterpretq_m128_f32(ret); \ - }) - -// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) -// int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shuffle_ps(a, b, imm) \ - 
__extension__({ \ - float32x4_t _input1 = vreinterpretq_f32_m128(a); \ - float32x4_t _input2 = vreinterpretq_f32_m128(b); \ - float32x4_t _shuf = __builtin_shufflevector( \ - _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ - (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ - vreinterpretq_m128_f32(_shuf); \ - }) -#else // generic -#define _mm_shuffle_ps(a, b, imm) \ - __extension__({ \ - __m128 ret; \ - switch (imm) { \ - case _MM_SHUFFLE(1, 0, 3, 2): \ - ret = _mm_shuffle_ps_1032((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 3, 0, 1): \ - ret = _mm_shuffle_ps_2301((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 3, 2, 1): \ - ret = _mm_shuffle_ps_0321((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 1, 0, 3): \ - ret = _mm_shuffle_ps_2103((a), (b)); \ - break; \ - case _MM_SHUFFLE(1, 0, 1, 0): \ - ret = _mm_movelh_ps((a), (b)); \ - break; \ - case _MM_SHUFFLE(1, 0, 0, 1): \ - ret = _mm_shuffle_ps_1001((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 1, 0, 1): \ - ret = _mm_shuffle_ps_0101((a), (b)); \ - break; \ - case _MM_SHUFFLE(3, 2, 1, 0): \ - ret = _mm_shuffle_ps_3210((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 0, 1, 1): \ - ret = _mm_shuffle_ps_0011((a), (b)); \ - break; \ - case _MM_SHUFFLE(0, 0, 2, 2): \ - ret = _mm_shuffle_ps_0022((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 2, 0, 0): \ - ret = _mm_shuffle_ps_2200((a), (b)); \ - break; \ - case _MM_SHUFFLE(3, 2, 0, 2): \ - ret = _mm_shuffle_ps_3202((a), (b)); \ - break; \ - case _MM_SHUFFLE(3, 2, 3, 2): \ - ret = _mm_movehl_ps((b), (a)); \ - break; \ - case _MM_SHUFFLE(1, 1, 3, 3): \ - ret = _mm_shuffle_ps_1133((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 0, 1, 0): \ - ret = _mm_shuffle_ps_2010((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 0, 0, 1): \ - ret = _mm_shuffle_ps_2001((a), (b)); \ - break; \ - case _MM_SHUFFLE(2, 0, 3, 2): \ - ret = _mm_shuffle_ps_2032((a), (b)); \ - break; \ - default: \ - ret = _mm_shuffle_ps_default((a), (b), (imm)); \ - break; \ - } \ - ret; \ - }) -#endif - -// 
Takes the upper 64 bits of a and places it in the low end of the result -// Takes the lower 64 bits of a and places it into the high end of the result. -FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) -{ - int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); - int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); - return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); -} - -// takes the lower two 32-bit values from a and swaps them and places in low end -// of result takes the higher two 32 bit values from a and swaps them and places -// in high end of result. -FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) -{ - int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); - return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); -} - -// rotates the least significant 32 bits into the most signficant 32 bits, and -// shifts the rest down -FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) -{ - return vreinterpretq_m128i_s32( - vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); -} - -// rotates the most significant 32 bits into the least signficant 32 bits, and -// shifts the rest up -FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) -{ - return vreinterpretq_m128i_s32( - vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); -} - -// gets the lower 64 bits of a, and places it in the upper 64 bits -// gets the lower 64 bits of a and places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) -{ - int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); - return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); -} - -// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the -// lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) -{ - int32x2_t a01 = 
vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); - return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); -} - -// gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the -// upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and -// places it in the lower 64 bits -FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) -{ - int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) -{ - int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); - int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); - return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) -{ - int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); - int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); - return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); -} - -FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) -{ - int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); - int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); - return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); -} - -// Shuffle packed 8-bit integers in a according to shuffle control mask in the -// corresponding 8-bit element of b, and store the results in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 -FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) -{ - int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a - uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b - uint8x16_t idx_masked = - vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits -#if defined(__aarch64__) - return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); -#elif defined(__GNUC__) - int8x16_t ret; - // %e and %f represent the even and odd D registers - // respectively. - __asm__ __volatile__( - "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" - "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" - : [ret] "=&w"(ret) - : [tbl] "w"(tbl), [idx] "w"(idx_masked)); - return vreinterpretq_m128i_s8(ret); -#else - // use this line if testing on aarch64 - int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; - return vreinterpretq_m128i_s8( - vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), - vtbl2_s8(a_split, vget_high_u8(idx_masked)))); -#endif -} - -// C equivalent: -// __m128i _mm_shuffle_epi32_default(__m128i a, -// __constrange(0, 255) int imm) { -// __m128i ret; -// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; -// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; -// return ret; -// } -#define _mm_shuffle_epi32_default(a, imm) \ - __extension__({ \ - int32x4_t ret; \ - ret = vmovq_n_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ - ret, 1); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ - ret, 2); \ - ret = vsetq_lane_s32( \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ - ret, 3); \ - vreinterpretq_m128i_s32(ret); \ - }) - -// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) -// int imm) -#if defined(__aarch64__) -#define 
_mm_shuffle_epi32_splat(a, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ - }) -#else -#define _mm_shuffle_epi32_splat(a, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ - }) -#endif - -// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. -// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx -// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, -// __constrange(0,255) int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shuffle_epi32(a, imm) \ - __extension__({ \ - int32x4_t _input = vreinterpretq_s32_m128i(a); \ - int32x4_t _shuf = __builtin_shufflevector( \ - _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ - ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ - vreinterpretq_m128i_s32(_shuf); \ - }) -#else // generic -#define _mm_shuffle_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - switch (imm) { \ - case _MM_SHUFFLE(1, 0, 3, 2): \ - ret = _mm_shuffle_epi_1032((a)); \ - break; \ - case _MM_SHUFFLE(2, 3, 0, 1): \ - ret = _mm_shuffle_epi_2301((a)); \ - break; \ - case _MM_SHUFFLE(0, 3, 2, 1): \ - ret = _mm_shuffle_epi_0321((a)); \ - break; \ - case _MM_SHUFFLE(2, 1, 0, 3): \ - ret = _mm_shuffle_epi_2103((a)); \ - break; \ - case _MM_SHUFFLE(1, 0, 1, 0): \ - ret = _mm_shuffle_epi_1010((a)); \ - break; \ - case _MM_SHUFFLE(1, 0, 0, 1): \ - ret = _mm_shuffle_epi_1001((a)); \ - break; \ - case _MM_SHUFFLE(0, 1, 0, 1): \ - ret = _mm_shuffle_epi_0101((a)); \ - break; \ - case _MM_SHUFFLE(2, 2, 1, 1): \ - ret = _mm_shuffle_epi_2211((a)); \ - break; \ - case _MM_SHUFFLE(0, 1, 2, 2): \ - ret = _mm_shuffle_epi_0122((a)); \ - break; \ - case _MM_SHUFFLE(3, 3, 3, 2): \ - ret = _mm_shuffle_epi_3332((a)); \ - break; \ - case _MM_SHUFFLE(0, 0, 0, 0): \ - ret = _mm_shuffle_epi32_splat((a), 0); \ - break; \ - case _MM_SHUFFLE(1, 1, 1, 1): \ - ret = _mm_shuffle_epi32_splat((a), 
1); \ - break; \ - case _MM_SHUFFLE(2, 2, 2, 2): \ - ret = _mm_shuffle_epi32_splat((a), 2); \ - break; \ - case _MM_SHUFFLE(3, 3, 3, 3): \ - ret = _mm_shuffle_epi32_splat((a), 3); \ - break; \ - default: \ - ret = _mm_shuffle_epi32_default((a), (imm)); \ - break; \ - } \ - ret; \ - }) -#endif - -// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified -// by imm. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) -// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, -// __constrange(0,255) int -// imm) -#define _mm_shufflelo_epi16_function(a, imm) \ - __extension__({ \ - int16x8_t ret = vreinterpretq_s16_m128i(a); \ - int16x4_t lowBits = vget_low_s16(ret); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ - 1); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ - 2); \ - ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ - 3); \ - vreinterpretq_m128i_s16(ret); \ - }) - -// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, -// __constrange(0,255) int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shufflelo_epi16(a, imm) \ - __extension__({ \ - int16x8_t _input = vreinterpretq_s16_m128i(a); \ - int16x8_t _shuf = __builtin_shufflevector( \ - _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ - (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ - vreinterpretq_m128i_s16(_shuf); \ - }) -#else // generic -#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) -#endif - -// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified -// by imm. 
-// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx -// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, -// __constrange(0,255) int -// imm) -#define _mm_shufflehi_epi16_function(a, imm) \ - __extension__({ \ - int16x8_t ret = vreinterpretq_s16_m128i(a); \ - int16x4_t highBits = vget_high_s16(ret); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ - 5); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ - 6); \ - ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ - 7); \ - vreinterpretq_m128i_s16(ret); \ - }) - -// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, -// __constrange(0,255) int imm) -#if __has_builtin(__builtin_shufflevector) -#define _mm_shufflehi_epi16(a, imm) \ - __extension__({ \ - int16x8_t _input = vreinterpretq_s16_m128i(a); \ - int16x8_t _shuf = __builtin_shufflevector( \ - _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ - (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ - (((imm) >> 6) & 0x3) + 4); \ - vreinterpretq_m128i_s16(_shuf); \ - }) -#else // generic -#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) -#endif - -// Blend packed 16-bit integers from a and b using control mask imm8, and store -// the results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// IF imm8[j] -// dst[i+15:i] := b[i+15:i] -// ELSE -// dst[i+15:i] := a[i+15:i] -// FI -// ENDFOR -// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, -// __constrange(0,255) int imm) -#define _mm_blend_epi16(a, b, imm) \ - __extension__({ \ - const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \ - ((imm) & (1 << 6)) ? 
0xFFFF : 0x0000, \ - ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \ - uint16x8_t _mask_vec = vld1q_u16(_mask); \ - uint16x8_t _a = vreinterpretq_u16_m128i(a); \ - uint16x8_t _b = vreinterpretq_u16_m128i(b); \ - vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ - }) - -// Blend packed 8-bit integers from a and b using mask, and store the results in -// dst. -// -// FOR j := 0 to 15 -// i := j*8 -// IF mask[i+7] -// dst[i+7:i] := b[i+7:i] -// ELSE -// dst[i+7:i] := a[i+7:i] -// FI -// ENDFOR -FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) -{ - // Use a signed shift right to create a mask with the sign bit - uint8x16_t mask = - vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); - uint8x16_t a = vreinterpretq_u8_m128i(_a); - uint8x16_t b = vreinterpretq_u8_m128i(_b); - return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); -} - -/* Shifts */ - - -// Shift packed 16-bit integers in a right by imm while shifting in sign -// bits, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 -FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) -{ - const int count = (imm & ~15) ? 15 : imm; - return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); -} - -// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while -// shifting in zeros. -// -// r0 := a0 << count -// r1 := a1 << count -// ... -// r7 := a7 << count -// -// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx -#define _mm_slli_epi16(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) <= 0) { \ - ret = a; \ - } else if ((imm) > 15) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_s16( \ - vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \ - } \ - ret; \ - }) - -// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while -// shifting in zeros. 
: -// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx -// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm) -FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) -{ - if (imm <= 0) /* TODO: add constant range macro: [0, 255] */ - return a; - if (imm > 31) /* TODO: add unlikely macro */ - return _mm_setzero_si128(); - return vreinterpretq_m128i_s32( - vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); -} - -// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and -// store the results in dst. -FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) -{ - if (imm <= 0) /* TODO: add constant range macro: [0, 255] */ - return a; - if (imm > 63) /* TODO: add unlikely macro */ - return _mm_setzero_si128(); - return vreinterpretq_m128i_s64( - vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); -} - -// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and -// store the results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// IF imm8[7:0] > 15 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0]) -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16 -#define _mm_srli_epi16(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) == 0) { \ - ret = a; \ - } else if (0 < (imm) && (imm) < 16) { \ - ret = vreinterpretq_m128i_u16( \ - vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \ - } else { \ - ret = _mm_setzero_si128(); \ - } \ - ret; \ - }) - -// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and -// store the results in dst. 
-// -// FOR j := 0 to 3 -// i := j*32 -// IF imm8[7:0] > 31 -// dst[i+31:i] := 0 -// ELSE -// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0]) -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32 -// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) -#define _mm_srli_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) == 0) { \ - ret = a; \ - } else if (0 < (imm) && (imm) < 32) { \ - ret = vreinterpretq_m128i_u32( \ - vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \ - } else { \ - ret = _mm_setzero_si128(); \ - } \ - ret; \ - }) - -// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and -// store the results in dst. -// -// FOR j := 0 to 1 -// i := j*64 -// IF imm8[7:0] > 63 -// dst[i+63:i] := 0 -// ELSE -// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0]) -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64 -#define _mm_srli_epi64(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) == 0) { \ - ret = a; \ - } else if (0 < (imm) && (imm) < 64) { \ - ret = vreinterpretq_m128i_u64( \ - vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \ - } else { \ - ret = _mm_setzero_si128(); \ - } \ - ret; \ - }) - -// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, -// and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*32 -// IF imm8[7:0] > 31 -// dst[i+31:i] := (a[i+31] ? 
0xFFFFFFFF : 0x0) -// ELSE -// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0]) -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32 -// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) -#define _mm_srai_epi32(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) == 0) { \ - ret = a; \ - } else if (0 < (imm) && (imm) < 32) { \ - ret = vreinterpretq_m128i_s32( \ - vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \ - } else { \ - ret = vreinterpretq_m128i_s32( \ - vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \ - } \ - ret; \ - }) - -// Shifts the 128 - bit value in a right by imm bytes while shifting in -// zeros.imm must be an immediate. -// -// r := srl(a, imm*8) -// -// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx -// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm) -#define _mm_srli_si128(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) <= 0) { \ - ret = a; \ - } else if ((imm) > 15) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_s8( \ - vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \ - } \ - ret; \ - }) - -// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm -// must be an immediate. -// -// r := a << (imm * 8) -// -// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx -// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm) -#define _mm_slli_si128(a, imm) \ - __extension__({ \ - __m128i ret; \ - if ((imm) <= 0) { \ - ret = a; \ - } else if ((imm) > 15) { \ - ret = _mm_setzero_si128(); \ - } else { \ - ret = vreinterpretq_m128i_s8(vextq_s8( \ - vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \ - } \ - ret; \ - }) - -// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while -// shifting in zeros. -// -// r0 := a0 << count -// r1 := a1 << count -// ... 
-// r7 := a7 << count -// -// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 15) - return _mm_setzero_si128(); - - int16x8_t vc = vdupq_n_s16((int16_t) c); - return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); -} - -// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while -// shifting in zeros. -// -// r0 := a0 << count -// r1 := a1 << count -// r2 := a2 << count -// r3 := a3 << count -// -// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 31) - return _mm_setzero_si128(); - - int32x4_t vc = vdupq_n_s32((int32_t) c); - return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); -} - -// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while -// shifting in zeros. -// -// r0 := a0 << count -// r1 := a1 << count -// -// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 63) - return _mm_setzero_si128(); - - int64x2_t vc = vdupq_n_s64((int64_t) c); - return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); -} - -// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits -// while shifting in zeros. -// -// r0 := srl(a0, count) -// r1 := srl(a1, count) -// ... 
-// r7 := srl(a7, count) -// -// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 15) - return _mm_setzero_si128(); - - int16x8_t vc = vdupq_n_s16(-(int16_t) c); - return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); -} - -// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits -// while shifting in zeros. -// -// r0 := srl(a0, count) -// r1 := srl(a1, count) -// r2 := srl(a2, count) -// r3 := srl(a3, count) -// -// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 31) - return _mm_setzero_si128(); - - int32x4_t vc = vdupq_n_s32(-(int32_t) c); - return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); -} - -// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits -// while shifting in zeros. -// -// r0 := srl(a0, count) -// r1 := srl(a1, count) -// -// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) -{ - uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); - if (c > 63) - return _mm_setzero_si128(); - - int64x2_t vc = vdupq_n_s64(-(int64_t) c); - return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); -} - -// NEON does not provide a version of this function. -// Creates a 16-bit mask from the most significant bits of the 16 signed or -// unsigned 8-bit integers in a and zero extends the upper bits. 
-// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx -FORCE_INLINE int _mm_movemask_epi8(__m128i a) -{ -#if defined(__aarch64__) - uint8x16_t input = vreinterpretq_u8_m128i(a); - const int8_t ALIGN_STRUCT(16) - xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0}; - const uint8x16_t mask_and = vdupq_n_u8(0x80); - const int8x16_t mask_shift = vld1q_s8(xr); - const uint8x16_t mask_result = - vshlq_u8(vandq_u8(input, mask_and), mask_shift); - uint8x8_t lo = vget_low_u8(mask_result); - uint8x8_t hi = vget_high_u8(mask_result); - - return vaddv_u8(lo) + (vaddv_u8(hi) << 8); -#else - // Use increasingly wide shifts+adds to collect the sign bits - // together. - // Since the widening shifts would be rather confusing to follow in little - // endian, everything will be illustrated in big endian order instead. This - // has a different result - the bits would actually be reversed on a big - // endian machine. - - // Starting input (only half the elements are shown): - // 89 ff 1d c0 00 10 99 33 - uint8x16_t input = vreinterpretq_u8_m128i(a); - - // Shift out everything but the sign bits with an unsigned shift right. - // - // Bytes of the vector:: - // 89 ff 1d c0 00 10 99 33 - // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) - // | | | | | | | | - // 01 01 00 01 00 00 01 00 - // - // Bits of first important lane(s): - // 10001001 (89) - // \______ - // | - // 00000001 (01) - uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); - - // Merge the even lanes together with a 16-bit unsigned shift right + add. - // 'xx' represents garbage data which will be ignored in the final result. - // In the important bytes, the add functions like a binary OR. 
- // - // 01 01 00 01 00 00 01 00 - // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) - // \| \| \| \| - // xx 03 xx 01 xx 00 xx 02 - // - // 00000001 00000001 (01 01) - // \_______ | - // \| - // xxxxxxxx xxxxxx11 (xx 03) - uint32x4_t paired16 = - vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); - - // Repeat with a wider 32-bit shift + add. - // xx 03 xx 01 xx 00 xx 02 - // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> - // 14)) - // \| \| - // xx xx xx 0d xx xx xx 02 - // - // 00000011 00000001 (03 01) - // \\_____ || - // '----.\|| - // xxxxxxxx xxxx1101 (xx 0d) - uint64x2_t paired32 = - vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); - - // Last, an even wider 64-bit shift + add to get our result in the low 8 bit - // lanes. xx xx xx 0d xx xx xx 02 - // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> - // 28)) - // \| - // xx xx xx xx xx xx xx d2 - // - // 00001101 00000010 (0d 02) - // \ \___ | | - // '---. \| | - // xxxxxxxx 11010010 (xx d2) - uint8x16_t paired64 = - vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); - - // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. - // xx xx xx xx xx xx xx d2 - // || return paired64[0] - // d2 - // Note: Little endian would return the correct value 4b (01001011) instead. - return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); -#endif -} - -// Copy the lower 64-bit integer in a to dst. -// -// dst[63:0] := a[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 -FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) -{ - return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); -} - -// Copy the 64-bit integer a to the lower element of dst, and zero the upper -// element. 
-// -// dst[63:0] := a[63:0] -// dst[127:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 -FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) -{ - return vreinterpretq_m128i_s64( - vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); -} - -// NEON does not provide this method -// Creates a 4-bit mask from the most significant bits of the four -// single-precision, floating-point values. -// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx -FORCE_INLINE int _mm_movemask_ps(__m128 a) -{ - uint32x4_t input = vreinterpretq_u32_m128(a); -#if defined(__aarch64__) - static const int32x4_t shift = {0, 1, 2, 3}; - uint32x4_t tmp = vshrq_n_u32(input, 31); - return vaddvq_u32(vshlq_u32(tmp, shift)); -#else - // Uses the exact same method as _mm_movemask_epi8, see that for details. - // Shift out everything but the sign bits with a 32-bit unsigned shift - // right. - uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); - // Merge the two pairs together with a 64-bit unsigned shift right + add. - uint8x16_t paired = - vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); - // Extract the result. - return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); -#endif -} - -// Compute the bitwise NOT of a and then AND with a 128-bit vector containing -// all 1's, and return 1 if the result is zero, otherwise return 0. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones -FORCE_INLINE int _mm_test_all_ones(__m128i a) -{ - return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == - ~(uint64_t) 0; -} - -// Compute the bitwise AND of 128 bits (representing integer data) in a and -// mask, and return 1 if the result is zero, otherwise return 0. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros -FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) -{ - int64x2_t a_and_mask = - vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); - return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 - : 1; -} - -/* Math operations */ - -// Subtracts the four single-precision, floating-point values of a and b. -// -// r0 := a0 - b0 -// r1 := a1 - b1 -// r2 := a2 - b2 -// r3 := a3 - b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_f32( - vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Subtract the lower single-precision (32-bit) floating-point element in b from -// the lower single-precision (32-bit) floating-point element in a, store the -// result in the lower element of dst, and copy the upper 3 packed elements from -// a to the upper elements of dst. -// -// dst[31:0] := a[31:0] - b[31:0] -// dst[127:32] := a[127:32] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss -FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_sub_ps(a, b)); -} - -// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, -// and store the results in dst. -// r0 := a0 - b0 -// r1 := a1 - b1 -FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s64( - vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); -} - -// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or -// unsigned 32-bit integers of a. 
-// -// r0 := a0 - b0 -// r1 := a1 - b1 -// r2 := a2 - b2 -// r3 := a3 - b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx -FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. -// -// dst[63:0] := a[63:0] - b[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64 -FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) -{ - return vreinterpret_m64_s64( - vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); -} - -// Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit -// integers of a and saturates.. -// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx -FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); -} - -// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit -// integers of a and saturates. -// -// r0 := UnsignedSaturate(a0 - b0) -// r1 := UnsignedSaturate(a1 - b1) -// ... -// r15 := UnsignedSaturate(a15 - b15) -// -// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) -FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); -} - -// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers -// of a and saturates. 
-// -// r0 := SignedSaturate(a0 - b0) -// r1 := SignedSaturate(a1 - b1) -// ... -// r15 := SignedSaturate(a15 - b15) -// -// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) -FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers -// of a and saturates. -// -// r0 := SignedSaturate(a0 - b0) -// r1 := SignedSaturate(a1 - b1) -// ... -// r7 := SignedSaturate(a7 - b7) -// -// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) -FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); -} - -// Negate packed 8-bit integers in a when the corresponding signed -// 8-bit integer in b is negative, and store the results in dst. -// Element in dst are zeroed out when the corresponding element -// in b is zero. -// -// for i in 0..15 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done -FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) -{ - int8x16_t a = vreinterpretq_s8_m128i(_a); - int8x16_t b = vreinterpretq_s8_m128i(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFF : 0 - uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); - - // (b == 0) ? 
0xFF : 0 -#if defined(__aarch64__) - int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); -#else - int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); -#endif - - // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a') - // based on ltMask - int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); - // res = masked & (~zeroMask) - int8x16_t res = vbicq_s8(masked, zeroMask); - - return vreinterpretq_m128i_s8(res); -} - -// Negate packed 16-bit integers in a when the corresponding signed -// 16-bit integer in b is negative, and store the results in dst. -// Element in dst are zeroed out when the corresponding element -// in b is zero. -// -// for i in 0..7 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done -FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) -{ - int16x8_t a = vreinterpretq_s16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFFFF : 0 - uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); - // (b == 0) ? 0xFFFF : 0 -#if defined(__aarch64__) - int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); -#else - int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); -#endif - - // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative - // 'a') based on ltMask - int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); - // res = masked & (~zeroMask) - int16x8_t res = vbicq_s16(masked, zeroMask); - return vreinterpretq_m128i_s16(res); -} - -// Negate packed 32-bit integers in a when the corresponding signed -// 32-bit integer in b is negative, and store the results in dst. -// Element in dst are zeroed out when the corresponding element -// in b is zero. 
-// -// for i in 0..3 -// if b[i] < 0 -// r[i] := -a[i] -// else if b[i] == 0 -// r[i] := 0 -// else -// r[i] := a[i] -// fi -// done -FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) -{ - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFFFFFFFF : 0 - uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); - - // (b == 0) ? 0xFFFFFFFF : 0 -#if defined(__aarch64__) - int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); -#else - int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); -#endif - - // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative - // 'a') based on ltMask - int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); - // res = masked & (~zeroMask) - int32x4_t res = vbicq_s32(masked, zeroMask); - return vreinterpretq_m128i_s32(res); -} - -// Negate packed 16-bit integers in a when the corresponding signed 16-bit -// integer in b is negative, and store the results in dst. Element in dst are -// zeroed out when the corresponding element in b is zero. -// -// FOR j := 0 to 3 -// i := j*16 -// IF b[i+15:i] < 0 -// dst[i+15:i] := -(a[i+15:i]) -// ELSE IF b[i+15:i] == 0 -// dst[i+15:i] := 0 -// ELSE -// dst[i+15:i] := a[i+15:i] -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 -FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) -{ - int16x4_t a = vreinterpret_s16_m64(_a); - int16x4_t b = vreinterpret_s16_m64(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFFFF : 0 - uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); - - // (b == 0) ? 
0xFFFF : 0 -#if defined(__aarch64__) - int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); -#else - int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); -#endif - - // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a') - // based on ltMask - int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); - // res = masked & (~zeroMask) - int16x4_t res = vbic_s16(masked, zeroMask); - - return vreinterpret_m64_s16(res); -} - -// Negate packed 32-bit integers in a when the corresponding signed 32-bit -// integer in b is negative, and store the results in dst. Element in dst are -// zeroed out when the corresponding element in b is zero. -// -// FOR j := 0 to 1 -// i := j*32 -// IF b[i+31:i] < 0 -// dst[i+31:i] := -(a[i+31:i]) -// ELSE IF b[i+31:i] == 0 -// dst[i+31:i] := 0 -// ELSE -// dst[i+31:i] := a[i+31:i] -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32 -FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) -{ - int32x2_t a = vreinterpret_s32_m64(_a); - int32x2_t b = vreinterpret_s32_m64(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFFFFFFFF : 0 - uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); - - // (b == 0) ? 0xFFFFFFFF : 0 -#if defined(__aarch64__) - int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); -#else - int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); -#endif - - // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a') - // based on ltMask - int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); - // res = masked & (~zeroMask) - int32x2_t res = vbic_s32(masked, zeroMask); - - return vreinterpret_m64_s32(res); -} - -// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer -// in b is negative, and store the results in dst. Element in dst are zeroed out -// when the corresponding element in b is zero. 
-// -// FOR j := 0 to 7 -// i := j*8 -// IF b[i+7:i] < 0 -// dst[i+7:i] := -(a[i+7:i]) -// ELSE IF b[i+7:i] == 0 -// dst[i+7:i] := 0 -// ELSE -// dst[i+7:i] := a[i+7:i] -// FI -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 -FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) -{ - int8x8_t a = vreinterpret_s8_m64(_a); - int8x8_t b = vreinterpret_s8_m64(_b); - - // signed shift right: faster than vclt - // (b < 0) ? 0xFF : 0 - uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); - - // (b == 0) ? 0xFF : 0 -#if defined(__aarch64__) - int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); -#else - int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); -#endif - - // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a') - // based on ltMask - int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); - // res = masked & (~zeroMask) - int8x8_t res = vbic_s8(masked, zeroMask); - - return vreinterpret_m64_s8(res); -} - -// Average packed unsigned 16-bit integers in a and b, and store the results in -// dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 -FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) -{ - return vreinterpret_m64_u16( - vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); -} - -// Average packed unsigned 8-bit integers in a and b, and store the results in -// dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 -FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) -{ - return vreinterpret_m64_u8( - vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); -} - -// Average packed unsigned 8-bit integers in a and b, and store the results in -// dst. 
-// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb -#define _m_pavgb(a, b) _mm_avg_pu8(a, b) - -// Average packed unsigned 16-bit integers in a and b, and store the results in -// dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw -#define _m_pavgw(a, b) _mm_avg_pu16(a, b) - -// Computes the average of the 16 unsigned 8-bit integers in a and the 16 -// unsigned 8-bit integers in b and rounds. -// -// r0 := (a0 + b0) / 2 -// r1 := (a1 + b1) / 2 -// ... -// r15 := (a15 + b15) / 2 -// -// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); -} - -// Computes the average of the 8 unsigned 16-bit integers in a and the 8 -// unsigned 16-bit integers in b and rounds. -// -// r0 := (a0 + b0) / 2 -// r1 := (a1 + b1) / 2 -// ... -// r7 := (a7 + b7) / 2 -// -// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx -FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) -{ - return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), - vreinterpretq_u16_m128i(b)); -} - -// Adds the four single-precision, floating-point values of a and b. -// -// r0 := a0 + b0 -// r1 := a1 + b1 -// r2 := a2 + b2 -// r3 := a3 + b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx -FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_f32( - vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Add packed double-precision (64-bit) floating-point elements in a and b, and -// store the results in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd -FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); -#else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2]; - c[0] = da[0] + db[0]; - c[1] = da[1] + db[1]; - return vld1q_f32((float32_t *) c); -#endif -} - -// Add 64-bit integers a and b, and store the result in dst. -// -// dst[63:0] := a[63:0] + b[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64 -FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) -{ - return vreinterpret_m64_s64( - vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); -} - -// adds the scalar single-precision floating point values of a and b. -// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx -FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) -{ - float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); - float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); - // the upper values in the result must be the remnants of . - return vreinterpretq_m128_f32(vaddq_f32(a, value)); -} - -// Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or -// unsigned 32-bit integers in b. -// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s64( - vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); -} - -// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or -// unsigned 32-bit integers in b. 
-// -// r0 := a0 + b0 -// r1 := a1 + b1 -// r2 := a2 + b2 -// r3 := a3 + b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or -// unsigned 16-bit integers in b. -// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or -// unsigned 8-bit integers in b. -// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90) -FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b -// and saturates. -// -// r0 := SignedSaturate(a0 + b0) -// r1 := SignedSaturate(a1 + b1) -// ... -// r7 := SignedSaturate(a7 + b7) -// -// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx -FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Add packed signed 8-bit integers in a and b using saturation, and store the -// results in dst. 
-// -// FOR j := 0 to 15 -// i := j*8 -// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 -FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in -// b and saturates.. -// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx -FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); -} - -// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or -// unsigned 16-bit integers from b. -// -// r0 := (a0 * b0)[15:0] -// r1 := (a1 * b1)[15:0] -// ... -// r7 := (a7 * b7)[15:0] -// -// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or -// unsigned 32-bit integers from b. -// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Multiply the packed unsigned 16-bit integers in a and b, producing -// intermediate 32-bit integers, and store the high 16 bits of the intermediate -// integers in dst. 
-// -// FOR j := 0 to 3 -// i := j*16 -// tmp[31:0] := a[i+15:i] * b[i+15:i] -// dst[i+15:i] := tmp[31:16] -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw -#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) - -// Multiplies the four single-precision, floating-point values of a and b. -// -// r0 := a0 * b0 -// r1 := a1 * b1 -// r2 := a2 * b2 -// r3 := a3 * b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx -FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_f32( - vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Multiply packed double-precision (64-bit) floating-point elements in a and b, -// and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd -FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); -#else - double *da = (double *) &a; - double *db = (double *) &b; - double c[2]; - c[0] = da[0] * db[0]; - c[1] = da[1] * db[1]; - return vld1q_f32((float32_t *) c); -#endif -} - -// Multiply the lower single-precision (32-bit) floating-point element in a and -// b, store the result in the lower element of dst, and copy the upper 3 packed -// elements from a to the upper elements of dst. -// -// dst[31:0] := a[31:0] * b[31:0] -// dst[127:32] := a[127:32] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss -FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_mul_ps(a, b)); -} - -// Multiply the low unsigned 32-bit integers from each packed 64-bit element in -// a and b, and store the unsigned 64-bit results in dst. 
-// -// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) -// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) -FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) -{ - // vmull_u32 upcasts instead of masking, so we downcast. - uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); - uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); - return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); -} - -// Multiply the low unsigned 32-bit integers from a and b, and store the -// unsigned 64-bit result in dst. -// -// dst[63:0] := a[31:0] * b[31:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 -FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) -{ - return vreinterpret_m64_u64(vget_low_u64( - vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); -} - -// Multiply the low signed 32-bit integers from each packed 64-bit element in -// a and b, and store the signed 64-bit results in dst. -// -// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 -// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 -FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) -{ - // vmull_s32 upcasts instead of masking, so we downcast. - int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); - int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); - return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); -} - -// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit -// integers from b. 
-// -// r0 := (a0 * b0) + (a1 * b1) -// r1 := (a2 * b2) + (a3 * b3) -// r2 := (a4 * b4) + (a5 * b5) -// r3 := (a6 * b6) + (a7 * b7) -// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx -FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) -{ - int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), - vget_low_s16(vreinterpretq_s16_m128i(b))); - int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), - vget_high_s16(vreinterpretq_s16_m128i(b))); - - int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); - int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); - - return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); -} - -// Multiply packed signed 16-bit integers in a and b, producing intermediate -// signed 32-bit integers. Shift right by 15 bits while rounding up, and store -// the packed 16-bit integers in dst. -// -// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) -// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) -// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) -// ... -// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) -FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) -{ - // Has issues due to saturation - // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); - - // Multiply - int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), - vget_low_s16(vreinterpretq_s16_m128i(b))); - int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), - vget_high_s16(vreinterpretq_s16_m128i(b))); - - // Rounding narrowing shift right - // narrow = (int16_t)((mul + 16384) >> 15); - int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); - int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); - - // Join together - return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); -} - -// Vertically multiply each unsigned 8-bit integer from a with the corresponding -// signed 8-bit integer from b, producing intermediate signed 16-bit integers. 
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers, -// and pack the saturated results in dst. -// -// FOR j := 0 to 7 -// i := j*16 -// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + -// a[i+7:i]*b[i+7:i] ) -// ENDFOR -FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) -{ -#if defined(__aarch64__) - uint8x16_t a = vreinterpretq_u8_m128i(_a); - int8x16_t b = vreinterpretq_s8_m128i(_b); - int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), - vmovl_s8(vget_low_s8(b))); - int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), - vmovl_s8(vget_high_s8(b))); - return vreinterpretq_m128i_s16( - vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); -#else - // This would be much simpler if x86 would choose to zero extend OR sign - // extend, not both. This could probably be optimized better. - uint16x8_t a = vreinterpretq_u16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - - // Zero extend a - int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); - int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); - - // Sign extend by shifting left then shifting right. - int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); - int16x8_t b_odd = vshrq_n_s16(b, 8); - - // multiply - int16x8_t prod1 = vmulq_s16(a_even, b_even); - int16x8_t prod2 = vmulq_s16(a_odd, b_odd); - - // saturated add - return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); -#endif -} - -// Computes the fused multiple add product of 32-bit floating point numbers. -// -// Return Value -// Multiplies A and B, and adds C to the temporary result before returning it. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd -FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), - vreinterpretq_f32_m128(b), - vreinterpretq_f32_m128(a))); -#else - return _mm_add_ps(_mm_mul_ps(a, b), c); -#endif -} - -// Alternatively add and subtract packed single-precision (32-bit) -// floating-point elements in a to/from packed elements in b, and store the -// results in dst. -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps -FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) -{ - __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; - return _mm_fmadd_ps(b, mask, a); -} - -// Compute the absolute differences of packed unsigned 8-bit integers in a and -// b, then horizontally sum each consecutive 8 differences to produce two -// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low -// 16 bits of 64-bit elements in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 -FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) -{ - uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); - uint16_t r0 = t[0] + t[1] + t[2] + t[3]; - uint16_t r4 = t[4] + t[5] + t[6] + t[7]; - uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0); - return (__m128i) vsetq_lane_u16(r4, r, 4); -} - -// Compute the absolute differences of packed unsigned 8-bit integers in a and -// b, then horizontally sum each consecutive 8 differences to produce four -// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low -// 16 bits of dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 -FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) -{ - uint16x4_t t = - vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); - uint16_t r0 = t[0] + t[1] + t[2] + t[3]; - return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0)); -} - -// Compute the absolute differences of packed unsigned 8-bit integers in a and -// b, then horizontally sum each consecutive 8 differences to produce four -// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low -// 16 bits of dst. -// -// FOR j := 0 to 7 -// i := j*8 -// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) -// ENDFOR -// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + -// tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw -#define _m_psadbw(a, b) _mm_sad_pu8(a, b) - -// Divides the four single-precision, floating-point values of a and b. -// -// r0 := a0 / b0 -// r1 := a1 / b1 -// r2 := a2 / b2 -// r3 := a3 / b3 -// -// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx -FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#else - float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b)); - float32x4_t recip1 = - vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b))); - return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1)); -#endif -} - -// Divides the scalar single-precision floating point value of a by b. 
-// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx -FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) -{ - float32_t value = - vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); - return vreinterpretq_m128_f32( - vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); -} - -// Compute the approximate reciprocal of packed single-precision (32-bit) -// floating-point elements in a, and store the results in dst. The maximum -// relative error for this approximation is less than 1.5*2^-12. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps -FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vdivq_f32(vdupq_n_f32(1.0f), vreinterpretq_f32_m128(in))); -#else - float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); - recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); - return vreinterpretq_m128_f32(recip); -#endif -} - -// Compute the approximate reciprocal of the lower single-precision (32-bit) -// floating-point element in a, store the result in the lower element of dst, -// and copy the upper 3 packed elements from a to the upper elements of dst. The -// maximum relative error for this approximation is less than 1.5*2^-12. -// -// dst[31:0] := (1.0 / a[31:0]) -// dst[127:32] := a[127:32] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss -FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) -{ - return _mm_move_ss(a, _mm_rcp_ps(a)); -} - -// Computes the approximations of square roots of the four single-precision, -// floating-point values of a. First computes reciprocal square roots and then -// reciprocals of the four values. 
-// -// r0 := sqrt(a0) -// r1 := sqrt(a1) -// r2 := sqrt(a2) -// r3 := sqrt(a3) -// -// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); -#else - float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); - float32x4_t sq = vrecpeq_f32(recipsq); - // ??? use step versions of both sqrt and recip for better accuracy? - return vreinterpretq_m128_f32(sq); -#endif -} - -// Computes the approximation of the square root of the scalar single-precision -// floating point value of in. -// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx -FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) -{ - float32_t value = - vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); - return vreinterpretq_m128_f32( - vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); -} - -// Computes the approximations of the reciprocal square roots of the four -// single-precision floating point values of in. -// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx -FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) -{ - return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in))); -} - -// Compute the approximate reciprocal square root of the lower single-precision -// (32-bit) floating-point element in a, store the result in the lower element -// of dst, and copy the upper 3 packed elements from a to the upper elements of -// dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss -FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) -{ - return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); -} - -// Compare packed signed 16-bit integers in a and b, and store packed maximum -// values in dst. 
-// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 -FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) -{ - return vreinterpret_m64_s16( - vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); -} - -// Compare packed signed 16-bit integers in a and b, and store packed maximum -// values in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 -#define _m_pmaxsw(a, b) _mm_max_pi16(a, b) - -// Computes the maximums of the four single-precision, floating-point values of -// a and b. -// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx -FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) -{ -#if SSE2NEON_PRECISE_MINMAX - float32x4_t _a = vreinterpretq_f32_m128(a); - float32x4_t _b = vreinterpretq_f32_m128(b); - return vbslq_f32(vcltq_f32(_b, _a), _a, _b); -#else - return vreinterpretq_m128_f32( - vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#endif -} - -// Compare packed unsigned 8-bit integers in a and b, and store packed maximum -// values in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 -FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) -{ - return vreinterpret_m64_u8( - vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); -} - -// Compare packed unsigned 8-bit integers in a and b, and store packed maximum -// values in dst. 
-// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 -#define _m_pmaxub(a, b) _mm_max_pu8(a, b) - -// Compare packed signed 16-bit integers in a and b, and store packed minimum -// values in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 -FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) -{ - return vreinterpret_m64_s16( - vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); -} - -// Compare packed signed 16-bit integers in a and b, and store packed minimum -// values in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 -#define _m_pminsw(a, b) _mm_min_pi16(a, b) - -// Computes the minima of the four single-precision, floating-point values of a -// and b. -// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) -{ -#if SSE2NEON_PRECISE_MINMAX - float32x4_t _a = vreinterpretq_f32_m128(a); - float32x4_t _b = vreinterpretq_f32_m128(b); - return vbslq_f32(vcltq_f32(_a, _b), _a, _b); -#else - return vreinterpretq_m128_f32( - vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#endif -} - -// Compare packed unsigned 8-bit integers in a and b, and store packed minimum -// values in dst. 
-// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 -FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) -{ - return vreinterpret_m64_u8( - vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); -} - -// Compare packed unsigned 8-bit integers in a and b, and store packed minimum -// values in dst. -// -// FOR j := 0 to 7 -// i := j*8 -// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 -#define _m_pminub(a, b) _mm_min_pu8(a, b) - -// Computes the maximum of the two lower scalar single-precision floating point -// values of a and b. -// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx -FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) -{ - float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); - return vreinterpretq_m128_f32( - vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); -} - -// Computes the minimum of the two lower scalar single-precision floating point -// values of a and b. -// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx -FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) -{ - float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); - return vreinterpretq_m128_f32( - vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); -} - -// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the -// 16 unsigned 8-bit integers from b. -// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx -FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); -} - -// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the -// 16 unsigned 8-bit integers from b. 
-// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx -FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); -} - -// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 -// signed 16-bit integers from b. -// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx -FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Compare packed signed 8-bit integers in a and b, and store packed maximum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 -FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Compare packed unsigned 16-bit integers in a and b, and store packed maximum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 -FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); -} - -// Compare packed signed 8-bit integers in a and b, and store packed minimum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 -FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Compare packed unsigned 16-bit integers in a and b, and store packed minimum -// values in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 -FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); -} - -// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 -// signed 16-bit integers from b. -// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx -FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// epi versions of min/max -// Computes the pariwise maximums of the four signed 32-bit integer values of a -// and b. -// -// A 128-bit parameter that can be defined with the following equations: -// r0 := (a0 > b0) ? a0 : b0 -// r1 := (a1 > b1) ? a1 : b1 -// r2 := (a2 > b2) ? a2 : b2 -// r3 := (a3 > b3) ? a3 : b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx -FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Computes the pariwise minima of the four signed 32-bit integer values of a -// and b. -// -// A 128-bit parameter that can be defined with the following equations: -// r0 := (a0 < b0) ? a0 : b0 -// r1 := (a1 < b1) ? a1 : b1 -// r2 := (a2 < b2) ? a2 : b2 -// r3 := (a3 < b3) ? a3 : b3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx -FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s32( - vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compare packed unsigned 32-bit integers in a and b, and store packed maximum -// values in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 -FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32( - vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); -} - -// Compare packed unsigned 32-bit integers in a and b, and store packed minimum -// values in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 -FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32( - vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); -} - -// Multiply the packed unsigned 16-bit integers in a and b, producing -// intermediate 32-bit integers, and store the high 16 bits of the intermediate -// integers in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 -FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) -{ - return vreinterpret_m64_u16(vshrn_n_u32( - vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); -} - -// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit -// integers from b. -// -// r0 := (a0 * b0)[31:16] -// r1 := (a1 * b1)[31:16] -// ... 
-// r7 := (a7 * b7)[31:16] -// -// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) -{ - /* FIXME: issue with large values because of result saturation */ - // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), - // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return - // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); - int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); - int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ - int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); - int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ - uint16x8x2_t r = - vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); - return vreinterpretq_m128i_u16(r.val[1]); -} - -// Multiply the packed unsigned 16-bit integers in a and b, producing -// intermediate 32-bit integers, and store the high 16 bits of the intermediate -// integers in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16 -FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) -{ - uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); - uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); - uint32x4_t ab3210 = vmull_u16(a3210, b3210); -#if defined(__aarch64__) - uint32x4_t ab7654 = - vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); - uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), - vreinterpretq_u16_u32(ab7654)); - return vreinterpretq_m128i_u16(r); -#else - uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); - uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); - uint32x4_t ab7654 = vmull_u16(a7654, b7654); - uint16x8x2_t r = - vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); - return vreinterpretq_m128i_u16(r.val[1]); -#endif -} - -// Computes pairwise add of each argument as single-precision, floating-point -// values a and b. -// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx -FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#else - float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); - float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_f32( - vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); -#endif -} - -// Computes pairwise add of each argument as a 16-bit signed or unsigned integer -// values a and b. 
-FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) -{ - int16x8_t a = vreinterpretq_s16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); -#if defined(__aarch64__) - return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); -#else - return vreinterpretq_m128i_s16( - vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), - vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); -#endif -} - -// Horizontally substract adjacent pairs of single-precision (32-bit) -// floating-point elements in a and b, and pack the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps -FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32(vsubq_f32( - vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)), - vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)))); -#else - float32x4x2_t c = - vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)); - return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); -#endif -} - -// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the -// signed 16-bit results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16 -FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) -{ - return vreinterpret_m64_s16( - vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); -} - -// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the -// signed 32-bit results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32 -FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) -{ - return vreinterpret_m64_s32( - vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); -} - -// Computes pairwise difference of each argument as a 16-bit signed or unsigned -// integer values a and b. 
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) -{ - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - // Interleave using vshrn/vmovn - // [a0|a2|a4|a6|b0|b2|b4|b6] - // [a1|a3|a5|a7|b1|b3|b5|b7] - int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); - int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); - // Subtract - return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357)); -} - -// Computes saturated pairwise sub of each argument as a 16-bit signed -// integer values a and b. -FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) -{ -#if defined(__aarch64__) - int16x8_t a = vreinterpretq_s16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - return vreinterpretq_s64_s16( - vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); -#else - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - // Interleave using vshrn/vmovn - // [a0|a2|a4|a6|b0|b2|b4|b6] - // [a1|a3|a5|a7|b1|b3|b5|b7] - int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); - int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); - // Saturated add - return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); -#endif -} - -// Computes saturated pairwise difference of each argument as a 16-bit signed -// integer values a and b. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 -FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) -{ -#if defined(__aarch64__) - int16x8_t a = vreinterpretq_s16_m128i(_a); - int16x8_t b = vreinterpretq_s16_m128i(_b); - return vreinterpretq_s64_s16( - vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); -#else - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - // Interleave using vshrn/vmovn - // [a0|a2|a4|a6|b0|b2|b4|b6] - // [a1|a3|a5|a7|b1|b3|b5|b7] - int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); - int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); - // Saturated subtract - return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357)); -#endif -} - -// Computes pairwise add of each argument as a 32-bit signed or unsigned integer -// values a and b. -FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) -{ - int32x4_t a = vreinterpretq_s32_m128i(_a); - int32x4_t b = vreinterpretq_s32_m128i(_b); - return vreinterpretq_m128i_s32( - vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), - vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); -} - -// Computes pairwise difference of each argument as a 32-bit signed or unsigned -// integer values a and b. -FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) -{ - int64x2_t a = vreinterpretq_s64_m128i(_a); - int64x2_t b = vreinterpretq_s64_m128i(_b); - // Interleave using vshrn/vmovn - // [a0|a2|b0|b2] - // [a1|a2|b1|b3] - int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b)); - int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32)); - // Subtract - return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13)); -} - -// Kahan summation for accurate summation of floating-point numbers. 
-// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html -FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y) -{ - y -= *c; - float t = *sum + y; - *c = (t - *sum) - y; - *sum = t; -} - -// Conditionally multiply the packed single-precision (32-bit) floating-point -// elements in a and b using the high 4 bits in imm8, sum the four products, -// and conditionally store the sum in dst using the low 4 bits of imm. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps -FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) -{ -#if defined(__aarch64__) - /* shortcuts */ - if (imm == 0xFF) { - return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); - } - if (imm == 0x7F) { - float32x4_t m = _mm_mul_ps(a, b); - m[3] = 0; - return _mm_set1_ps(vaddvq_f32(m)); - } -#endif - - float s = 0, c = 0; - float32x4_t f32a = vreinterpretq_f32_m128(a); - float32x4_t f32b = vreinterpretq_f32_m128(b); - - /* To improve the accuracy of floating-point summation, Kahan algorithm - * is used for each operation. - */ - if (imm & (1 << 4)) - sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); - if (imm & (1 << 5)) - sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); - if (imm & (1 << 6)) - sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); - if (imm & (1 << 7)) - sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); - s += c; - - float32x4_t res = { - (imm & 0x1) ? s : 0, - (imm & 0x2) ? s : 0, - (imm & 0x4) ? s : 0, - (imm & 0x8) ? 
s : 0, - }; - return vreinterpretq_m128_f32(res); -} - -/* Compare operations */ - -// Compares for less than -// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32( - vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Compares for less than -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) -FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmplt_ps(a, b)); -} - -// Compares for greater than. -// -// r0 := (a0 > b0) ? 0xffffffff : 0x0 -// r1 := (a1 > b1) ? 0xffffffff : 0x0 -// r2 := (a2 > b2) ? 0xffffffff : 0x0 -// r3 := (a3 > b3) ? 0xffffffff : 0x0 -// -// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32( - vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Compares for greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) -FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); -} - -// Compares for greater than or equal. -// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32( - vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Compares for greater than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) -FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpge_ps(a, b)); -} - -// Compares for less than or equal. -// -// r0 := (a0 <= b0) ? 0xffffffff : 0x0 -// r1 := (a1 <= b1) ? 0xffffffff : 0x0 -// r2 := (a2 <= b2) ? 0xffffffff : 0x0 -// r3 := (a3 <= b3) ? 
0xffffffff : 0x0 -// -// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32( - vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Compares for less than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) -FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmple_ps(a, b)); -} - -// Compares for equality. -// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32( - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -} - -// Compares for equality. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) -FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); -} - -// Compares for inequality. -// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) -{ - return vreinterpretq_m128_u32(vmvnq_u32( - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); -} - -// Compares for inequality. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) -FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); -} - -// Compares for not greater than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) -{ - return _mm_cmplt_ps(a, b); -} - -// Compares for not greater than or equal. 
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) -{ - return _mm_cmplt_ss(a, b); -} - -// Compares for not greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) -FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) -{ - return _mm_cmple_ps(a, b); -} - -// Compares for not greater than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) -FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) -{ - return _mm_cmple_ss(a, b); -} - -// Compares for not less than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) -{ - return _mm_cmpgt_ps(a, b); -} - -// Compares for not less than or equal. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) -{ - return _mm_cmpgt_ss(a, b); -} - -// Compares for not less than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) -{ - return _mm_cmpge_ps(a, b); -} - -// Compares for not less than. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) -FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) -{ - return _mm_cmpge_ss(a, b); -} - -// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or -// unsigned 8-bit integers in b for equality. 
-// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx -FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or -// unsigned 16-bit integers in b for equality. -// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Compare packed 32-bit integers in a and b for equality, and store the results -// in dst -FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32( - vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compare packed 64-bit integers in a and b for equality, and store the results -// in dst -FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_u64( - vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); -#else - // ARMv7 lacks vceqq_u64 - // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) - uint32x4_t cmp = - vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); - uint32x4_t swapped = vrev64q_u32(cmp); - return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); -#endif -} - -// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers -// in b for lesser than. -// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx -FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers -// in b for greater than. -// -// r0 := (a0 > b0) ? 0xff : 0x0 -// r1 := (a1 > b1) ? 0xff : 0x0 -// ... 
-// r15 := (a15 > b15) ? 0xff : 0x0 -// -// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -} - -// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers -// in b for less than. -// -// r0 := (a0 < b0) ? 0xffff : 0x0 -// r1 := (a1 < b1) ? 0xffff : 0x0 -// ... -// r7 := (a7 < b7) ? 0xffff : 0x0 -// -// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - -// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers -// in b for greater than. -// -// r0 := (a0 > b0) ? 0xffff : 0x0 -// r1 := (a1 > b1) ? 0xffff : 0x0 -// ... -// r7 := (a7 > b7) ? 0xffff : 0x0 -// -// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -} - - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers -// in b for less than. -// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32( - vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers -// in b for greater than. 
-// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u32( - vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -} - -// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers -// in b for greater than. -FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_u64( - vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); -#else - // ARMv7 lacks vcgtq_s64. - // This is based off of Clang's SSE2 polyfill: - // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi)) - - // Mask the sign bit out since we need a signed AND an unsigned comparison - // and it is ugly to try and split them. - int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull)); - int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask); - int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask); - // Check if a > b - int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask)); - // Copy upper mask to lower mask - // a_hi > b_hi - int64x2_t gt_hi = vshrq_n_s64(greater, 63); - // Copy lower mask to upper mask - // a_lo > b_lo - int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32); - // Compare for equality - int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask)); - // Copy upper mask to lower mask - // a_hi == b_hi - int64x2_t eq_hi = vshrq_n_s64(equal, 63); - // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi) - int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi)); - return vreinterpretq_m128i_s64(ret); -#endif -} - -// Compares the four 32-bit floats in a and b to check if any values are NaN. -// Ordered compare between each value returns true for "orderable" and false for -// "not orderable" (NaN). 
-// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see -// also: -// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean -// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics -FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) -{ - // Note: NEON does not have ordered compare builtin - // Need to compare a eq a and b eq b to check for NaN - // Do AND of results to get final - uint32x4_t ceqaa = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t ceqbb = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); -} - -// Compares for ordered. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) -FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpord_ps(a, b)); -} - -// Compares for unordered. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) -FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) -{ - uint32x4_t f32a = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t f32b = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); -} - -// Compares for unordered. -// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) -FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) -{ - return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); -} - -// Compares the lower single-precision floating point scalar values of a and b -// using a less than operation. : -// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important -// note!! The documentation on MSDN is incorrect! If either of the values is a -// NAN the docs say you will get a one, but in fact, it will return a zero!! 
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) -{ - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_lt_b = - vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b -// using a greater than operation. : -// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx -FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) -{ - // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_gt_b = - vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b -// using a less than or equal operation. : -// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx -FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) -{ - // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_le_b = - vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 
1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b -// using a greater than or equal operation. : -// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx -FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) -{ - // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_ge_b = - vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b -// using an equality operation. : -// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx -FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) -{ - // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_eq_b = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); - return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0; -} - -// Compares the lower single-precision floating point scalar values of a and b -// using an inequality operation. 
: -// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx -FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) -{ - // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), - // vreinterpretq_f32_m128(b)), 0); - uint32x4_t a_not_nan = - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); - uint32x4_t b_not_nan = - vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_neq_b = vmvnq_u32( - vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); - return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0; -} - -// according to the documentation, these intrinsics behave the same as the -// non-'u' versions. We'll just alias them here. -#define _mm_ucomilt_ss _mm_comilt_ss -#define _mm_ucomile_ss _mm_comile_ss -#define _mm_ucomigt_ss _mm_comigt_ss -#define _mm_ucomige_ss _mm_comige_ss -#define _mm_ucomieq_ss _mm_comieq_ss -#define _mm_ucomineq_ss _mm_comineq_ss - -/* Conversions */ - -// Convert packed signed 32-bit integers in b to packed single-precision -// (32-bit) floating-point elements, store the results in the lower 2 elements -// of dst, and copy the upper 2 packed elements from a to the upper elements of -// dst. -// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -// dst[95:64] := a[95:64] -// dst[127:96] := a[127:96] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps -FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) -{ - return vreinterpretq_m128_f32( - vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), - vget_high_f32(vreinterpretq_f32_m128(a)))); -} - -// Convert the signed 32-bit integer b to a single-precision (32-bit) -// floating-point element, store the result in the lower element of dst, and -// copy the upper 3 packed elements from a to the upper elements of dst. 
-// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:32] := a[127:32] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss -FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) -{ - return vreinterpretq_m128_f32( - vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); -} - -// Convert the signed 32-bit integer b to a single-precision (32-bit) -// floating-point element, store the result in the lower element of dst, and -// copy the upper 3 packed elements from a to the upper elements of dst. -// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:32] := a[127:32] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss -#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) - -// Convert the signed 64-bit integer b to a single-precision (32-bit) -// floating-point element, store the result in the lower element of dst, and -// copy the upper 3 packed elements from a to the upper elements of dst. -// -// dst[31:0] := Convert_Int64_To_FP32(b[63:0]) -// dst[127:32] := a[127:32] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss -FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) -{ - return vreinterpretq_m128_f32( - vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); -} - -// Convert the lower single-precision (32-bit) floating-point element in a to a -// 32-bit integer, and store the result in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si -FORCE_INLINE int _mm_cvt_ss2si(__m128 a) -{ -#if defined(__aarch64__) - return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0); -#else - float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - float32_t diff = data - floor(data); - if (diff > 0.5) - return (int32_t) ceil(data); - if (diff == 0.5) { - int32_t f = (int32_t) floor(data); - int32_t c = (int32_t) ceil(data); - return c & 1 ? 
f : c; - } - return (int32_t) floor(data); -#endif -} - -// Convert packed 16-bit integers in a to packed single-precision (32-bit) -// floating-point elements, and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// m := j*32 -// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps -FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) -{ - return vreinterpretq_m128_f32( - vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); -} - -// Convert packed 32-bit integers in b to packed single-precision (32-bit) -// floating-point elements, store the results in the lower 2 elements of dst, -// and copy the upper 2 packed elements from a to the upper elements of dst. -// -// dst[31:0] := Convert_Int32_To_FP32(b[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(b[63:32]) -// dst[95:64] := a[95:64] -// dst[127:96] := a[127:96] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps -FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) -{ - return vreinterpretq_m128_f32( - vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), - vget_high_f32(vreinterpretq_f32_m128(a)))); -} - -// Convert packed signed 32-bit integers in a to packed single-precision -// (32-bit) floating-point elements, store the results in the lower 2 elements -// of dst, then covert the packed signed 32-bit integers in b to -// single-precision (32-bit) floating-point element, and store the results in -// the upper 2 elements of dst. 
-// -// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) -// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) -// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) -// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps -FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) -{ - return vreinterpretq_m128_f32(vcvtq_f32_s32( - vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); -} - -// Convert the lower packed 8-bit integers in a to packed single-precision -// (32-bit) floating-point elements, and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*8 -// m := j*32 -// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps -FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) -{ - return vreinterpretq_m128_f32(vcvtq_f32_s32( - vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); -} - -// Convert packed unsigned 16-bit integers in a to packed single-precision -// (32-bit) floating-point elements, and store the results in dst. -// -// FOR j := 0 to 3 -// i := j*16 -// m := j*32 -// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps -FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) -{ - return vreinterpretq_m128_f32( - vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); -} - -// Convert the lower packed unsigned 8-bit integers in a to packed -// single-precision (32-bit) floating-point elements, and store the results in -// dst. 
-// -// FOR j := 0 to 3 -// i := j*8 -// m := j*32 -// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps -FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) -{ - return vreinterpretq_m128_f32(vcvtq_f32_u32( - vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); -} - -// Converts the four single-precision, floating-point values of a to signed -// 32-bit integer values using truncate. -// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx -FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) -{ - return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); -} - -// Convert the lower double-precision (64-bit) floating-point element in a to a -// 64-bit integer with truncation, and store the result in dst. -// -// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 -FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) -{ -#if defined(__aarch64__) - return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); -#else - double ret = *((double *) &a); - return (int64_t) ret; -#endif -} - -// Convert the lower double-precision (64-bit) floating-point element in a to a -// 64-bit integer with truncation, and store the result in dst. -// -// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x -#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) - -// Converts the four signed 32-bit integer values of a to single-precision, -// floating-point values -// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx -FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) -{ - return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); -} - -// Converts the four unsigned 8-bit integers in the lower 16 bits to four -// unsigned 32-bit integers. 
-FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) -{ - uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - return vreinterpretq_m128i_u16(u16x8); -} - -// Converts the four unsigned 8-bit integers in the lower 32 bits to four -// unsigned 32-bit integers. -// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx -FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) -{ - uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ - return vreinterpretq_m128i_u32(u32x4); -} - -// Converts the two unsigned 8-bit integers in the lower 16 bits to two -// unsigned 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) -{ - uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ - uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_u64(u64x2); -} - -// Converts the four unsigned 8-bit integers in the lower 16 bits to four -// unsigned 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) -{ - int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - return vreinterpretq_m128i_s16(s16x8); -} - -// Converts the four unsigned 8-bit integers in the lower 32 bits to four -// unsigned 32-bit integers. 
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) -{ - int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ - return vreinterpretq_m128i_s32(s32x4); -} - -// Converts the two signed 8-bit integers in the lower 32 bits to four -// signed 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) -{ - int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ - int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_s64(s64x2); -} - -// Converts the four signed 16-bit integers in the lower 64 bits to four signed -// 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) -{ - return vreinterpretq_m128i_s32( - vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); -} - -// Converts the two signed 16-bit integers in the lower 32 bits two signed -// 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) -{ - int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ - int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_s64(s64x2); -} - -// Converts the four unsigned 16-bit integers in the lower 64 bits to four -// unsigned 32-bit integers. -FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) -{ - return vreinterpretq_m128i_u32( - vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); -} - -// Converts the two unsigned 16-bit integers in the lower 32 bits to two -// unsigned 64-bit integers. 
-FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) -{ - uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ - uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ - return vreinterpretq_m128i_u64(u64x2); -} - -// Converts the two unsigned 32-bit integers in the lower 64 bits to two -// unsigned 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) -{ - return vreinterpretq_m128i_u64( - vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); -} - -// Converts the two signed 32-bit integers in the lower 64 bits to two signed -// 64-bit integers. -FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) -{ - return vreinterpretq_m128i_s64( - vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); -} - -// Converts the four single-precision, floating-point values of a to signed -// 32-bit integer values. -// -// r0 := (int) a0 -// r1 := (int) a1 -// r2 := (int) a2 -// r3 := (int) a3 -// -// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx -// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A -// does not support! It is supported on ARMv8-A however. 
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); -#else - uint32x4_t signmask = vdupq_n_u32(0x80000000); - float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), - vdupq_n_f32(0.5f)); /* +/- 0.5 */ - int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( - vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ - int32x4_t r_trunc = - vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ - int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( - vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ - int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), - vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ - float32x4_t delta = vsubq_f32( - vreinterpretq_f32_m128(a), - vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ - uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ - return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); -#endif -} - -// Copy the lower 32-bit integer in a to dst. -// -// dst[31:0] := a[31:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 -FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) -{ - return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); -} - -// Copy the lower 64-bit integer in a to dst. -// -// dst[63:0] := a[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 -FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) -{ - return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); -} - -// Copy the lower 64-bit integer in a to dst. -// -// dst[63:0] := a[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x -#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) - -// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, -// zero extending the upper bits. 
-// -// r0 := a -// r1 := 0x0 -// r2 := 0x0 -// r3 := 0x0 -// -// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) -{ - return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); -} - -// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, -// zero extending the upper bits. -// -// r0 := a -// r1 := 0x0 -FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) -{ - return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); -} - -// Cast vector of type __m128 to type __m128d. This intrinsic is only used for -// compilation and does not generate any instructions, thus it has zero latency. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd -FORCE_INLINE __m128d _mm_castps_pd(__m128 a) -{ - return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); -} - -// Applies a type cast to reinterpret four 32-bit floating point values passed -// in as a 128-bit parameter as packed 32-bit integers. -// https://msdn.microsoft.com/en-us/library/bb514099.aspx -FORCE_INLINE __m128i _mm_castps_si128(__m128 a) -{ - return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); -} - -// Applies a type cast to reinterpret four 32-bit integers passed in as a -// 128-bit parameter as packed 32-bit floating point values. -// https://msdn.microsoft.com/en-us/library/bb514029.aspx -FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) -{ - return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); -} - -// Loads 128-bit value. : -// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx -FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) -{ - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); -} - -// Load a double-precision (64-bit) floating-point element from memory into both -// elements of dst. 
-// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd -FORCE_INLINE __m128d _mm_load1_pd(const double *p) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); -#else - return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); -#endif -} - -// Load a double-precision (64-bit) floating-point element from memory into the -// upper element of dst, and copy the lower element from a to dst. mem_addr does -// not need to be aligned on any particular boundary. -// -// dst[63:0] := a[63:0] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd -FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); -#else - return vreinterpretq_m128d_f32(vcombine_f32( - vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); -#endif -} - -// Load a double-precision (64-bit) floating-point element from memory into both -// elements of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 -#define _mm_load_pd1 _mm_load1_pd - -// Load a double-precision (64-bit) floating-point element from memory into both -// elements of dst. -// -// dst[63:0] := MEM[mem_addr+63:mem_addr] -// dst[127:64] := MEM[mem_addr+63:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd -#define _mm_loaddup_pd _mm_load1_pd - -// Loads 128-bit value. 
: -// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx -FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) -{ - return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); -} - -// Load unaligned 32-bit integer from memory into the first element of dst. -// -// dst[31:0] := MEM[mem_addr+31:mem_addr] -// dst[MAX:32] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 -FORCE_INLINE __m128i _mm_loadu_si32(const void *p) -{ - return vreinterpretq_m128i_s32( - vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); -} - -// Convert packed double-precision (64-bit) floating-point elements in a to -// packed single-precision (32-bit) floating-point elements, and store the -// results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// k := 64*j -// dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) -// ENDFOR -// dst[127:64] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps -FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) -{ -#if defined(__aarch64__) - float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); - return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); -#else - float a0 = (float) ((double *) &a)[0]; - float a1 = (float) ((double *) &a)[1]; - return _mm_set_ps(0, 0, a1, a0); -#endif -} - -// Copy the lower double-precision (64-bit) floating-point element of a to dst. -// -// dst[63:0] := a[63:0] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 -FORCE_INLINE double _mm_cvtsd_f64(__m128d a) -{ -#if defined(__aarch64__) - return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); -#else - return ((double *) &a)[0]; -#endif -} - -// Convert packed single-precision (32-bit) floating-point elements in a to -// packed double-precision (64-bit) floating-point elements, and store the -// results in dst. 
-// -// FOR j := 0 to 1 -// i := 64*j -// k := 32*j -// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd -FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) -{ -#if defined(__aarch64__) - return vreinterpretq_m128d_f64( - vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); -#else - double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); - double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); - return _mm_set_pd(a1, a0); -#endif -} - -// Cast vector of type __m128d to type __m128i. This intrinsic is only used for -// compilation and does not generate any instructions, thus it has zero latency. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 -FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) -{ - return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); -} - -// Cast vector of type __m128d to type __m128. This intrinsic is only used for -// compilation and does not generate any instructions, thus it has zero latency. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps -FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) -{ - return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); -} - -// Blend packed single-precision (32-bit) floating-point elements from a and b -// using mask, and store the results in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps -FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask) -{ - return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask), - vreinterpretq_f32_m128(b), - vreinterpretq_f32_m128(a))); -} - -// Blend packed double-precision (64-bit) floating-point elements from a and b -// using mask, and store the results in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd -FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) -{ - uint64x2_t mask = - vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); -#if defined(__aarch64__) - float64x2_t a = vreinterpretq_f64_m128d(_a); - float64x2_t b = vreinterpretq_f64_m128d(_b); - return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); -#else - uint64x2_t a = vreinterpretq_u64_m128d(_a); - uint64x2_t b = vreinterpretq_u64_m128d(_b); - return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); -#endif -} - -// Round the packed single-precision (32-bit) floating-point elements in a using -// the rounding parameter, and store the results as packed single-precision -// floating-point elements in dst. -// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps -FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) -{ -#if defined(__aarch64__) - switch (rounding) { - case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); - case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): - return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); - default: //_MM_FROUND_CUR_DIRECTION - return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); - } -#else - float *v_float = (float *) &a; - __m128 zero, neg_inf, pos_inf; - - switch (rounding) { - case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): - return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); - case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): - return (__m128){floorf(v_float[0]), floorf(v_float[1]), - floorf(v_float[2]), floorf(v_float[3])}; - case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): - 
return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]), - ceilf(v_float[3])}; - case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): - zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); - neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]), - floorf(v_float[2]), floorf(v_float[3])); - pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]), - ceilf(v_float[2]), ceilf(v_float[3])); - return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero)); - default: //_MM_FROUND_CUR_DIRECTION - return (__m128){roundf(v_float[0]), roundf(v_float[1]), - roundf(v_float[2]), roundf(v_float[3])}; - } -#endif -} - -// Convert packed single-precision (32-bit) floating-point elements in a to -// packed 32-bit integers, and store the results in dst. -// -// FOR j := 0 to 1 -// i := 32*j -// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) -// ENDFOR -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi -FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) -{ -#if defined(__aarch64__) - return vreinterpret_m64_s32( - vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)))); -#else - return vreinterpret_m64_s32( - vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128( - _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))))); -#endif -} - -// Round the packed single-precision (32-bit) floating-point elements in a up to -// an integer value, and store the results as packed single-precision -// floating-point elements in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps -FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) -{ - return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); -} - -// Round the packed single-precision (32-bit) floating-point elements in a down -// to an integer value, and store the results as packed single-precision -// floating-point elements in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps -FORCE_INLINE __m128 _mm_floor_ps(__m128 a) -{ - return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); -} - - -// Load 128-bits of integer data from unaligned memory into dst. This intrinsic -// may perform better than _mm_loadu_si128 when the data crosses a cache line -// boundary. -// -// dst[127:0] := MEM[mem_addr+127:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 -#define _mm_lddqu_si128 _mm_loadu_si128 - -/* Miscellaneous Operations */ - -// Shifts the 8 signed 16-bit integers in a right by count bits while shifting -// in the sign bit. -// -// r0 := a0 >> count -// r1 := a1 >> count -// ... -// r7 := a7 >> count -// -// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx -FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) -{ - int64_t c = (int64_t) vget_low_s64((int64x2_t) count); - if (c > 15) - return _mm_cmplt_epi16(a, _mm_setzero_si128()); - return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c))); -} - -// Shifts the 4 signed 32-bit integers in a right by count bits while shifting -// in the sign bit. -// -// r0 := a0 >> count -// r1 := a1 >> count -// r2 := a2 >> count -// r3 := a3 >> count -// -// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx -FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) -{ - int64_t c = (int64_t) vget_low_s64((int64x2_t) count); - if (c > 31) - return _mm_cmplt_epi32(a, _mm_setzero_si128()); - return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c))); -} - -// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and -// saturates. 
-// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s8( - vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), - vqmovn_s16(vreinterpretq_s16_m128i(b)))); -} - -// Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned -// integers and saturates. -// -// r0 := UnsignedSaturate(a0) -// r1 := UnsignedSaturate(a1) -// ... -// r7 := UnsignedSaturate(a7) -// r8 := UnsignedSaturate(b0) -// r9 := UnsignedSaturate(b1) -// ... -// r15 := UnsignedSaturate(b7) -// -// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx -FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) -{ - return vreinterpretq_m128i_u8( - vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), - vqmovun_s16(vreinterpretq_s16_m128i(b)))); -} - -// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers -// and saturates. -// -// r0 := SignedSaturate(a0) -// r1 := SignedSaturate(a1) -// r2 := SignedSaturate(a2) -// r3 := SignedSaturate(a3) -// r4 := SignedSaturate(b0) -// r5 := SignedSaturate(b1) -// r6 := SignedSaturate(b2) -// r7 := SignedSaturate(b3) -// -// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_s16( - vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), - vqmovn_s32(vreinterpretq_s32_m128i(b)))); -} - -// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit -// integers and saturates. 
-// -// r0 := UnsignedSaturate(a0) -// r1 := UnsignedSaturate(a1) -// r2 := UnsignedSaturate(a2) -// r3 := UnsignedSaturate(a3) -// r4 := UnsignedSaturate(b0) -// r5 := UnsignedSaturate(b1) -// r6 := UnsignedSaturate(b2) -// r7 := UnsignedSaturate(b3) -FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u16( - vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), - vqmovun_s32(vreinterpretq_s32_m128i(b)))); -} - -// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower -// 8 signed or unsigned 8-bit integers in b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// ... -// r14 := a7 -// r15 := b7 -// -// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s8( - vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -#else - int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); - int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); - int8x8x2_t result = vzip_s8(a1, b1); - return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); -#endif -} - -// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the -// lower 4 signed or unsigned 16-bit integers in b. 
-// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// r4 := a2 -// r5 := b2 -// r6 := a3 -// r7 := b3 -// -// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx -FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s16( - vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -#else - int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); - int16x4x2_t result = vzip_s16(a1, b1); - return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); -#endif -} - -// Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the -// lower 2 signed or unsigned 32 - bit integers in b. -// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// -// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s32( - vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -#else - int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); - int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); - int32x2x2_t result = vzip_s32(a1, b1); - return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); -#endif -} - -FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) -{ - int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); - int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); - return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); -} - -// Selects and interleaves the lower two single-precision, floating-point values -// from a and b. 
-// -// r0 := a0 -// r1 := b0 -// r2 := a1 -// r3 := b1 -// -// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#else - float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); - float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); - float32x2x2_t result = vzip_f32(a1, b1); - return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); -#endif -} - -// Selects and interleaves the upper two single-precision, floating-point values -// from a and b. -// -// r0 := a2 -// r1 := b2 -// r2 := a3 -// r3 := b3 -// -// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx -FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128_f32( - vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); -#else - float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); - float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); - float32x2x2_t result = vzip_f32(a1, b1); - return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); -#endif -} - -// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper -// 8 signed or unsigned 8-bit integers in b. -// -// r0 := a8 -// r1 := b8 -// r2 := a9 -// r3 := b9 -// ... 
-// r14 := a15 -// r15 := b15 -// -// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s8( - vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); -#else - int8x8_t a1 = - vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); - int8x8_t b1 = - vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); - int8x8x2_t result = vzip_s8(a1, b1); - return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); -#endif -} - -// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the -// upper 4 signed or unsigned 16-bit integers in b. -// -// r0 := a4 -// r1 := b4 -// r2 := a5 -// r3 := b5 -// r4 := a6 -// r5 := b6 -// r6 := a7 -// r7 := b7 -// -// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s16( - vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); -#else - int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); - int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); - int16x4x2_t result = vzip_s16(a1, b1); - return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); -#endif -} - -// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the -// upper 2 signed or unsigned 32-bit integers in b. 
-// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx -FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) -{ -#if defined(__aarch64__) - return vreinterpretq_m128i_s32( - vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); -#else - int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); - int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); - int32x2x2_t result = vzip_s32(a1, b1); - return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); -#endif -} - -// Interleaves the upper signed or unsigned 64-bit integer in a with the -// upper signed or unsigned 64-bit integer in b. -// -// r0 := a1 -// r1 := b1 -FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) -{ - int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); - int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); - return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); -} - -// Horizontally compute the minimum amongst the packed unsigned 16-bit integers -// in a, store the minimum and index in dst, and zero the remaining bits in dst. 
-// -// index[2:0] := 0 -// min[15:0] := a[15:0] -// FOR j := 0 to 7 -// i := j*16 -// IF a[i+15:i] < min[15:0] -// index[2:0] := j -// min[15:0] := a[i+15:i] -// FI -// ENDFOR -// dst[15:0] := min[15:0] -// dst[18:16] := index[2:0] -// dst[127:19] := 0 -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 -FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) -{ - __m128i dst; - uint16_t min, idx = 0; - // Find the minimum value -#if defined(__aarch64__) - min = vminvq_u16(vreinterpretq_u16_m128i(a)); -#else - __m64 tmp; - tmp = vreinterpret_m64_u16( - vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), - vget_high_u16(vreinterpretq_u16_m128i(a)))); - tmp = vreinterpret_m64_u16( - vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); - tmp = vreinterpret_m64_u16( - vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); - min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); -#endif - // Get the index of the minimum value - int i; - for (i = 0; i < 8; i++) { - if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { - idx = (uint16_t) i; - break; - } - a = _mm_srli_si128(a, 2); - } - // Generate result - dst = _mm_setzero_si128(); - dst = vreinterpretq_m128i_u16( - vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); - dst = vreinterpretq_m128i_u16( - vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); - return dst; -} - -// shift to right -// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx -// http://blog.csdn.net/hemmingway/article/details/44828303 -// Clang requires a macro here, as it is extremely picky about c being a -// literal. -#define _mm_alignr_epi8(a, b, c) \ - ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c))) - -// Compute the bitwise AND of 128 bits (representing integer data) in a and b, -// and set ZF to 1 if the result is zero, otherwise set ZF to 0. 
Compute the -// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, -// otherwise set CF to 0. Return the CF value. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 -FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) -{ - int64x2_t s64 = - vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), - vreinterpretq_s64_m128i(b)); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); -} - -// Compute the bitwise AND of 128 bits (representing integer data) in a and b, -// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the -// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, -// otherwise set CF to 0. Return the ZF value. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 -FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) -{ - int64x2_t s64 = - vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); -} - -// Extracts the selected signed or unsigned 8-bit integer from a and zero -// extends. -// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) -#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) - -// Inserts the least significant 8 bits of b into the selected 8-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, -// __constrange(0,16) int imm) -#define _mm_insert_epi8(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s8( \ - vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ - }) - -// Extracts the selected signed or unsigned 16-bit integer from a and zero -// extends. 
-// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx -// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) -#define _mm_extract_epi16(a, imm) \ - vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) - -// Inserts the least significant 16 bits of b into the selected 16-bit integer -// of a. -// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx -// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, -// __constrange(0,8) int imm) -#define _mm_insert_epi16(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s16( \ - vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ - }) - -// Copy a to dst, and insert the 16-bit integer i into dst at the location -// specified by imm8. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 -#define _mm_insert_pi16(a, b, imm) \ - __extension__({ \ - vreinterpret_m64_s16( \ - vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ - }) - -// Extracts the selected signed or unsigned 32-bit integer from a and zero -// extends. -// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) -#define _mm_extract_epi32(a, imm) \ - vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) - -// Extracts the selected single-precision (32-bit) floating-point from a. -// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) -#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) - -// Inserts the least significant 32 bits of b into the selected 32-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, -// __constrange(0,4) int imm) -#define _mm_insert_epi32(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s32( \ - vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ - }) - -// Extracts the selected signed or unsigned 64-bit integer from a and zero -// extends. 
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) -#define _mm_extract_epi64(a, imm) \ - vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) - -// Inserts the least significant 64 bits of b into the selected 64-bit integer -// of a. -// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, -// __constrange(0,2) int imm) -#define _mm_insert_epi64(a, b, imm) \ - __extension__({ \ - vreinterpretq_m128i_s64( \ - vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ - }) - -// Count the number of bits set to 1 in unsigned 32-bit integer a, and -// return that count in dst. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 -FORCE_INLINE int _mm_popcnt_u32(unsigned int a) -{ -#if defined(__aarch64__) -#if __has_builtin(__builtin_popcount) - return __builtin_popcount(a); -#else - return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); -#endif -#else - uint32_t count = 0; - uint8x8_t input_val, count8x8_val; - uint16x4_t count16x4_val; - uint32x2_t count32x2_val; - - input_val = vld1_u8((uint8_t *) &a); - count8x8_val = vcnt_u8(input_val); - count16x4_val = vpaddl_u8(count8x8_val); - count32x2_val = vpaddl_u16(count16x4_val); - - vst1_u32(&count, count32x2_val); - return count; -#endif -} - -// Count the number of bits set to 1 in unsigned 64-bit integer a, and -// return that count in dst. 
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 -FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) -{ -#if defined(__aarch64__) -#if __has_builtin(__builtin_popcountll) - return __builtin_popcountll(a); -#else - return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); -#endif -#else - uint64_t count = 0; - uint8x8_t input_val, count8x8_val; - uint16x4_t count16x4_val; - uint32x2_t count32x2_val; - uint64x1_t count64x1_val; - - input_val = vld1_u8((uint8_t *) &a); - count8x8_val = vcnt_u8(input_val); - count16x4_val = vpaddl_u8(count8x8_val); - count32x2_val = vpaddl_u16(count16x4_val); - count64x1_val = vpaddl_u32(count32x2_val); - vst1_u64(&count, count64x1_val); - return count; -#endif -} - -// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision -// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the -// transposed matrix in these vectors (row0 now contains column 0, etc.). -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS -#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ - do { \ - float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ - float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ - row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ - vget_low_f32(ROW23.val[0])); \ - row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ - vget_low_f32(ROW23.val[1])); \ - row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ - vget_high_f32(ROW23.val[0])); \ - row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ - vget_high_f32(ROW23.val[1])); \ - } while (0) - -/* Crypto Extensions */ - -#if defined(__ARM_FEATURE_CRYPTO) -// Wraps vmull_p64 -FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) -{ - poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); - poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); - return vreinterpretq_u64_p128(vmull_p64(a, b)); -} -#else // ARMv7 polyfill -// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. 
-// -// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a -// 64-bit->128-bit polynomial multiply. -// -// It needs some work and is somewhat slow, but it is still faster than all -// known scalar methods. -// -// Algorithm adapted to C from -// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted -// from "Fast Software Polynomial Multiplication on ARM Processors Using the -// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab -// (https://hal.inria.fr/hal-01506572) -static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) -{ - poly8x8_t a = vreinterpret_p8_u64(_a); - poly8x8_t b = vreinterpret_p8_u64(_b); - - // Masks - uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), - vcreate_u8(0x00000000ffffffff)); - uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), - vcreate_u8(0x0000000000000000)); - - // Do the multiplies, rotating with vext to get all combinations - uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 - uint8x16_t e = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 - uint8x16_t f = - vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 - uint8x16_t g = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 - uint8x16_t h = - vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 - uint8x16_t i = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 - uint8x16_t j = - vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 - uint8x16_t k = - vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 - - // Add cross products - uint8x16_t l = veorq_u8(e, f); // L = E + F - uint8x16_t m = veorq_u8(g, h); // M = G + H - uint8x16_t n = veorq_u8(i, j); // N = I + J - - // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL - // instructions. 
-#if defined(__aarch64__) - uint8x16_t lm_p0 = vreinterpretq_u8_u64( - vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); - uint8x16_t lm_p1 = vreinterpretq_u8_u64( - vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); - uint8x16_t nk_p0 = vreinterpretq_u8_u64( - vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); - uint8x16_t nk_p1 = vreinterpretq_u8_u64( - vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); -#else - uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); - uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); - uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); - uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); -#endif - // t0 = (L) (P0 + P1) << 8 - // t1 = (M) (P2 + P3) << 16 - uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); - uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); - uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); - - // t2 = (N) (P4 + P5) << 24 - // t3 = (K) (P6 + P7) << 32 - uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); - uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); - uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); - - // De-interleave -#if defined(__aarch64__) - uint8x16_t t0 = vreinterpretq_u8_u64( - vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); - uint8x16_t t1 = vreinterpretq_u8_u64( - vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); - uint8x16_t t2 = vreinterpretq_u8_u64( - vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); - uint8x16_t t3 = vreinterpretq_u8_u64( - vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); -#else - uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); - uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); - uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); - uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); -#endif - // Shift the cross products 
- uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 - uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 - uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 - uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 - - // Accumulate the products - uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); - uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); - uint8x16_t mix = veorq_u8(d, cross1); - uint8x16_t r = veorq_u8(mix, cross2); - return vreinterpretq_u64_u8(r); -} -#endif // ARMv7 polyfill - -FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) -{ - uint64x2_t a = vreinterpretq_u64_m128i(_a); - uint64x2_t b = vreinterpretq_u64_m128i(_b); - switch (imm & 0x11) { - case 0x00: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); - case 0x01: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); - case 0x10: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); - case 0x11: - return vreinterpretq_m128i_u64( - _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); - default: - abort(); - } -} - -#if !defined(__ARM_FEATURE_CRYPTO) -/* clang-format off */ -#define SSE2NEON_AES_DATA(w) \ - { \ - w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ - w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ - w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ - w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ - w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ - w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ - w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ - w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ - w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ - w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ - w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), 
\ - w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ - w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ - w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ - w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ - w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ - w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ - w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ - w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ - w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ - w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ - w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ - w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ - w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ - w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ - w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ - w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ - w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ - w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ - w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ - w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ - w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ - w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ - w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ - w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ - w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ - w(0xb0), w(0x54), w(0xbb), w(0x16) \ - } -/* clang-format on */ - -/* X Macro trick. 
See https://en.wikipedia.org/wiki/X_Macro */ -#define SSE2NEON_AES_H0(x) (x) -static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); -#undef SSE2NEON_AES_H0 - -// In the absence of crypto extensions, implement aesenc using regular neon -// intrinsics instead. See: -// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ -// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and -// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 -// for more information Reproduced with permission of the author. -FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) -{ -#if defined(__aarch64__) - static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, - 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, - 0xc, 0x1, 0x6, 0xb}; - static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, - 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; - - uint8x16_t v; - uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); - - // shift rows - w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); - - // sub bytes - v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w); - v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); - v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); - v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); - - // mix columns - w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); - w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); - w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); - - // add round key - return vreinterpretq_m128i_u8(w) ^ RoundKey; - -#else /* ARMv7-A NEON implementation */ -#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ - (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ - (b0)) -#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) -#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) -#define SSE2NEON_AES_U0(p) \ - SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, 
p, SSE2NEON_AES_F3(p)) -#define SSE2NEON_AES_U1(p) \ - SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) -#define SSE2NEON_AES_U2(p) \ - SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) -#define SSE2NEON_AES_U3(p) \ - SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) - static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { - SSE2NEON_AES_DATA(SSE2NEON_AES_U0), - SSE2NEON_AES_DATA(SSE2NEON_AES_U1), - SSE2NEON_AES_DATA(SSE2NEON_AES_U2), - SSE2NEON_AES_DATA(SSE2NEON_AES_U3), - }; -#undef SSE2NEON_AES_B2W -#undef SSE2NEON_AES_F2 -#undef SSE2NEON_AES_F3 -#undef SSE2NEON_AES_U0 -#undef SSE2NEON_AES_U1 -#undef SSE2NEON_AES_U2 -#undef SSE2NEON_AES_U3 - - uint32_t x0 = _mm_cvtsi128_si32(EncBlock); - uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); - uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); - uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); - - __m128i out = _mm_set_epi32( - (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ - aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), - (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ - aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), - (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ - aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), - (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ - aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); - - return _mm_xor_si128(out, RoundKey); -#endif -} - -FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) -{ - /* FIXME: optimized for NEON */ - uint8_t v[4][4] = { - {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, - {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], - 
SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, - {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, - {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], - SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, - }; - for (int i = 0; i < 16; i++) - vreinterpretq_nth_u8_m128i(a, i) = - v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); - return a; -} - -// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. -// This instruction generates a round key for AES encryption. See -// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ -// for details. -// -// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx -FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) -{ - uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); - uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); - for (int i = 0; i < 4; ++i) { - ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; - ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; - } - return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, - ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); -} -#undef SSE2NEON_AES_DATA - -#else /* __ARM_FEATURE_CRYPTO */ -// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and -// AESMC and then manually applying the real key as an xor operation. This -// unfortunately means an additional xor op; the compiler should be able to -// optimize this away for repeated calls however. See -// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a -// for more details. 
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) -{ - return vreinterpretq_m128i_u8( - vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ - vreinterpretq_u8_m128i(b)); -} - -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 -FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) -{ - return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( - vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), - RoundKey); -} - -FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) -{ - // AESE does ShiftRows and SubBytes on A - uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); - - uint8x16_t dest = { - // Undo ShiftRows step from AESE and extract X1 and X3 - u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) - u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) - u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) - u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) - }; - uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; - return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); -} -#endif - -/* Streaming Extensions */ - -// Guarantees that every preceding store is globally visible before any -// subsequent store. -// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx -FORCE_INLINE void _mm_sfence(void) -{ - __sync_synchronize(); -} - -// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- -// point elements) from a into memory using a non-temporal memory hint. -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps -FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) -{ -#if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, (float32x4_t *) p); -#else - vst1q_f32(p, vreinterpretq_f32_m128(a)); -#endif -} - -// Stores the data in a to the address p without polluting the caches. 
If the -// cache line containing address p is already in the cache, the cache will be -// updated. -// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx -FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) -{ -#if __has_builtin(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, p); -#else - vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); -#endif -} - -// Load 128-bits of integer data from memory into dst using a non-temporal -// memory hint. mem_addr must be aligned on a 16-byte boundary or a -// general-protection exception may be generated. -// -// dst[127:0] := MEM[mem_addr+127:mem_addr] -// -// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 -FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) -{ -#if __has_builtin(__builtin_nontemporal_store) - return __builtin_nontemporal_load(p); -#else - return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); -#endif -} - -// Cache line containing p is flushed and invalidated from all caches in the -// coherency domain. : -// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx -FORCE_INLINE void _mm_clflush(void const *p) -{ - (void) p; - // no corollary for Neon? -} - -// Allocate aligned blocks of memory. -// https://software.intel.com/en-us/ -// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks -FORCE_INLINE void *_mm_malloc(size_t size, size_t align) -{ - void *ptr; - if (align == 1) - return malloc(size); - if (align == 2 || (sizeof(void *) == 8 && align == 4)) - align = sizeof(void *); - if (!posix_memalign(&ptr, align, size)) - return ptr; - return NULL; -} - -FORCE_INLINE void _mm_free(void *addr) -{ - free(addr); -} - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 8-bit integer v. 
-// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc ^= v; - for (int bit = 0; bit < 8; bit++) { - if (crc & 1) - crc = (crc >> 1) ^ UINT32_C(0x82f63b78); - else - crc = (crc >> 1); - } -#endif - return crc; -} - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 16-bit integer v. -// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc = _mm_crc32_u8(crc, v & 0xff); - crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); -#endif - return crc; -} - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 32-bit integer v. -// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) -FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc = _mm_crc32_u16(crc, v & 0xffff); - crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); -#endif - return crc; -} - -// Starting with the initial value in crc, accumulates a CRC32 value for -// unsigned 64-bit integer v. 
-// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) -FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) -{ -#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) - __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" - : [c] "+r"(crc) - : [v] "r"(v)); -#else - crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); - crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); -#endif - return crc; -} - -#if defined(__GNUC__) || defined(__clang__) -#pragma pop_macro("ALIGN_STRUCT") -#pragma pop_macro("FORCE_INLINE") -#endif - -#if defined(__GNUC__) -#pragma GCC pop_options -#endif - -#endif diff --git a/soxr/src/std-types.h b/soxr/src/std-types.h new file mode 100644 index 0000000..c5e8636 --- /dev/null +++ b/soxr/src/std-types.h @@ -0,0 +1,48 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#if !defined soxr_std_types_included +#define soxr_std_types_included + +#include "soxr-config.h" + +#include + +#if HAVE_STDBOOL_H + #include +#else + #undef bool + #undef false + #undef true + #define bool int + #define false 0 + #define true 1 +#endif + +#if HAVE_STDINT_H + #include +#else + #undef int16_t + #undef int32_t + #undef int64_t + #undef uint32_t + #undef uint64_t + #define int16_t short + #if LONG_MAX > 2147483647L + #define int32_t int + #define int64_t long + #elif LONG_MAX < 2147483647L + #error this library requires that 'long int' has at least 32-bits + #else + #define int32_t long + #if defined _MSC_VER + #define int64_t __int64 + #else + #define int64_t long long + #endif + #endif + #define uint32_t unsigned int32_t + #define uint64_t unsigned int64_t +#endif + +#endif diff --git a/soxr/src/simd.c b/soxr/src/util-simd.c similarity index 69% rename from soxr/src/simd.c rename to soxr/src/util-simd.c index 7659ab9..ec548fd 100644 --- a/soxr/src/simd.c +++ b/soxr/src/util-simd.c @@ -1,15 +1,15 @@ -/* SoX Resampler Library Copyright (c) 
2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ #include #include #include -#include "simd.h" -#include "simd-dev.h" -#define SIMD_ALIGNMENT (sizeof(float) * 4) +#include "soxr-config.h" -void * _soxr_simd_aligned_malloc(size_t size) +#define SIMD_ALIGNMENT (sizeof(float) * (1 + (PFFFT_DOUBLE|AVCODEC_FOUND)) * 4) + +void * SIMD_ALIGNED_MALLOC(size_t size) { char * p1 = 0, * p = malloc(size + SIMD_ALIGNMENT); if (p) { @@ -21,9 +21,9 @@ void * _soxr_simd_aligned_malloc(size_t size) -void * _soxr_simd_aligned_calloc(size_t nmemb, size_t size) +void * SIMD_ALIGNED_CALLOC(size_t nmemb, size_t size) { - void * p = _soxr_simd_aligned_malloc(nmemb * size); + void * p = SIMD_ALIGNED_MALLOC(nmemb * size); if (p) memset(p, 0, nmemb * size); return p; @@ -31,7 +31,7 @@ void * _soxr_simd_aligned_calloc(size_t nmemb, size_t size) -void _soxr_simd_aligned_free(void * p1) +void SIMD_ALIGNED_FREE(void * p1) { if (p1) free(*((void * *)p1 - 1)); @@ -39,11 +39,16 @@ void _soxr_simd_aligned_free(void * p1) -void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float * b) +#define PFFT_MACROS_ONLY +#include "pffft.c" + + + +void ORDERED_CONVOLVE_SIMD(int n, void * not_used, float * a, float const * b) { int i; float ab0, ab1; - v4sf * /*RESTRICT*/ va = (v4sf *)a; + v4sf * RESTRICT va = (v4sf *)a; v4sf const * RESTRICT vb = (v4sf const *)b; assert(VALIGNED(a) && VALIGNED(b)); ab0 = a[0] * b[0], ab1 = a[1] * b[1]; @@ -62,11 +67,11 @@ void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float -void _soxr_ordered_partial_convolve_simd(int n, float * a, const float * b) +void ORDERED_PARTIAL_CONVOLVE_SIMD(int n, float * a, float const * b) { int i; float ab0; - v4sf * /*RESTRICT*/ va = (v4sf *)a; + v4sf * RESTRICT va = (v4sf *)a; v4sf const * RESTRICT vb = (v4sf const *)b; assert(VALIGNED(a) && VALIGNED(b)); ab0 = a[0] * b[0]; 
diff --git a/soxr/src/util32s.c b/soxr/src/util32s.c new file mode 100644 index 0000000..b9c9e08 --- /dev/null +++ b/soxr/src/util32s.c @@ -0,0 +1,8 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#define PFFFT_DOUBLE 0 + +#include "util32s.h" + +#include "util-simd.c" diff --git a/soxr/src/util32s.h b/soxr/src/util32s.h new file mode 100644 index 0000000..12226e5 --- /dev/null +++ b/soxr/src/util32s.h @@ -0,0 +1,23 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#if !defined soxr_util32s_included +#define soxr_util32s_included + +#include + +void * _soxr_simd32_aligned_malloc(size_t); +void * _soxr_simd32_aligned_calloc(size_t, size_t); +void _soxr_simd32_aligned_free(void *); + +#define SIMD_ALIGNED_MALLOC _soxr_simd32_aligned_malloc +#define SIMD_ALIGNED_CALLOC _soxr_simd32_aligned_calloc +#define SIMD_ALIGNED_FREE _soxr_simd32_aligned_free + +void _soxr_ordered_convolve_simd32(int n, void * not_used, float * a, float const * b); +void _soxr_ordered_partial_convolve_simd32(int n, float * a, float const * b); + +#define ORDERED_CONVOLVE_SIMD _soxr_ordered_convolve_simd32 +#define ORDERED_PARTIAL_CONVOLVE_SIMD _soxr_ordered_partial_convolve_simd32 + +#endif diff --git a/soxr/src/util64s.c b/soxr/src/util64s.c new file mode 100644 index 0000000..0faa9e9 --- /dev/null +++ b/soxr/src/util64s.c @@ -0,0 +1,8 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ + +#define PFFFT_DOUBLE 1 + +#include "util64s.h" + +#include "util-simd.c" diff --git a/soxr/src/util64s.h b/soxr/src/util64s.h new file mode 100644 index 0000000..7beeb89 --- /dev/null +++ b/soxr/src/util64s.h @@ -0,0 +1,23 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. */ + +#if !defined soxr_util64s_included +#define soxr_util64s_included + +#include + +void * _soxr_simd64_aligned_malloc(size_t); +void * _soxr_simd64_aligned_calloc(size_t, size_t); +void _soxr_simd64_aligned_free(void *); + +#define SIMD_ALIGNED_MALLOC _soxr_simd64_aligned_malloc +#define SIMD_ALIGNED_CALLOC _soxr_simd64_aligned_calloc +#define SIMD_ALIGNED_FREE _soxr_simd64_aligned_free + +void _soxr_ordered_convolve_simd64(int n, void * not_used, double * a, double const * b); +void _soxr_ordered_partial_convolve_simd64(int n, double * a, double const * b); + +#define ORDERED_CONVOLVE_SIMD _soxr_ordered_convolve_simd64 +#define ORDERED_PARTIAL_CONVOLVE_SIMD _soxr_ordered_partial_convolve_simd64 + +#endif diff --git a/soxr/src/vr-coefs.c b/soxr/src/vr-coefs.c index 14886df..a57bec8 100644 --- a/soxr/src/vr-coefs.c +++ b/soxr/src/vr-coefs.c @@ -103,6 +103,9 @@ static void iir(int N, double Fp, char const * name) int main(int argc, char **argv) { + puts("/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net"); + puts(" * Licence for this file: LGPL v2.1 See LICENCE for details. */\n"); + fir(241, 1, .45, .5, 160, 32, "half_fir_coefs"); fir( 24, .5, .25, .5, 1, 31, "fast_half_fir_coefs"); fir( 20, 12, .9 , 1.5, 160, 58, "coefs0_d"); diff --git a/soxr/src/vr-coefs.h b/soxr/src/vr-coefs.h index 9790ec0..e44138e 100644 --- a/soxr/src/vr-coefs.h +++ b/soxr/src/vr-coefs.h @@ -1,3 +1,6 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ + static float const half_fir_coefs[] = { 0.471112154f, 0.316907549f, 0.0286963396f, -0.101927032f, -0.0281272982f, 0.0568029535f, 0.027196876f, -0.0360795942f, diff --git a/soxr/src/vr32.c b/soxr/src/vr32.c index 65eed3f..5159603 100644 --- a/soxr/src/vr32.c +++ b/soxr/src/vr32.c @@ -1,16 +1,10 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ /* Variable-rate resampling. */ #include -#include -#if !defined M_PI -#define M_PI 3.14159265358979323846 -#endif -#if !defined M_LN2 -#define M_LN2 0.69314718055994530942 -#endif +#include "math-wrap.h" #include #include #include "internal.h" @@ -197,7 +191,7 @@ static float poly_fir1_u(float const * input, uint32_t frac) typedef struct { union { int64_t all; -#if WORDS_BIGENDIAN +#if HAVE_BIGENDIAN struct {int32_t integer; uint32_t frac;} part; #else struct {uint32_t frac; int32_t integer;} part; @@ -316,7 +310,7 @@ static void vr_init(rate_t * p, double default_io_ratio, int num_stages, double } fifo_create(&p->output_fifo, sizeof(float)); p->default_io_ratio = default_io_ratio; - if (!fade_coefs[0]) { + if (fade_coefs[0]==0) { for (i = 0; i < iAL(fade_coefs); ++i) fade_coefs[i] = (float)(.5 * (1 + cos(M_PI * i / (AL(fade_coefs) - 1)))); prepare_coefs(poly_fir_coefs_u, POLY_FIR_LEN_U, PHASES0_U, PHASES_U, coefs0_u, mult); @@ -354,8 +348,9 @@ static bool set_step_step(stream_t * p, double io_ratio, int slew_len) return p->step_step.all != 0; } -static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len) +static void vr_set_io_ratio(void * P, double io_ratio, size_t slew_len) { + rate_t *p = P; assert(io_ratio > 0); if (slew_len) { if (!set_step_step(&p->current, io_ratio, p->slew_len = (int)slew_len)) @@ -367,7 +362,7 @@ static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len) } } else { - if (p->default_io_ratio) { /* Then 
this is the first call to this fn. */ + if (p->default_io_ratio!=0) { /* Then this is the first call to this fn. */ int octave = (int)floor(log(io_ratio) / M_LN2); p->current.stage_num = octave < 0? -1 : min(octave, p->num_stages0-1); enter_new_stage(p, 0); @@ -375,7 +370,7 @@ static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len) else if (p->fade_len) set_step(&p->fadeout, io_ratio); set_step(&p->current, io_ratio); - if (p->default_io_ratio) FRAC(p->current.at) = FRAC(p->current.step) >> 1; + if (p->default_io_ratio!=0) FRAC(p->current.at) = FRAC(p->current.step) >> 1; p->default_io_ratio = 0; } } @@ -427,10 +422,11 @@ static bool do_input_stage(rate_t * p, int stage_num, int sign, int min_stage_nu return true; } -static int vr_process(rate_t * p, int olen0) +static void vr_process(void * P, size_t olen0) { + rate_t *p = P; assert(p->num_stages > 0); - if (p->default_io_ratio) + if (p->default_io_ratio!=0) vr_set_io_ratio(p, p->default_io_ratio, 0); { float * output = fifo_reserve(&p->output_fifo, olen0); @@ -462,7 +458,7 @@ static int vr_process(rate_t * p, int olen0) olen = min(olen, (int)(AL(buf) >> 1)); if (p->slew_len) olen = min(olen, p->slew_len); - else if (p->new_io_ratio) { + else if (p->new_io_ratio!=0) { set_step(&p->current, p->new_io_ratio); set_step(&p->fadeout, p->new_io_ratio); p->fadeout.step_step.all = p->current.step_step.all = 0; @@ -568,17 +564,18 @@ static int vr_process(rate_t * p, int olen0) fifo_read(&p->stages[i].fifo, idone, NULL); } fifo_trim_by(&p->output_fifo, olen0 - odone0); - return odone0; } } -static float * vr_input(rate_t * p, float const * input, size_t n) +static void * vr_input(void * p, void * input, size_t n) { - return fifo_write(&p->stages[0].fifo, (int)n, input); + return fifo_write(&((rate_t *)p)->stages[0].fifo, (int)n, input); } -static float const * vr_output(rate_t * p, float * output, size_t * n) +static void const * vr_output(void * P, void * O, size_t * n) { + rate_t *p = P; + float *output 
= O; fifo_t * fifo = &p->output_fifo; if (1 || !p->num_stages0) return fifo_read(fifo, (int)(*n = min(*n, (size_t)fifo_occupancy(fifo))), output); @@ -594,17 +591,19 @@ static float const * vr_output(rate_t * p, float * output, size_t * n) } } -static void vr_flush(rate_t * p) +static void vr_flush(void * P) { + rate_t *p = P; if (!p->flushing) { stage_preload(&p->stages[0]); ++p->flushing; } } -static void vr_close(rate_t * p) +static void vr_close(void * P) { int i; + rate_t *p = P; fifo_delete(&p->output_fifo); for (i = -1; i < p->num_stages; ++i) { @@ -614,7 +613,7 @@ static void vr_close(rate_t * p) free(p->stages - 1); } -static double vr_delay(rate_t * p) +static double vr_delay(void * p) { return 100; /* TODO */ (void)p; @@ -639,19 +638,20 @@ static char const * vr_create(void * channel, void * shared,double max_io_ratio, static char const * vr_id(void) { - return "single-precision variable-rate"; + return "vr32"; } -typedef void (* fn_t)(void); -fn_t _soxr_vr32_cb[] = { - (fn_t)vr_input, - (fn_t)vr_process, - (fn_t)vr_output, - (fn_t)vr_flush, - (fn_t)vr_close, - (fn_t)vr_delay, - (fn_t)vr_sizes, - (fn_t)vr_create, - (fn_t)vr_set_io_ratio, - (fn_t)vr_id, +#include "cb_t.h" + +control_block_t _soxr_vr32_cb = { + vr_input, + vr_process, + vr_output, + vr_flush, + vr_close, + vr_delay, + vr_sizes, + vr_create, + vr_set_io_ratio, + vr_id, }; diff --git a/soxr/src/vr32s.c b/soxr/src/vr32s.c deleted file mode 100644 index cf0fdaa..0000000 --- a/soxr/src/vr32s.c +++ /dev/null @@ -1,665 +0,0 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net - * Licence for this file: LGPL v2.1 See LICENCE for details. */ - -/* Variable-rate resampling. 
*/ - -#include -#include -#if !defined M_PI -#define M_PI 3.14159265358979323846 -#endif -#if !defined M_LN2 -#define M_LN2 0.69314718055994530942 -#endif -#include -#include -#if defined(__x86_64__) || defined(_M_X64) -#include -#elif defined(__ARM_NEON) -#include "sse2neon.h" -#endif -#include "internal.h" -#define FIFO_SIZE_T int -#define FIFO_MIN 0x8000 -#include "fifo.h" -#include "vr-coefs.h" - -#define FADE_LEN_BITS 9 -#define PHASE_BITS_D 10 -#define PHASE_BITS_U 9 - -#define PHASES0_D 12 -#define POLY_FIR_LEN_D 20 -#define POLY_FIR_LEN_D_VEC (POLY_FIR_LEN_D / 4) -#define PHASES0_U 6 -#define POLY_FIR_LEN_U 12 -#define POLY_FIR_LEN_U_VEC (POLY_FIR_LEN_U / 4) - -#define MULT32 (65536. * 65536.) -#define PHASES_D (1 << PHASE_BITS_D) -#define PHASES_U (1 << PHASE_BITS_U) - -#define CONVOLVE \ - _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \ - _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \ - _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ - -#define HALF_FIR_LEN_2 (iAL(half_fir_coefs) - 1) -#define HALF_FIR_LEN_4 (HALF_FIR_LEN_2 / 2) - -#define _ sum += (input[-i] + input[i]) * half_fir_coefs[i], ++i; -static float half_fir(float const * input) -{ - long i = 1; - float sum = input[0] * half_fir_coefs[0]; - CONVOLVE CONVOLVE - assert(i == HALF_FIR_LEN_2 + 1); - return (float)sum; -} -#undef _ - -#define _ sum += (input[-i] + input[i]) * half_fir_coefs[2*i], ++i; -static float double_fir0(float const * input) -{ - int i = 1; - float sum = input[0] * half_fir_coefs[0]; - CONVOLVE - assert(i == HALF_FIR_LEN_4 + 1); - return (float)(sum * 2); -} -#undef _ - -#define _ sum += (input[-i] + input[1+i]) * half_fir_coefs[2*i+1], ++i; -static float double_fir1(float const * input) -{ - int i = 0; - float sum = 0; - CONVOLVE - assert(i == HALF_FIR_LEN_4 + 0); - return (float)(sum * 2); -} -#undef _ - -static float fast_half_fir(float const * input) -{ - int i = 0; - float sum = input[0] * .5f; -#define _ sum += (input[-(2*i+1)] + input[2*i+1]) * fast_half_fir_coefs[i], ++i; - _ _ _ _ _ _ 
-#undef _ - return (float)sum; -} - -#define IIR_FILTER _ _ _ _ _ _ _ -#define _ in1=(in1-p->y[i])*iir_coefs[i]+tmp1;tmp1=p->y[i],p->y[i]=in1;++i;\ - in0=(in0-p->y[i])*iir_coefs[i]+tmp0;tmp0=p->y[i],p->y[i]=in0;++i; - -typedef struct {float x[2], y[AL(iir_coefs)];} half_iir_t; - -static float half_iir1(half_iir_t * p, float in0, float in1) -{ - int i = 0; - float tmp0, tmp1; - tmp0 = p->x[0], p->x[0] = in0; - tmp1 = p->x[1], p->x[1] = in1; - IIR_FILTER - p->y[i] = in1 = (in1 - p->y[i]) * iir_coefs[i] + tmp1; - return in1 + in0; -} -#undef _ - -static void half_iir(half_iir_t * p, float * obuf, float const * ibuf, int olen) -{ - int i; - for (i=0; i < olen; obuf[i] = (float)half_iir1(p, ibuf[i*2], ibuf[i*2+1]),++i); -} - -static void half_phase(half_iir_t * p, float * buf, int len) -{ - float const small_normal = 1/MULT32/MULT32; /* To quash denormals on path 0.*/ - int i; - for (i = 0; i < len; buf[i] = (float)half_iir1(p, buf[i], 0), ++i); -#define _ p->y[i] += small_normal, i += 2; - i = 0, _ IIR_FILTER -#undef _ -#define _ p->y[i] -= small_normal, i += 2; - i = 0, _ IIR_FILTER -#undef _ -} - -#define coefs(coef_p, fir_len, phase_num, coef_vec_num) \ - coef_p[(fir_len) * (phase_num) + (coef_vec_num)] - -#define COEF(h,l,i) ((i)<0||(i)>=(l)?0:(h)[(i)>(l)/2?(l)-(i):(i)]) -static void prepare_coefs(__m128 * coefs_a, __m128 * coefs_b, - int n, int phases0, int phases, float const * coefs0, double multiplier) -{ - double k[6]; - int length0 = n * phases0, length = n * phases, K0 = iAL(k)/2 - 1, i, j, pos; - float * coefs1 = malloc(((size_t)length / 2 + 1) * sizeof(*coefs1)); - float * p = coefs1, f0, f1 = 0; - - for (j = 0; j < iAL(k); k[j] = COEF(coefs0, length0, j - K0), ++j); - for (pos = i = 0; i < length0 / 2; ++i) { - double b=(1/24.)*(k[0]+k[4]+6*k[2]-4*(k[1]+k[3])),d=.5*(k[1]+k[3])-k[2]-b; - double a=(1/120.)*(k[5]-k[2]-9*(9*b+d)+2.5*(k[3]-k[1])-2*(k[4]-k[0])); - double c=(1/12.)*(k[4]-k[0]-2*(k[3]-k[1])-60*a),e=.5*(k[3]-k[1])-a-c; - for (; pos / phases == i; 
pos += phases0) { - double x = (double)(pos % phases) / phases; - *p++ = (float)(k[K0] + ((((a*x + b)*x + c)*x + d)*x + e)*x); - } - for (j = 0; j < iAL(k) - 1; k[j] = k[j + 1], ++j); - k[j] = COEF(coefs0, length0, i + iAL(k) / 2 + 1); - } - if (!(length & 1)) - *p++ = (float)k[K0]; - assert(p - coefs1 == length / 2 + 1); - - for (i = 0; i < n; ++i) for (j = phases - 1; j >= 0; --j, f1 = f0) { - pos = (n - 1 - i) * phases + j; - f0 = COEF(coefs1, length, pos) * (float)multiplier; - ((float*)&coefs(coefs_a, n / 4, j, i / 4))[i % 4] = (float)f0; - ((float*)&coefs(coefs_b, n / 4, j, i / 4))[i % 4] = (float)(f1 - f0); - } - free(coefs1); -} - -#define _ sum = _mm_add_ps(sum, _mm_mul_ps(_mm_add_ps(_mm_mul_ps(b, x), a), _mm_loadu_ps(&input[i*4]))), ++i; -#define a (coefs(poly_fir_coefs_d_a, POLY_FIR_LEN_D_VEC, phase, i)) -#define b (coefs(poly_fir_coefs_d_b, POLY_FIR_LEN_D_VEC, phase, i)) -static __m128 poly_fir_coefs_d_a[POLY_FIR_LEN_D_VEC * PHASES_D]; -static __m128 poly_fir_coefs_d_b[POLY_FIR_LEN_D_VEC * PHASES_D]; - -static float poly_fir1_d(float const * input, uint32_t frac) -{ - int i = 0, phase = (int)(frac >> (32 - PHASE_BITS_D)); - __m128 sum = _mm_set1_ps(0.f), x = _mm_set1_ps((float)(frac << PHASE_BITS_D) * (float)(1 / MULT32)); - _ _ _ _ _ - assert(i == POLY_FIR_LEN_D_VEC); - return ((float*)&sum)[0] + ((float*)&sum)[1] + ((float*)&sum)[2] + ((float*)&sum)[3]; -} -#undef a -#undef b -#define a (coefs(poly_fir_coefs_u_a, POLY_FIR_LEN_U_VEC, phase, i)) -#define b (coefs(poly_fir_coefs_u_b, POLY_FIR_LEN_U_VEC, phase, i)) -static __m128 poly_fir_coefs_u_a[POLY_FIR_LEN_U_VEC * PHASES_U]; -static __m128 poly_fir_coefs_u_b[POLY_FIR_LEN_U_VEC * PHASES_U]; - -static float poly_fir1_u(float const * input, uint32_t frac) -{ - int i = 0, phase = (int)(frac >> (32 - PHASE_BITS_U)); - __m128 sum = _mm_set1_ps(0.f), x = _mm_set1_ps((float)(frac << PHASE_BITS_U) * (float)(1 / MULT32)); - _ _ _ - assert(i == POLY_FIR_LEN_U_VEC); - return ((float*)&sum)[0] + ((float*)&sum)[1] 
+ ((float*)&sum)[2] + ((float*)&sum)[3]; -} -#undef a -#undef b -#undef _ - -#define ADD_TO(x,y) x.all += y.all -#define SUBTRACT_FROM(x,y) x.all -= y.all -#define FRAC(x) x.part.frac -#define INT(x) x.part.integer - -typedef struct { - union { - int64_t all; -#if WORDS_BIGENDIAN - struct {int32_t integer; uint32_t frac;} part; -#else - struct {uint32_t frac; int32_t integer;} part; -#endif - } at, step, step_step; - float const * input; - int len, stage_num; - bool is_d; /* true: downsampling at x2 rate; false: upsampling at 1x rate. */ - double step_mult; -} stream_t; - -static int poly_fir_d(stream_t * s, float * output, int olen) -{ - int i; - float const * input = s->input - POLY_FIR_LEN_D / 2 + 1; - for (i = 0; i < olen && INT(s->at) < s->len; ++i) { - output[i] = poly_fir1_d(input + INT(s->at), FRAC(s->at)); - ADD_TO(s->at, s->step); - if (!(INT(s->at) < s->len)) { - SUBTRACT_FROM(s->at, s->step); - break; - } - output[++i] = poly_fir1_d(input + INT(s->at), FRAC(s->at)); - ADD_TO(s->at, s->step); - ADD_TO(s->step, s->step_step); - } - return i; -} - -static int poly_fir_fade_d( - stream_t * s, float const * vol, int step, float * output, int olen) -{ - int i; - float const * input = s->input - POLY_FIR_LEN_D / 2 + 1; - for (i = 0; i < olen && INT(s->at) < s->len; ++i, vol += step) { - output[i] += *vol * poly_fir1_d(input + INT(s->at), FRAC(s->at)); - ADD_TO(s->at, s->step); - if (!(INT(s->at) < s->len)) { - SUBTRACT_FROM(s->at, s->step); - break; - } - output[++i] += *(vol += step) * poly_fir1_d(input + INT(s->at),FRAC(s->at)); - ADD_TO(s->at, s->step); - ADD_TO(s->step, s->step_step); - } - return i; -} - -static int poly_fir_u(stream_t * s, float * output, int olen) -{ - int i; - float const * input = s->input - POLY_FIR_LEN_U / 2 + 1; - for (i = 0; i < olen && INT(s->at) < s->len; ++i) { - output[i] = poly_fir1_u(input + INT(s->at), FRAC(s->at)); - ADD_TO(s->at, s->step); - ADD_TO(s->step, s->step_step); - } - return i; -} - -static int poly_fir_fade_u( 
- stream_t * s, float const * vol, int step, float * output, int olen) -{ - int i; - float const * input = s->input - POLY_FIR_LEN_U / 2 + 1; - for (i = 0; i < olen && INT(s->at) < s->len; i += 2, vol += step) { - output[i] += *vol * poly_fir1_u(input + INT(s->at), FRAC(s->at)); - ADD_TO(s->at, s->step); - ADD_TO(s->step, s->step_step); - } - return i; -} - -#define shiftr(x,by) ((by) < 0? (x) << (-(by)) : (x) >> (by)) -#define shiftl(x,by) shiftr(x,-(by)) -#define stage_occupancy(s) (fifo_occupancy(&(s)->fifo) - 4*HALF_FIR_LEN_2) -#define stage_read_p(s) ((float *)fifo_read_ptr(&(s)->fifo) + 2*HALF_FIR_LEN_2) -#define stage_preload(s) memset(fifo_reserve(&(s)->fifo, (s)->preload), \ - 0, sizeof(float) * (size_t)(s)->preload); - -typedef struct { - fifo_t fifo; - double step_mult; - int is_fast, x_fade_len, preload; -} stage_t; - -typedef struct { - int num_stages0, num_stages, flushing; - int fade_len, slew_len, xfade, stage_inc, switch_stage_num; - double new_io_ratio, default_io_ratio; - stage_t * stages; - fifo_t output_fifo; - half_iir_t halfer; - stream_t current, fadeout; /* Current/fade-in, fadeout streams. */ -} rate_t; - -static float fade_coefs[(2 << FADE_LEN_BITS) + 1]; - -static void vr_init(rate_t * p, double default_io_ratio, int num_stages, double mult) -{ - int i; - assert(num_stages >= 0); - memset(p, 0, sizeof(*p)); - - p->num_stages0 = num_stages; - p->num_stages = num_stages = max(num_stages, 1); - p->stages = (stage_t *)calloc((unsigned)num_stages + 1, sizeof(*p->stages)) + 1; - for (i = -1; i < p->num_stages; ++i) { - stage_t * s = &p->stages[i]; - fifo_create(&s->fifo, sizeof(float)); - s->step_mult = 2 * MULT32 / shiftl(2, i); - s->preload = i < 0? 0 : i == 0? 
2 * HALF_FIR_LEN_2 : 3 * HALF_FIR_LEN_2 / 2; - stage_preload(s); - s->is_fast = true; - lsx_debug("%-3i preload=%i", i, s->preload); - } - fifo_create(&p->output_fifo, sizeof(float)); - p->default_io_ratio = default_io_ratio; - if (!fade_coefs[0]) { - for (i = 0; i < iAL(fade_coefs); ++i) - fade_coefs[i] = (float)(.5 * (1 + cos(M_PI * i / (AL(fade_coefs) - 1)))); - prepare_coefs(poly_fir_coefs_u_a, poly_fir_coefs_u_b, POLY_FIR_LEN_U, PHASES0_U, PHASES_U, coefs0_u, mult); - prepare_coefs(poly_fir_coefs_d_a, poly_fir_coefs_d_b, POLY_FIR_LEN_D, PHASES0_D, PHASES_D, coefs0_d, mult *.5); - } - assert(fade_coefs[0]); -} - -static void enter_new_stage(rate_t * p, int occupancy0) -{ - p->current.len = shiftr(occupancy0, p->current.stage_num); - p->current.input = stage_read_p(&p->stages[p->current.stage_num]); - - p->current.step_mult = p->stages[p->current.stage_num].step_mult; - p->current.is_d = p->current.stage_num >= 0; - if (p->current.is_d) - p->current.step_mult *= .5; -} - -static void set_step(stream_t * p, double io_ratio) -{ - p->step.all = (int64_t)(io_ratio * p->step_mult + .5); -} - -static bool set_step_step(stream_t * p, double io_ratio, int slew_len) -{ - int64_t dif; - int difi; - stream_t tmp = *p; - set_step(&tmp, io_ratio); - dif = tmp.step.all - p->step.all; - dif = dif < 0? dif - (slew_len >> 1) : dif + (slew_len >> 1); - difi = (int)dif; /* Try to avoid int64_t div. */ - p->step_step.all = difi == dif? difi / slew_len : dif / slew_len; - return p->step_step.all != 0; -} - -static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len) -{ - assert(io_ratio > 0); - if (slew_len) { - if (!set_step_step(&p->current, io_ratio, p->slew_len = (int)slew_len)) - p->slew_len = 0, p->new_io_ratio = 0, p->fadeout.step_step.all = 0; - else { - p->new_io_ratio = io_ratio; - if (p->fade_len) - set_step_step(&p->fadeout, io_ratio, p->slew_len); - } - } - else { - if (p->default_io_ratio) { /* Then this is the first call to this fn. 
*/ - int octave = (int)floor(log(io_ratio) / M_LN2); - p->current.stage_num = octave < 0? -1 : min(octave, p->num_stages0-1); - enter_new_stage(p, 0); - } - else if (p->fade_len) - set_step(&p->fadeout, io_ratio); - set_step(&p->current, io_ratio); - if (p->default_io_ratio) FRAC(p->current.at) = FRAC(p->current.step) >> 1; - p->default_io_ratio = 0; - } -} - -static bool do_input_stage(rate_t * p, int stage_num, int sign, int min_stage_num) -{ - int i = 0; - float * dest; - stage_t * s = &p->stages[stage_num]; - stage_t * s1 = &p->stages[stage_num - sign]; - float const * src = (float *)fifo_read_ptr(&s1->fifo) + HALF_FIR_LEN_2; - int len = shiftr(fifo_occupancy(&s1->fifo) - HALF_FIR_LEN_2 * 2, sign); - int already_done = fifo_occupancy(&s->fifo) - s->preload; - if ((len -= already_done) <= 0) - return false; - src += shiftl(already_done, sign); - - dest = fifo_reserve(&s->fifo, len); - if (stage_num < 0) for (; i < len; ++src) - dest[i++] = double_fir0(src), dest[i++] = double_fir1(src); - else { - bool should_be_fast = p->stage_inc; - if (!s->x_fade_len && stage_num == p->switch_stage_num) { - p->switch_stage_num = 0; - if (s->is_fast != should_be_fast) { - s->x_fade_len = 1 << FADE_LEN_BITS, s->is_fast = should_be_fast, ++p->xfade; - lsx_debug("xfade level %i, inc?=%i", stage_num, p->stage_inc); - } - } - if (s->x_fade_len) { - float const * vol1 = fade_coefs + (s->x_fade_len << 1); - float const * vol2 = fade_coefs + (((1 << FADE_LEN_BITS) - s->x_fade_len) << 1); - int n = min(len, s->x_fade_len); - /*lsx_debug("xfade level %i, inc?=%i len=%i n=%i", stage_num, p->stage_inc, s->x_fade_len, n);*/ - if (should_be_fast) - for (; i < n; vol2 += 2, vol1 -= 2, src += 2) - dest[i++] = *vol1 * fast_half_fir(src) + *vol2 * half_fir(src); - else for (; i < n; vol2 += 2, vol1 -= 2, src += 2) - dest[i++] = *vol2 * fast_half_fir(src) + *vol1 * half_fir(src); - s->x_fade_len -= n; - p->xfade -= !s->x_fade_len; - } - if (stage_num < min_stage_num) - for (; i < len; dest[i++] 
= fast_half_fir(src), src += 2); - else for (; i < len; dest[i++] = half_fir(src), src += 2); - } - if (p->flushing > 0) - stage_preload(s); - return true; -} - -static int vr_process(rate_t * p, int olen0) -{ - assert(p->num_stages > 0); - if (p->default_io_ratio) - vr_set_io_ratio(p, p->default_io_ratio, 0); - { - float * output = fifo_reserve(&p->output_fifo, olen0); - int j, odone0 = 0, min_stage_num = p->current.stage_num; - int occupancy0, max_stage_num = min_stage_num; - if (p->fade_len) { - min_stage_num = min(min_stage_num, p->fadeout.stage_num); - max_stage_num = max(max_stage_num, p->fadeout.stage_num); - } - - for (j = min(min_stage_num, 0); j <= max_stage_num; ++j) - if (j && !do_input_stage(p, j, j < 0? -1 : 1, min_stage_num)) - break; - if (p->flushing > 0) - p->flushing = -1; - - occupancy0 = shiftl(max(0,stage_occupancy(&p->stages[max_stage_num])), max_stage_num); - p->current.len = shiftr(occupancy0, p->current.stage_num); - p->current.input = stage_read_p(&p->stages[p->current.stage_num]); - if (p->fade_len) { - p->fadeout.len = shiftr(occupancy0, p->fadeout.stage_num); - p->fadeout.input = stage_read_p(&p->stages[p->fadeout.stage_num]); - } - - while (odone0 < olen0) { - int odone, odone2, olen = olen0 - odone0, stage_dif = 0, shift; - float buf[64 << 1]; - - olen = min(olen, (int)(AL(buf) >> 1)); - if (p->slew_len) - olen = min(olen, p->slew_len); - else if (p->new_io_ratio) { - set_step(&p->current, p->new_io_ratio); - set_step(&p->fadeout, p->new_io_ratio); - p->fadeout.step_step.all = p->current.step_step.all = 0; - p->new_io_ratio = 0; - } - if (!p->flushing && !p->fade_len && !p->xfade) { - if (p->current.is_d) { - if (INT(p->current.step) && FRAC(p->current.step)) - stage_dif = 1, ++max_stage_num; - else if (!INT(p->current.step) && FRAC(p->current.step) < (1u << 31)) - stage_dif = -1, --min_stage_num; - } else if (INT(p->current.step) > 1 && FRAC(p->current.step)) - stage_dif = 1, ++max_stage_num; - } - if (stage_dif) { - int n = 
p->current.stage_num + stage_dif; - if (n >= p->num_stages) - --max_stage_num; - else { - p->stage_inc = stage_dif > 0; - p->fadeout = p->current; - p->current.stage_num += stage_dif; - if (!p->stage_inc) - p->switch_stage_num = p->current.stage_num; - if ((p->current.stage_num < 0 && stage_dif < 0) || - (p->current.stage_num > 0 && stage_dif > 0)) { - stage_t * s = &p->stages[p->current.stage_num]; - fifo_clear(&s->fifo); - stage_preload(s); - s->is_fast = false; - do_input_stage(p, p->current.stage_num, stage_dif, p->current.stage_num); - } - if (p->current.stage_num > 0 && stage_dif < 0) { - int idone = INT(p->current.at); - stage_t * s = &p->stages[p->current.stage_num]; - fifo_trim_to(&s->fifo, 2 * HALF_FIR_LEN_2 + idone + (POLY_FIR_LEN_D >> 1)); - do_input_stage(p, p->current.stage_num, 1, p->current.stage_num); - } - enter_new_stage(p, occupancy0); - shift = -stage_dif; -#define lshift(x,by) (x)=(by)>0?(x)<<(by):(x)>>-(by) - lshift(p->current.at.all, shift); - shift += p->fadeout.is_d - p->current.is_d; - lshift(p->current.step.all, shift); - lshift(p->current.step_step.all, shift); - p->fade_len = AL(fade_coefs) - 1; - lsx_debug("switch from stage %i to %i, x2 from %i to %i", p->fadeout.stage_num, p->current.stage_num, p->fadeout.is_d, p->current.is_d); - } - } - - if (p->fade_len) { - float const * vol1 = fade_coefs + p->fade_len; - float const * vol2 = fade_coefs + (iAL(fade_coefs) - 1 - p->fade_len); - int olen2 = (olen = min(olen, p->fade_len >> 1)) << 1; - - /* x2 is more fine-grained so may fail to produce a pair of samples - * where x1 would not (the x1 second sample is a zero so is always - * available). So do x2 first, then feed odone to the second one. 
*/ - memset(buf, 0, sizeof(*buf) * (size_t)olen2); - if (p->current.is_d && p->fadeout.is_d) { - odone = poly_fir_fade_d(&p->current, vol1,-1, buf, olen2); - odone2 = poly_fir_fade_d(&p->fadeout, vol2, 1, buf, odone); - } else if (p->current.is_d) { - odone = poly_fir_fade_d(&p->current, vol1,-1, buf, olen2); - odone2 = poly_fir_fade_u(&p->fadeout, vol2, 2, buf, odone); - } else { - assert(p->fadeout.is_d); - odone = poly_fir_fade_d(&p->fadeout, vol2, 1, buf, olen2); - odone2 = poly_fir_fade_u(&p->current, vol1,-2, buf, odone); - } - assert(odone == odone2); - (void)odone2; - p->fade_len -= odone; - if (!p->fade_len) { - if (p->stage_inc) - p->switch_stage_num = min_stage_num++; - else - --max_stage_num; - } - half_iir(&p->halfer, &output[odone0], buf, odone >>= 1); - } - else if (p->current.is_d) { - odone = poly_fir_d(&p->current, buf, olen << 1) >> 1; - half_iir(&p->halfer, &output[odone0], buf, odone); - } - else { - odone = poly_fir_u(&p->current, &output[odone0], olen); - if (p->num_stages0) - half_phase(&p->halfer, &output[odone0], odone); - } - odone0 += odone; - if (p->slew_len) - p->slew_len -= odone; - if (odone != olen) - break; /* Need more input. 
*/ - } { - int from = max(0, max_stage_num), to = min(0, min_stage_num); - int i, idone = shiftr(INT(p->current.at), from - p->current.stage_num); - INT(p->current.at) -= shiftl(idone, from - p->current.stage_num); - if (p->fade_len) - INT(p->fadeout.at) -= shiftl(idone, from - p->fadeout.stage_num); - for (i = from; i >= to; --i, idone <<= 1) - fifo_read(&p->stages[i].fifo, idone, NULL); - } - fifo_trim_by(&p->output_fifo, olen0 - odone0); - return odone0; - } -} - -static float * vr_input(rate_t * p, float const * input, size_t n) -{ - return fifo_write(&p->stages[0].fifo, (int)n, input); -} - -static float const * vr_output(rate_t * p, float * output, size_t * n) -{ - fifo_t * fifo = &p->output_fifo; - if (1 || !p->num_stages0) - return fifo_read(fifo, (int)(*n = min(*n, (size_t)fifo_occupancy(fifo))), output); - else { /* Ignore this complication for now. */ - int const IIR_DELAY = 2; - float * ptr = fifo_read_ptr(fifo); - int olen = min((int)*n, max(0, fifo_occupancy(fifo) - IIR_DELAY)); - *n = (size_t)olen; - if (output) - memcpy(output, ptr + IIR_DELAY, *n * sizeof(*output)); - fifo_read(fifo, olen, NULL); - return ptr + IIR_DELAY; - } -} - -static void vr_flush(rate_t * p) -{ - if (!p->flushing) { - stage_preload(&p->stages[0]); - ++p->flushing; - } -} - -static void vr_close(rate_t * p) -{ - int i; - - fifo_delete(&p->output_fifo); - for (i = -1; i < p->num_stages; ++i) { - stage_t * s = &p->stages[i]; - fifo_delete(&s->fifo); - } - free(p->stages - 1); -} - -static double vr_delay(rate_t * p) -{ - return 100; /* TODO */ - (void)p; -} - -static void vr_sizes(size_t * shared, size_t * channel) -{ - *shared = 0; - *channel = sizeof(rate_t); -} - -static char const * vr_create(void * channel, void * shared,double max_io_ratio, - void * q_spec, void * r_spec, double scale) -{ - double x = max_io_ratio; - int n; - for (n = 0; x > 1; x *= .5, ++n); - vr_init(channel, max_io_ratio, n, scale); - return 0; - (void)shared, (void)q_spec, (void)r_spec; -} - -static 
char const * vr_id(void) -{ - return "single-precision variable-rate"; -} - -typedef void (* fn_t)(void); -fn_t _soxr_vr32_cb[] = { - (fn_t)vr_input, - (fn_t)vr_process, - (fn_t)vr_output, - (fn_t)vr_flush, - (fn_t)vr_close, - (fn_t)vr_delay, - (fn_t)vr_sizes, - (fn_t)vr_create, - (fn_t)vr_set_io_ratio, - (fn_t)vr_id, -}; diff --git a/soxr/tests/CMakeLists.txt b/soxr/tests/CMakeLists.txt index fc350de..ee8dd0b 100644 --- a/soxr/tests/CMakeLists.txt +++ b/soxr/tests/CMakeLists.txt @@ -1,8 +1,8 @@ # SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net # Licence for this file: LGPL v2.1 See LICENCE for details. -add_definitions (${PROJECT_C_FLAGS}) -link_libraries (soxr) +set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}") +link_libraries (${PROJECT_NAME} ${LIBM_LIBRARIES}) file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.c) foreach (fe ${SOURCES}) @@ -10,7 +10,10 @@ foreach (fe ${SOURCES}) add_executable (${f} ${fe}) endforeach () -enable_testing () +# Can't use c89 for this file: +if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang") + set_property (SOURCE throughput APPEND_STRING PROPERTY COMPILE_FLAGS "-std=gnu89") +endif () set (sweep_to_freq 22050) set (leader 1) @@ -20,33 +23,40 @@ math (EXPR base_rate "${sweep_to_freq} + ${sweep_to_freq}") macro (add_vector r) set (output ${CMAKE_CURRENT_BINARY_DIR}/ref-${r}.s32) add_custom_command (OUTPUT ${output} DEPENDS vector-gen ${CMAKE_CURRENT_LIST_FILE} - COMMAND vector-gen ${r} ${leader} ${len} ${sweep_to_freq} 1 ${output}) + COMMAND vector-gen ${r} ${leader} ${len} 0 ${sweep_to_freq} 1 ${output}) set (vectors ${output} ${vectors}) endmacro () -macro (add_cmp_test from to bits) - set (name ${bits}-bit-perfect-${from}-${to}) - add_test (NAME ${name} COMMAND ${CMAKE_COMMAND} -Dbits=${bits} -DBIN=${BIN} -DEXAMPLES_BIN=${EXAMPLES_BIN} -Dleader=${leader} -Dto=${to} - -Dfrom=${from} -Dlen=${len} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmp-test.cmake) - add_vector (${from}) - 
add_vector (${to}) +macro (add_cmp_test irate orate bits) + set (name ${bits}-bit-perfect-${irate}-${orate}) + add_test (NAME ${name} COMMAND ${CMAKE_COMMAND} -Dbits=${bits} -DBIN=${BIN} + -DEXAMPLES_BIN=${EXAMPLES_BIN} -DlenToSkip=${leader} -Dorate=${orate} + -Dirate=${irate} -Dlen=${len} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmp-test.cmake) + add_vector (${irate}) + add_vector (${orate}) endmacro () unset (test_bits) -if (WITH_SINGLE_PRECISION) +if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S) set (test_bits 20) endif () -if (WITH_DOUBLE_PRECISION) - set (test_bits ${test_bits} 24) +if (WITH_CR64 OR WITH_CR64S) + set (test_bits ${test_bits} 28) endif () +set (rates 192000) +if (WITH_HI_PREC_CLOCK) + set (rates ${rates} 65537) +endif () foreach (b ${test_bits}) - foreach (r 96000 65537) + foreach (r ${rates}) add_cmp_test (${base_rate} ${r} ${b}) add_cmp_test (${r} ${base_rate} ${b}) endforeach () endforeach () -add_custom_target (test-vectors ALL DEPENDS ${vectors}) +if (NOT CMAKE_CROSSCOMPILING) + add_custom_target (test-vectors ALL DEPENDS ${vectors}) +endif () add_test (1-delay-clear ${BIN}1-delay-clear) diff --git a/soxr/tests/bandwidth-test b/soxr/tests/bandwidth-test index 47c2303..4efdcc9 100755 --- a/soxr/tests/bandwidth-test +++ b/soxr/tests/bandwidth-test @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -e # SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net @@ -9,8 +9,9 @@ set -e tool=./3-options-input-fn +w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //') -spec="spectrogram -z120 -Z-20 -wd -ho" +spec="spectrogram -z120 -Z-20 -w$w -ho" ext=f32; e=0 rate1=48000 rate2=44100 @@ -23,12 +24,12 @@ rate1n=`expr $rate1 / 2` sox -r $rate1 -n 0.$ext synth 8 sin 0:$rate1n gain -1 for pass in `seq 79 5 99`; do - f=bw1-$rate2-p`printf %02u $pass` + f=bw1-$rate2-p`printf %02u $pass`-$w $tool $rate1 $rate2 1 $e $e 4 0 $pass < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "bw-test pass:$pass 
stop:100" done for pass in `seq 79 5 99`; do - f=bw2-$rate2-p`printf %02u $pass` + f=bw2-$rate2-p`printf %02u $pass`-$w stop=`expr 200 - $pass` $tool $rate1 $rate2 1 $e $e 4 0 $pass $stop < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "bw-test pass:$pass stop:$stop" done diff --git a/soxr/tests/cmp-test.cmake b/soxr/tests/cmp-test.cmake index 8db76c5..a836322 100644 --- a/soxr/tests/cmp-test.cmake +++ b/soxr/tests/cmp-test.cmake @@ -1,17 +1,13 @@ # SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net # Licence for this file: LGPL v2.1 See LICENCE for details. -if (${bits} STREQUAL 24) - set (quality 45) -else () - set (quality 44) -endif () +math (EXPR quality "43 + (${bits} - 13) / 4") +set (ofile ${irate}-${orate}-${quality}.s32) +#message (STATUS "Output file = [${ofile}]") -set (output ${from}-${to}-${quality}.s32) - -execute_process(COMMAND ${EXAMPLES_BIN}3-options-input-fn ${from} ${to} 1 2 2 ${quality} a - INPUT_FILE ref-${from}.s32 - OUTPUT_FILE ${output} +execute_process(COMMAND ${EXAMPLES_BIN}3-options-input-fn ${irate} ${orate} 1 2 2 ${quality} a + INPUT_FILE ref-${irate}.s32 + OUTPUT_FILE ${ofile} ERROR_VARIABLE test_error RESULT_VARIABLE test_result) @@ -19,7 +15,11 @@ if (test_result) message (FATAL_ERROR "Resampling failure: ${test_error}") endif () -execute_process(COMMAND ${BIN}vector-cmp ref-${to}.s32 ${output} ${to} ${leader} ${len} ${bits} 98 +set (percentageToCheck 98) +math (EXPR lenToCheck "${len} * ${percentageToCheck}") +string (REGEX REPLACE "(..)$" ".\\1" lenToCheck "${lenToCheck}") # Divide by 100 + +execute_process(COMMAND ${BIN}vector-cmp ref-${orate}.s32 ${ofile} ${orate} ${lenToSkip} ${lenToCheck} ${bits} OUTPUT_VARIABLE test_output RESULT_VARIABLE test_result) diff --git a/soxr/tests/eg-test b/soxr/tests/eg-test index 58d085c..ccf4ce3 100755 --- a/soxr/tests/eg-test +++ b/soxr/tests/eg-test @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -e # SoX Resampler Library Copyright (c) 2007-15 
robs@users.sourceforge.net @@ -9,6 +9,7 @@ set -e len=8 +w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //') #vg="valgrind --leak-check=full --show-reachable=yes" @@ -42,6 +43,6 @@ signals=(sine-wave saw-tooth-wave) for n in 0 1 2 3; do signal=${signals[`expr $n % 2 || true`]} variation=${variations[`expr $n / 2 || true`]} - $vg ./5-variable-rate $n | sox -tf32 -r44100 -c1 - -n spectrogram -z130 -hwd -o v$n.png -X 50 -c "variation:$variation signal:$signal" + $vg ./5-variable-rate $n | sox -tf32 -r44100 -c1 - -n spectrogram -z130 -hw$w -o v$n-$w.png -X 50 -c "variation:$variation signal:$signal" vg="" done diff --git a/soxr/tests/io-test b/soxr/tests/io-test index a291c78..608bc9a 100755 --- a/soxr/tests/io-test +++ b/soxr/tests/io-test @@ -1,7 +1,7 @@ -#!/bin/bash +#!/usr/bin/env bash set -e -# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net # Licence for this file: LGPL v2.1 See LICENCE for details. 
# Tests IO @@ -14,22 +14,28 @@ len=16 f=1/32768 g=32768:0 tool=./3-options-input-fn +w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //') types=(f32 f64 s32 s16) zs=(180 180 180 180 180 120 120 120 120) do_one() { - $tool $ir $or $c $1 $2 $3 < $c.${types[$1]} | - sox -t ${types[`expr $2 % 4`]} -r $or -c $c - -n spectrogram -X50 -hwk -z${zs[$n]} -o io$c$n.png -c "io-test i:${types[$1]} o:${types[`expr $2 % 4`]} ($2) q:$3" + it=${types[$1]}; ot=${types[`expr $2 % 4 || true`]} + $tool $ir $or $c $1 $2 $3 < $c.$it > a.$ot + sox -r $or -c $c a.$ot -n spectrogram -X50 -hw$w -z${zs[$n]} -o io$c$n-$w.png -c "io-test i:$it o:$ot ($2) q:$3" + ./4-split-channels $ir $or $c $1 $2 $3 < $c.$it > b.$ot + [ $2 != 3 ] && cmp a.$ot b.$ot || + test $(sox -mv-1 -r$or -c$c a.$ot -r$or -c$c b.$ot -n stats 2>&1 |grep Pk\ l|tr ' ' '\n'|grep '[0-9]'|uniq) = -84.29 + rm [ab].$ot n=`expr $n + 1` } -j=3; test z$1 != z && j=$1 +test z$1 != z && j=$1 || j=1 for c in `seq 1 $j`; do for n in `seq 0 3`; do - sox -r $ir -n $c.${types[$n]} synth $len sin $f gain -.1 + sox -R -r $ir -n $c.${types[$n]} synth $len sin $f gain -.1 done n=0 diff --git a/soxr/tests/large-ratio-test b/soxr/tests/large-ratio-test index 64f1789..540c5df 100755 --- a/soxr/tests/large-ratio-test +++ b/soxr/tests/large-ratio-test @@ -1,23 +1,22 @@ -#!/bin/bash +#!/usr/bin/env bash set -e -# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net # Licence for this file: LGPL v2.1 See LICENCE for details. -# Tests interpolating then decimating be the same, large ratio. +# Tests interpolating then decimating by the same, large ratio. 
tool=../examples/3-options-input-fn -q=6 -ratio=2e4 -srate=8000 -nrate=`expr $srate / 2` +w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //') +q=4 +test x$1 = x && ratio=1e5 || ratio=$1 +test x$2 = x && rate=8000 || rate=$2 -rm -f lr.png +sox -r$rate -n 1.s32 synth 10 sin 0:`expr $rate / 2` vol .9375 +sync -../tests/vector-gen $srate 0 8 $nrate .9375 1.s32 +time { $tool 1 $ratio 1 2 1 $q a < 1.s32 | $tool $ratio 1 1 1 2 $q a > 2.s32;} -$tool 1 $ratio 1 2 1 $q < 1.s32 | $tool $ratio 1 1 1 2 $q > 2.s32 - -sox -M -r $srate -c1 1.s32 -r $srate -c1 2.s32 -n spectrogram -hwd -Z-10 -z180 -o lr.png -c "large-ratio-test q:$q ratio:$ratio" +sox -mv-1 -r$rate -c1 1.s32 -r$rate -c1 2.s32 -n spectrogram -hw$w -z150 -o lr-$w.png -c "large-ratio-test q:$q ratio:$ratio" rm [12].s32 diff --git a/soxr/tests/phase-test b/soxr/tests/phase-test index 4c491d8..3c34268 100755 --- a/soxr/tests/phase-test +++ b/soxr/tests/phase-test @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -e # SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net @@ -7,7 +7,8 @@ set -e # Tests varying phase-response. 
tool=./3-options-input-fn -spec="spectrogram -z160 -Z-20 -X 2000 -wd -ho" +w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //') +spec="spectrogram -z160 -Z-20 -X 2000 -w$w -ho" ext=f32; e=0 rate1=48000 rate2=44100 @@ -20,7 +21,7 @@ for n in 1 2; do filters=(standard-filter steep-filter) for q in `seq 0 7`; do - f=ph-$rate2-q$q + f=ph-$rate2-q$q-$w name=${names[`expr $q % 4 || true`]} filter=${filters[`expr $q / 4 || true`]} $tool $rate1 $rate2 1 $e $e $q'6' < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "ph-test $filter $name" @@ -28,7 +29,7 @@ for n in 1 2; do # Test specific phase-response percentages: for q in `seq 0 20 100`; do - f=ph-$rate2-p`printf %03u $q` + f=ph-$rate2-p`printf %03u $q`-$w $tool $rate1 $rate2 1 $e $e 46 0 0 0 $q < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "ph-test phase:${q}%" done diff --git a/soxr/tests/q-test b/soxr/tests/q-test index 7a0f0a2..f274cb5 100755 --- a/soxr/tests/q-test +++ b/soxr/tests/q-test @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -e # SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net @@ -9,6 +9,7 @@ set -e tool=./3-options-input-fn +w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //') ext=f64; e=1 c=1 q1=0; q2=7 @@ -16,7 +17,7 @@ rates=48000 zs=(50 87 87 87 111 135 159 180 95) zz() { - echo "spectrogram -z${zs[$1]} -Z-30 -wd -ho" + echo "spectrogram -z${zs[$1]} -Z-30 -w$w -ho" } for rate0 in $rates; do @@ -36,11 +37,11 @@ sox -r $rate1 -n -c $c 0.$ext synth 8 sin 0:$rate1n gain -1 for q in `seq $q1 $q2`; do f=qa-$rate1-$rate2-$q - $tool $rate1 $rate2 $c $e $e $q 0 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f.png -c $f + $tool $rate1 $rate2 $c $e $e $q 0 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f-$w.png -c $f done q=8 f=qa-$rate1-$rate2-v -$tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f.png -c $f +$tool $rate1 $rate2 $c $e $e 4 20 < 
0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f-$w.png -c $f diff --git a/soxr/tests/scripts b/soxr/tests/scripts index f245919..8b6023f 100755 --- a/soxr/tests/scripts +++ b/soxr/tests/scripts @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -e # SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net @@ -6,8 +6,9 @@ set -e ../../tests/bandwidth-test ../../tests/eg-test -../../tests/io-test +../../tests/io-test 3 ../../tests/large-ratio-test ../../tests/phase-test ../../tests/q-test -../../tests/time-test +../../tests/time-test 1 +../../tests/time-test 2 diff --git a/soxr/tests/throughput-test b/soxr/tests/throughput-test new file mode 100644 index 0000000..aef36f6 --- /dev/null +++ b/soxr/tests/throughput-test @@ -0,0 +1,11 @@ +#!/bin/sh +set -e + +# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net +# Licence for this file: LGPL v2.1 See LICENCE for details. + +test -r throughput.exe && wine=wine + +test /$1 = / && list="`seq 0 3`" || list="$*" + +for n in $list; do $wine ./throughput 44.1 48 1 0 $n 4; done diff --git a/soxr/tests/throughput-test.bat b/soxr/tests/throughput-test.bat new file mode 100644 index 0000000..6644d8d --- /dev/null +++ b/soxr/tests/throughput-test.bat @@ -0,0 +1,5 @@ +@echo off +rem SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net +rem Licence for this file: LGPL v2.1 See LICENCE for details. + +for /L %%i in (0,1,3) DO throughput 44.1 48 1 0 %%i diff --git a/soxr/tests/throughput.c b/soxr/tests/throughput.c new file mode 100644 index 0000000..c52b885 --- /dev/null +++ b/soxr/tests/throughput.c @@ -0,0 +1,141 @@ +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net + * Licence for this file: LGPL v2.1 See LICENCE for details. 
*/ + +#include +#include "rint.h" +#include "../examples/examples-common.h" + +#define k 1000 + +#if defined _WIN32 + #define WIN32_LEAN_AND_MEAN + #include + #define timerStart(msecs) LARGE_INTEGER start, stop, tmp; \ + QueryPerformanceCounter(&start), QueryPerformanceFrequency(&tmp), \ + stop.QuadPart = (msecs * tmp.QuadPart + k/2) / k + #define timerRunning() (QueryPerformanceCounter(&tmp), \ + (tmp.QuadPart-start.QuadPart < stop.QuadPart)) +#else + #include + #if defined timeradd + #define K k + #define tv_frac tv_usec + #define timespec timeval + #define get_time(x) gettimeofday(x, NULL) + #else + #include + #include + #if defined _POSIX_TIMERS && _POSIX_TIMERS > 0 + #define K (k*k) + #define tv_frac tv_nsec + #if defined _POSIX_MONOTONIC_CLOCK + #define get_time(x) clock_gettime(CLOCK_MONOTONIC, x) + #else + #define get_time(x) clock_gettime(CLOCK_REALTIME, x) + #endif + #else + #include + #define K 1 + #define tv_frac millitm + #define tv_sec time + #define timespec timeb + #define get_time(x) ftime(x) + #endif + #endif + + #define timerStart(msecs) struct timespec stop, tmp; get_time(&stop), \ + stop.tv_frac += (msecs%k)*K, \ + stop.tv_sec += msecs/k + stop.tv_frac/(K*k), \ + stop.tv_frac %= K*k + #define timerRunning() (get_time(&tmp), \ + (tmp.tv_sec < stop.tv_sec || tmp.tv_frac < stop.tv_frac)) +#endif + +int main(int n, char const * arg[]) +{ + char const * const arg0 = n? --n, *arg++ : "", * engine = ""; + double const irate = n? --n, atof(*arg++) : 96000.; + double const orate = n? --n, atof(*arg++) : 44100.; + unsigned const chans = n? --n, (unsigned)atoi(*arg++) : 1; + soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0; + unsigned const ospec = n? --n, (soxr_datatype_t)atoi(*arg++) : 0; + unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ; + unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0; + double const passband_end = n? --n, atof(*arg++) : 0; + double const stopband_begin = n? 
--n, atof(*arg++) : 0; + double const phase_response = n? --n, atof(*arg++) : -1; + int const use_threads = n? --n, atoi(*arg++) : 1; + soxr_datatype_t const otype = ospec & 3; + + soxr_quality_spec_t q_spec = soxr_quality_spec(q_recipe, q_flags); + soxr_io_spec_t io_spec = soxr_io_spec(itype, otype); + soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads); + + /* Allocate resampling input and output buffers in proportion to the input + * and output rates: */ + #define buf_total_len 15000 /* In samples per channel. */ + size_t const osize = soxr_datatype_size(otype) * chans; + size_t const isize = soxr_datatype_size(itype) * chans; + size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5); + size_t const olen = min(max(olen0, 1), buf_total_len - 1); + size_t const ilen = buf_total_len - olen; + void * const obuf = malloc(osize * olen); + void * const ibuf = malloc(isize * ilen); + + size_t odone = 0, clips = 0, omax = 0, i; + soxr_error_t error; + soxr_t soxr; + int32_t seed = 0; + char const * e = getenv("SOXR_THROUGHPUT_GAIN"); + double gain = e? atof(e) : .5; + + /* Overrides (if given): */ + if (passband_end > 0) q_spec.passband_end = passband_end / 100; + if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100; + if (phase_response >=0) q_spec.phase_response = phase_response; + io_spec.flags = ospec & ~7u; + + /* Create a stream resampler: */ + soxr = soxr_create( + irate, orate, chans, /* Input rate, output rate, # of channels. */ + &error, /* To report any error during creation. */ + &io_spec, &q_spec, &runtime_spec); + +#define ranqd1(x) ((x) = 1664525 * (x) + 1013904223) /* int32_t x */ +#define dranqd1(x) (ranqd1(x) * (1. / (65536. 
* 32768.))) /* [-1,1) */ +#define RAND (dranqd1(seed) * gain) +#define DURATION_MSECS 125 +#define NUM_ATTEMPTS 8 + + if (!error) { /* If all is well, run the resampler: */ + engine = soxr_engine(soxr); + switch (itype & 3) { + case 0: for (i=0;i' $rate2 c=$c q=$q - time $tool $rate1 $rate2 $c $e $e $q < 0.$ext > /dev/null; + sox -R -r $rate1 -n -c $c 0.$ext synth $len noise; sync + for q in $qs; do + test $q = v && Q="4 20" || Q=$q + $time -f %e -o $TIME $tool $rate1 $rate2 $c $e $e $Q < 0.$ext > /dev/null 2> $ERR + echo $rate1 '-->' $rate2 c=$c q=$q t=`cat $TIME` `cat $ERR | sed 's/.*(/(/'` done - - echo $rate1 '-->' $rate2 c=$c q=v - time $tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext > /dev/null - - rate1=44100 - rate2=$rate0 + rate1=$rate0 + rate2=44100 done done diff --git a/soxr/tests/vector-cmp.c b/soxr/tests/vector-cmp.c index 6edd2d5..f90cc7f 100644 --- a/soxr/tests/vector-cmp.c +++ b/soxr/tests/vector-cmp.c @@ -1,53 +1,56 @@ -/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net +/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net * Licence for this file: LGPL v2.1 See LICENCE for details. */ /* Utility used to help test the library; not for general consumption. * - * Compare two swept-sine files. */ + * Measure the peak bit difference between two files. */ #include #include -#include #include "../src/rint.h" +#include "../examples/examples-common.h" -int main(int bit, char const * arg[]) +#define TYPE 0 /* As vector-gen */ + +#if TYPE + #define sample_t double + #define N 50 + #define DIFF(s1,s2) abs(rint32((s1-s2)*ldexp(1,N-1))) +#else + #define sample_t int32_t + #define N 32 + #define DIFF(s1,s2) abs((int)(s1-s2)) +#endif + +int main(int argc, char const * arg[]) { - FILE * f1 = fopen(arg[1], "rb"), - * f2 = fopen(arg[2], "rb"); - double rate = atof (arg[3]), /* Rate for this vector */ - leader_len = atof (arg[4]), /* Leader length in seconds */ - len = atof (arg[5]), /* Sweep length (excl. 
leader_len) */ - expect_bits= atof (arg[6]), - expect_bw = atof (arg[7]); + int two = !!arg[2][0]; + FILE * f1 = fopen(arg[1], "rb"), * f2 = two? fopen(arg[2], "rb") : 0; + double rate = atof (arg[3]), /* Sample-rate */ + skip_len = atof (arg[4]), /* Skip length in seconds */ + len = atof (arg[5]), /* Compare length in seconds */ r; + int i = 0, count = rint32(rate * len), max = 0, diff; + sample_t s1, s2; - int32_t s1, s2; - long count = 0; - static long thresh[32]; - double bw, prev = 0; - - for (; fread(&s1, sizeof(s1), 1, f1) == 1 && - fread(&s2, sizeof(s2), 1, f2) == 1; ++count) { - long diff = abs((int)(s1 - s2)); - for (bit = 0; diff && bit < 32; bit++, diff >>= 1) - if ((diff & 1) && !thresh[bit]) - thresh[bit] = count + 1; - } - - if (count != (long)((leader_len + len) * rate + .5)) { - printf("incorrect file length\n"); - exit(1); - } - - for (bit = 0; bit < 32; ++bit) { - bw = ((double)thresh[bit] - 1) / rate - leader_len; - if (bit && bw >= 0 && (bw - prev) * 100 / len < .08) { - --bit; - break; + fseek(f1, rint32(rate * skip_len) * (int)sizeof(s1), SEEK_CUR); + if (two) { + fseek(f2, rint32(rate * skip_len) * (int)sizeof(s2), SEEK_CUR); + for (; i < count && + fread(&s1, sizeof(s1), 1, f1) && + fread(&s2, sizeof(s2), 1, f2); ++i) { + diff = DIFF(s1, s2); + max = max(max, diff); } - prev = bw; } - bit = 32 - bit; - bw = bw * 100 / len; - printf("Bit perfect to %i bits, from DC to %.2f%% nyquist.\n", bit, bw); - return !(bit >= expect_bits && bw >= expect_bw); + else for (; i < count && fread(&s1, sizeof(s1), 1, f1); ++i) { + diff = DIFF(s1, 0); + max = max(max, diff); + } + + if (i != count) { + fprintf(stderr, "incorrect file length\n"); + return 1; + } + printf("%f\n", r = N-log(max)/log(2)); + return argc>6? 
r 1 #include #endif -#include "../examples/examples-common.h" +#include "math-wrap.h" +#include +#include -#if QUAD - #define modf modfq - #define cos cosq - #define sin sinq - #undef M_PI - #define M_PI M_PIq - #define real __float128 - #define atof(x) strtoflt128(x, 0) +#if TYPE + #if TYPE > 1 + #define modf modfq + #define cos cosq + #define sin sinq + #define PI M_PIq + #define real __float128 + #define atof(x) strtoflt128(x, 0) + #else + #define modf modfl + #define cos cosl + #define sin sinl + #define PI M_PIl + #define real long double + #endif + #define MULT 1 + #define OUT(d) double output = d #else + #define PI M_PI #define real double #include "rint.h" + #define MULT (32768. * 65536 - 1/scale) + #define OUT(d) int32_t output = rint32(d) #endif -int main(int i, char const * argv[]) +int main(int argc, char const * argv[]) { - real rate = atof(argv[1]), /* Rate for this vector */ - lead_in_len = atof(argv[2]), /* Lead-in length in seconds */ - len = atof(argv[3]), /* Sweep length (excl. lead_in_len) */ - sweep_to_freq = atof(argv[4]), /* Sweep from DC to this freq. */ - multiplier = atof(argv[5]), /* For headroom */ - f1 = -sweep_to_freq / len * lead_in_len, f2 = sweep_to_freq, - n1 = rate * -lead_in_len, n2 = rate * len, - m = (f2 - f1) / (n2 - n1) / 2, dummy; - FILE * file = fopen(argv[6], "wb"); - i = (int)n1; - if (!file || i != n1) - exit(1); - for (; i < (int)(n2 + .5); ++i) { - double d1 = multiplier * sin(2 * M_PI * modf(i * m * i / rate, &dummy)); - double d = i < 0? d1 * (1 - cos(M_PI * (i + n1) / n1)) * .5 : d1; -#if QUAD - size_t actual = fwrite(&d, sizeof(d), 1, file); -#else - int32_t out = rint32(d * (32768. * 65536 - 1)); - size_t actual = fwrite(&out, sizeof(out), 1, file); -#endif - if (actual != 1) - return 1; + real rate = atof(argv[1]), /* Rate for this vector */ + lead_in_len = atof(argv[2]), /* Lead-in length in seconds */ + len = atof(argv[3]), /* Sweep length (excl. 
lead_in_len) */ + f1 = atof(argv[4]), + f2 = atof(argv[5]), + scale = atof(argv[6]), /* For headroom */ + n1 = rate * -lead_in_len, + m = (f2 - f1) / (rate * len * 2), dummy; + FILE * file = fopen(argv[7], "wb"); + int i = (int)n1, err = !file || i != n1; + for (; !err && i < (int)(rate*(len+lead_in_len)+.5); ++i) { + real d = sin(2 * PI * modf((f1 + i * m) * i / rate, &dummy)); + OUT((double)(scale * MULT * d)); + err = fwrite(&output, sizeof(output), 1, file) != 1; } - return 0; + return err |!argc; }