diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85eab0a..584a36e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,7 +73,7 @@ endif ()
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
-add_subdirectory(soxr/src)
+add_subdirectory(soxr)
add_library(boo
lib/audiodev/Common.hpp
diff --git a/soxr/.gitignore b/soxr/.gitignore
new file mode 100644
index 0000000..ac1dff9
--- /dev/null
+++ b/soxr/.gitignore
@@ -0,0 +1,2 @@
+Release*/
+Debug*/
diff --git a/soxr/CMakeLists.txt b/soxr/CMakeLists.txt
index 61bd596..76950ae 100644
--- a/soxr/CMakeLists.txt
+++ b/soxr/CMakeLists.txt
@@ -1,30 +1,36 @@
-# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
-cmake_minimum_required (VERSION 2.8 FATAL_ERROR)
-cmake_policy(SET CMP0075 OLD)
-
-#project (soxr C)
-#set (DESCRIPTION_SUMMARY "High quality, one-dimensional sample-rate conversion library")
+cmake_minimum_required (VERSION 3.1 FATAL_ERROR)
+project (soxr C)
+set (DESCRIPTION_SUMMARY
+ "High quality, one-dimensional sample-rate conversion library")
+cmake_policy(SET CMP0075 NEW)
+cmake_policy(SET CMP0115 OLD)
+cmake_policy(SET CMP0127 OLD)
# Release versioning:
set (PROJECT_VERSION_MAJOR 0)
set (PROJECT_VERSION_MINOR 1)
-set (PROJECT_VERSION_PATCH 2)
+set (PROJECT_VERSION_PATCH 3)
# For shared-object; if, since the last public release:
-# * library code changed at all: ++revision
-# * interfaces changed at all: ++current, revision = 0
-# * interfaces added: ++age
-# * interfaces removed: age = 0
+# 1) library code changed at all: ++revision
+# 2) interfaces changed at all: ++current, revision = 0
+# 3) interfaces added: ++age
+# 4) interfaces removed: age = 0
set (SO_VERSION_CURRENT 1)
-set (SO_VERSION_REVISION 1)
+set (SO_VERSION_REVISION 2)
set (SO_VERSION_AGE 1)
+math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}")
+math (EXPR SO_VERSION_MINOR "${SO_VERSION_AGE}")
+math (EXPR SO_VERSION_PATCH "${SO_VERSION_REVISION}")
+
# Main options:
@@ -32,112 +38,147 @@ set (SO_VERSION_AGE 1)
include (CMakeDependentOption)
if (NOT CMAKE_BUILD_TYPE)
- set (CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
+ set (CMAKE_BUILD_TYPE Release CACHE STRING
+ "Build type, one of: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
endif ()
-#option (BUILD_TESTS "Build sanity-tests." ON)
-#option (BUILD_SHARED_LIBS "Build shared libraries." ON)
-#option (BUILD_EXAMPLES "Build examples." OFF)
-option (WITH_LSR_BINDINGS "Include a `libsamplerate'-like interface." ON)
-cmake_dependent_option (WITH_SINGLE_PRECISION "Build with single precision (for up to 20-bit accuracy)." ON
- "WITH_DOUBLE_PRECISION" ON)
-cmake_dependent_option (WITH_DOUBLE_PRECISION "Build with double precision (for up to 32-bit accuracy)." ON
- "WITH_SINGLE_PRECISION" ON)
-cmake_dependent_option (WITH_SIMD "Use SIMD (for faster single precision)." ON
- "WITH_SINGLE_PRECISION" OFF)
-cmake_dependent_option (WITH_AVFFT "Use libavcodec (LGPL) for SIMD DFT." OFF
- "WITH_SIMD;NOT WITH_PFFFT" OFF)
-cmake_dependent_option (WITH_PFFFT "Use PFFFT (BSD-like licence) for SIMD DFT." ON
- "WITH_SIMD;NOT WITH_AVFFT" OFF)
-option (SOXR_SILENT "Disable debug messages, even in debug mode" OFF)
-if (UNIX)
- if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/lsr-tests)
- cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF
- "WITH_LSR_BINDINGS" OFF)
- endif ()
-endif ()
+option (BUILD_TESTS "Build sanity-tests." OFF)
+option (BUILD_EXAMPLES "Build examples." OFF)
+option (WITH_OPENMP "Include OpenMP threading." OFF)
+option (WITH_LSR_BINDINGS "Include a `libsamplerate'-like interface." OFF)
+
+cmake_dependent_option (BUILD_SHARED_LIBS
+ "Build shared (dynamic) soxr libraries." OFF
+ "NOT WITH_DEV_GPROF" OFF)
+cmake_dependent_option (WITH_VR32
+ "Include HQ variable-rate resampling engine." ON
+ "WITH_CR32 OR WITH_CR64 OR WITH_CR32S OR WITH_CR64S OR NOT DEFINED WITH_VR32" ON)
+cmake_dependent_option (WITH_CR32
+ "Include HQ constant-rate resampling engine." ON
+ "WITH_VR32 OR WITH_CR64 OR WITH_CR32S OR WITH_CR64S" ON)
+cmake_dependent_option (WITH_CR64
+ "Include VHQ constant-rate resampling engine." ON
+ "WITH_VR32 OR WITH_CR32 OR WITH_CR32S OR WITH_CR64S" ON)
+cmake_dependent_option (WITH_CR64S
+ "Include VHQ SIMD constant-rate resampling engine." ON
+ "WITH_VR32 OR WITH_CR32 OR WITH_CR32S OR WITH_CR64" ON)
+cmake_dependent_option (WITH_CR32S
+ "Include HQ SIMD constant-rate resampling engine." ON
+ "WITH_VR32 OR WITH_CR64 OR WITH_CR32 OR WITH_CR64S" ON)
+cmake_dependent_option (WITH_PFFFT
+ "Use PFFFT (BSD-like licence) for HQ SIMD DFT." ON
+ "WITH_CR32S;NOT WITH_AVFFT" OFF)
+cmake_dependent_option (WITH_AVFFT
+ "Use libavcodec (LGPL) for HQ SIMD DFT." OFF
+ "WITH_CR32S;NOT WITH_PFFFT" OFF)
+cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF
+ "UNIX;NOT CMAKE_CROSSCOMPILING;EXISTS ${PROJECT_SOURCE_DIR}/lsr-tests;WITH_LSR_BINDINGS" OFF)
+
+option (WITH_HI_PREC_CLOCK "Enable high-precision time-base." ON)
+option (WITH_FLOAT_STD_PREC_CLOCK
+ "Use floating-point for standard-precision time-base." OFF)
+option (WITH_DEV_TRACE "Enable developer trace capability." ON)
+option (WITH_DEV_GPROF "Enable developer gprof output." OFF)
+mark_as_advanced (WITH_HI_PREC_CLOCK WITH_FLOAT_STD_PREC_CLOCK
+ WITH_DEV_TRACE WITH_DEV_GPROF)
# Introspection:
-list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
+list (APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
include (CheckFunctionExists)
include (CheckIncludeFiles)
include (CheckLibraryExists)
-include (TestBigEndian)
+include (SetSystemProcessor)
+if (NOT EMSCRIPTEN)
+ include(TestBigEndian)
+endif ()
+
+set_system_processor ()
check_library_exists (m pow "" NEED_LIBM)
if (NEED_LIBM)
set (CMAKE_REQUIRED_LIBRARIES "m;${CMAKE_REQUIRED_LIBRARIES}")
- link_libraries (m)
+ set (LIBM_LIBRARIES m)
endif ()
-#if (WITH_OPENMP)
-# find_package (OpenMP)
-# if (OPENMP_FOUND)
-# set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-# set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
-# set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
-# endif ()
-#endif ()
-
-if (WITH_SIMD)
- find_package (SIMD)
- if (SIMD_FOUND)
- set (HAVE_SIMD 1)
- endif ()
+if (${BUILD_EXAMPLES})
+ project (${PROJECT_NAME}) # Adds c++ compiler
endif ()
-if (WITH_SINGLE_PRECISION)
- set (HAVE_SINGLE_PRECISION 1)
+if (WITH_OPENMP)
+ find_package (OpenMP)
+ if (OPENMP_FOUND)
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ if (MINGW) # Is this still needed?
+ set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+ set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+ endif ()
+ endif()
endif ()
-if (WITH_DOUBLE_PRECISION)
- set (HAVE_DOUBLE_PRECISION 1)
+if (WITH_CR32S)
+ find_package (SIMD32)
+ set (WITH_CR32S ${SIMD32_FOUND})
+endif ()
+
+if (WITH_CR64S)
+ find_package (SIMD64)
+ set (WITH_CR64S ${SIMD64_FOUND})
endif ()
if (WITH_AVFFT)
- find_package (LibAVCodec)
+ find_package (LibAVCodec REQUIRED)
if (AVCODEC_FOUND)
include_directories (${AVCODEC_INCLUDE_DIRS})
- link_libraries (${AVCODEC_LIBRARIES})
- set (HAVE_AVFFT 1)
+ set (LIBS ${LIBS} ${AVCODEC_LIBRARIES})
endif ()
endif ()
-if (SOXR_SILENT)
- add_definitions (-DSOXR_SILENT=1)
+if (WITH_AVFFT OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND SIMD32_FOUND AND WITH_CR32))
+ find_package (LibAVUtil)
+ if (AVUTIL_FOUND)
+ include_directories (${AVUTIL_INCLUDE_DIRS})
+ set (LIBS ${LIBS} ${AVUTIL_LIBRARIES})
+ endif ()
endif ()
check_function_exists (lrint HAVE_LRINT)
check_include_files (fenv.h HAVE_FENV_H)
-test_big_endian (WORDS_BIGENDIAN)
-
-macro (make_exist)
- foreach (x ${ARGN})
- if (NOT ${x})
- set (${x} 0)
- endif ()
- endforeach ()
-endmacro ()
-
-make_exist (HAVE_LRINT HAVE_FENV_H WORDS_BIGENDIAN HAVE_SIMD)
-make_exist (HAVE_SINGLE_PRECISION HAVE_DOUBLE_PRECISION HAVE_AVFFT)
+check_include_files (stdbool.h HAVE_STDBOOL_H)
+check_include_files (stdint.h HAVE_STDINT_H)
+if (EMSCRIPTEN)
+ set(HAVE_BIGENDIAN OFF)
+else()
+ test_big_endian (HAVE_BIGENDIAN)
+endif()
# Compiler configuration:
-if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
- set (PROJECT_CXX_FLAGS "-Wconversion -Wall -W -Wundef -Wcast-align -Wpointer-arith -Wno-long-long")
- set (PROJECT_C_FLAGS "${PROJECT_CXX_FLAGS} -Wnested-externs -Wmissing-prototypes -Wstrict-prototypes")
+if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
+ set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -Wconversion -Wall -Wextra \
+ -pedantic -Wundef -Wpointer-arith -Wno-long-long")
+ if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+ set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -Wno-keyword-macro")
+ endif ()
+ if (WITH_DEV_GPROF)
+ set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -pg")
+ endif ()
+ # Can use std=c89, but gnu89 should give faster sinf, cosf, etc.:
+ set (PROJECT_C_FLAGS "${PROJECT_CXX_FLAGS} \
+ -std=gnu89 -Wnested-externs -Wmissing-prototypes -Wstrict-prototypes")
if (CMAKE_BUILD_TYPE STREQUAL "Release")
set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s") # strip
endif ()
- cmake_dependent_option (VISIBILITY_HIDDEN "Build with -fvisibility=hidden." ON
+ cmake_dependent_option (VISIBILITY_HIDDEN
+ "Build shared libraries with -fvisibility=hidden." ON
"BUILD_SHARED_LIBS" OFF)
+ mark_as_advanced (VISIBILITY_HIDDEN)
if (VISIBILITY_HIDDEN)
add_definitions (-fvisibility=hidden -DSOXR_VISIBILITY)
endif ()
@@ -145,9 +186,14 @@ endif ()
if (MSVC)
add_definitions (-D_USE_MATH_DEFINES -D_CRT_SECURE_NO_WARNINGS)
- option (ENABLE_STATIC_RUNTIME "Visual Studio, link with runtime statically." OFF)
- if (ENABLE_STATIC_RUNTIME)
- foreach (flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+ option (BUILD_SHARED_RUNTIME "MSVC, link with runtime dynamically." ON)
+ if (NOT BUILD_SHARED_RUNTIME)
+ foreach (flag_var
+ CMAKE_C_FLAGS CMAKE_CXX_FLAGS
+ CMAKE_C_FLAGS_DEBUG CMAKE_CXX_FLAGS_DEBUG
+ CMAKE_C_FLAGS_RELEASE CMAKE_CXX_FLAGS_RELEASE
+ CMAKE_C_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_MINSIZEREL
+ CMAKE_C_FLAGS_RELWITHDEBINFO CMAKE_CXX_FLAGS_RELWITHDEBINFO)
string (REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endforeach ()
endif ()
@@ -161,8 +207,9 @@ endif ()
# Build configuration:
-if (${BUILD_SHARED_LIBS} AND ${CMAKE_SYSTEM_NAME} STREQUAL Windows) # Allow exes to find dlls:
- set (BIN ${CMAKE_CURRENT_BINARY_DIR}/bin/)
+if (${BUILD_SHARED_LIBS} AND ${CMAKE_SYSTEM_NAME} STREQUAL Windows)
+ # Allow exes to find dlls:
+ set (BIN ${PROJECT_BINARY_DIR}/bin/)
set (EXAMPLES_BIN ${BIN})
set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${BIN})
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${BIN})
@@ -179,6 +226,10 @@ if (BUILD_SHARED_LIBS)
endif ()
endif ()
+if (CMAKE_BUILD_TYPE STREQUAL "None") # As used by some distros.
+ add_definitions (-DNDEBUG)
+endif ()
+
# Installation configuration:
@@ -194,7 +245,7 @@ if (NOT DEFINED INCLUDE_INSTALL_DIR)
endif ()
if (NOT DEFINED DOC_INSTALL_DIR)
if (UNIX)
- set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/share/doc/libsoxr")
+ set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/share/doc/lib${PROJECT_NAME}")
else ()
set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/doc")
endif ()
@@ -202,25 +253,24 @@ endif ()
if (APPLE)
option (BUILD_FRAMEWORK "Build an OS X framework." OFF)
- set (FRAMEWORK_INSTALL_DIR "/Library/Frameworks" CACHE STRING "Directory to install frameworks to.")
+ set (FRAMEWORK_INSTALL_DIR
+ "/Library/Frameworks" CACHE STRING "Directory to install frameworks to.")
endif ()
# Top-level:
-set (PROJECT_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH})
-math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}")
-math (EXPR SO_VERSION_MINOR "${SO_VERSION_AGE}")
-math (EXPR SO_VERSION_PATCH "${SO_VERSION_REVISION}")
+set (PROJECT_VERSION
+ ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH})
set (SO_VERSION ${SO_VERSION_MAJOR}.${SO_VERSION_MINOR}.${SO_VERSION_PATCH})
configure_file (
- ${CMAKE_CURRENT_SOURCE_DIR}/soxr-config.h.in
- ${CMAKE_CURRENT_BINARY_DIR}/soxr-config.h)
-include_directories (${CMAKE_CURRENT_BINARY_DIR})
+ ${PROJECT_SOURCE_DIR}/${PROJECT_NAME}-config.h.in
+ ${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config.h)
+include_directories (${PROJECT_BINARY_DIR})
-if (BUILD_TESTS OR BUILD_LSR_TESTS)
+if (NOT CMAKE_CROSSCOMPILING AND (BUILD_TESTS OR BUILD_LSR_TESTS))
enable_testing ()
endif ()
@@ -234,7 +284,7 @@ install (FILES
# Subdirectories:
-include_directories (${CMAKE_CURRENT_SOURCE_DIR}/src)
+include_directories (${PROJECT_SOURCE_DIR}/src)
add_subdirectory (src)
if (BUILD_TESTS)
@@ -249,55 +299,45 @@ endif ()
-# Rough-and-ready distclean for anyone still doing in-tree builds:
+# GNU Autotools compatibility; 'make check':
-#if (UNIX)
-# add_custom_target (distclean
-# COMMAND make clean && rm -rf
-# CMakeCache.txt
-# CMakeFiles
-# cmake_install.cmake
-# CPackConfig.cmake
-# CPackSourceConfig.cmake
-# deinstall.cmake
-# Makefile
-# soxr-config.h
-# src/CMakeFiles
-# src/cmake_install.cmake
-# src/libsoxr-dev.src
-# src/libsoxr-lsr.pc
-# src/libsoxr.pc
-# src/libsoxr.src
-# src/Makefile)
-#endif ()
+add_custom_target (check COMMAND ${CMAKE_CTEST_COMMAND})
+
+
+
+# GNU Autotools compatibility; 'make distclean':
+
+if (UNIX)
+ add_custom_target (distclean COMMAND make clean && find .
+ \\! -path \\*/Modules/\\* \\! -name cmp-test.cmake -a -name \\*.cmake
+ -o -name CMakeFiles -o -name Makefile -o -name CMakeCache.txt -o -name
+ Testing -o -name cmake_install.cmake -o -name install_manifest.txt -o
+ -path ./soxr-config.h -o -name config.h -o -name \\*.pc -o -name \\*.s32
+ | xargs rm -rf)
+endif ()
# Deinstallation:
-#configure_file (
-# "${CMAKE_CURRENT_SOURCE_DIR}/deinstall.cmake.in"
-# "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake"
-# IMMEDIATE @ONLY)
+configure_file (
+ "${CMAKE_CURRENT_SOURCE_DIR}/deinstall.cmake.in"
+ "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake"
+ IMMEDIATE @ONLY)
-#add_custom_target (deinstall
-# COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake")
+add_custom_target (deinstall
+ COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake")
# Packaging:
-#if (UNIX)
-# set (CPACK_PACKAGE_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}")
-# set (CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
-# set (CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
-#
-# set (CPACK_SOURCE_GENERATOR "TGZ")
-# set (CPACK_SOURCE_IGNORE_FILES "dist;/lsr-tests/;/Debug/;/Release/;/cpack/;\\\\.swp$;\\\\.gitignore;/\\\\.git/")
-#
-# include (CPack)
-#
-# if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cpack)
-# add_subdirectory (cpack)
-# endif ()
-#endif ()
+if (UNIX)
+ set (CPACK_PACKAGE_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}")
+ set (CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
+ set (CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
+ set (CPACK_SOURCE_GENERATOR "TXZ")
+ set (CPACK_SOURCE_IGNORE_FILES
+ "dist;/lsr-tests/;/Debug.*/;/Release.*/;\\\\.swp$;\\\\.git.*;/\\\\.git/")
+ include (CPack)
+endif ()
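
A note on the shared-object versioning introduced above: the three math(EXPR)
lines map the libtool-style current/revision/age numbers onto a conventional
major.minor.patch file version. A quick check of the arithmetic with the
values set in this file (current = 1, revision = 2, age = 1), as a small C
sketch:

  #include <stdio.h>

  int main(void)
  {
    int const current = 1, revision = 2, age = 1; /* values from above */
    /* major = current - age; minor = age; patch = revision */
    printf("libsoxr.so.%d.%d.%d\n", current - age, age, revision);
    return 0; /* prints: libsoxr.so.0.1.2 */
  }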
diff --git a/soxr/INSTALL b/soxr/INSTALL
index c2c7675..5599870 100644
--- a/soxr/INSTALL
+++ b/soxr/INSTALL
@@ -1,11 +1,12 @@
-SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
INSTALLATION GUIDE CONTENTS
* Standard build
* Build customisation
-* Cross-compiling with mingw (linux host)
+* Cross-compilation
* Integration with other build systems
+* Run-time configuration
@@ -20,7 +21,7 @@ STANDARD BUILD
* A 'make' utility (most compiler installations already have one of these).
- * CMake: http://www.cmake.org/cmake/resources/software.html
+ * CMake v3.0 or newer: https://cmake.org/download/
2. Build:
@@ -30,7 +31,7 @@ STANDARD BUILD
go (on MS-Windows with nmake)
or
- ./go (on unix-like systems)
+ ./go (on Unix-like systems)
This should build the library and run a few sanity tests.
@@ -38,14 +39,14 @@ STANDARD BUILD
3. Installation:
Note that this step may need to be performed by a system
- adminstrator. Enter:
+ administrator. Enter:
nmake install (on MS-Windows)
or
- cd Release; make install (on unix)
+ cd Release; make install (on Unix-like)
-4. Configuration:
+4. Preparation for use:
To use the library you may need to set up appropriate paths to the
library and its header file in your development environment.
@@ -60,38 +61,74 @@ STANDARD BUILD
BUILD CUSTOMISATION
-If it is necessary to customise the build, then steps 2 and 3 above may be
-substituted as follows. Change directory to the one containing this file,
-then enter commands along the lines of:
+If it is necessary to customise the build, then steps 2 and 3 above should be
+substituted as follows: change directory to the one containing this file, then
+enter commands along the lines of:
mkdir build
cd build
- cmake [OPTIONS] ..
+ cmake -Wno-dev -DCMAKE_BUILD_TYPE=Release [OPTIONS] ..
make
make test
sudo make install
+N.B. The CMAKE_BUILD_TYPE to use for library deployment is Release.
+
To list help on the available options, enter:
cmake -LH ..
Options, if given, should be preceded with '-D', e.g.
- cmake -DWITH_SIMD:BOOL=OFF ..
+ -DBUILD_SHARED_LIBS:BOOL=OFF
-CROSS-COMPILING WITH MINGW (LINUX HOST)
+Resampling engines
-For example:
+Depending on what is available on a given system, options for including up
+to five resampling ‘engines’ are provided (per above) as follows:
+
+ WITH_CR32: for constant-rate HQ resampling,
+ WITH_CR32S: SIMD variant of previous,
+ WITH_CR64: for constant-rate VHQ resampling,
+ WITH_CR64S: SIMD variant of previous,
+ WITH_VR32: for variable-rate HQ resampling.
+
+By default, these options are all set to ON.
+
+When both SIMD and non-SIMD engine variants are included, run-time selection
+is automatic (based on CPU capability) for x86 CPUs, and can be automatic for
+ARM CPUs if the 3rd-party library `libavutil' is available at libsoxr
+build-time. Which engine has been selected for a specific configuration and
+invocation of the library can be checked using example #3, which reports it.
+See also Run-time Configuration, below.
+
+
+
+CROSS-COMPILATION
+
+E.g. targeting a Linux ARM system:
+
+ mkdir build
+ cd build
+ cmake -DCMAKE_SYSTEM_NAME=Linux \
+ -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc \
+ ..
+or, also building the examples (one of which uses C++):
+
+ cmake -DCMAKE_SYSTEM_NAME=Linux \
+ -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc \
+ -DCMAKE_CXX_COMPILER=arm-linux-gnueabi-g++ \
+ -DBUILD_EXAMPLES=1 \
+ ..
+
+E.g. with Mingw (Linux host), using a tool-chain file:
mkdir build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE=~/Toolchain-x86_64-mingw-w64-mingw32.cmake \
-DCMAKE_INSTALL_PREFIX=install \
- -DHAVE_WORDS_BIGENDIAN_EXITCODE=1 \
- -DBUILD_TESTS=0 \
- -DBUILD_EXAMPLES=1 \
..
make
@@ -117,7 +154,30 @@ INTEGRATION WITH OTHER BUILD SYSTEMS
Autotools-based systems might find it useful to create a file called
`configure' in the directory containing this file, consisting of the line:
- cmake -DBUILD_SHARED_LIBS=OFF .
+ cmake -DBUILD_SHARED_LIBS=OFF .
(or with other build options as required).
-For MS visual studio, see msvc/README
+For MS Visual Studio, see msvc/README.
+
+
+
+RUN-TIME CONFIGURATION
+
+The libsoxr API structure ‘soxr_runtime_spec_t’ allows application developers
+to optimise some aspects of libsoxr’s operation for a particular application.
+Optimal performance, however, might depend on an individual end-user’s run-
+time system and the end-user’s preferences. Hence environment variables are
+available to set (override) run-time parameters as follows:
+
+ Env. variable Equivalent soxr_runtime_spec_t item (see soxr.h)
+ ------------------ -----------------------------------
+ SOXR_COEFS_SIZE coef_size_kbytes
+ SOXR_COEF_INTERP SOXR_COEF_INTERP_xxx
+ SOXR_LARGE_DFT_SIZE log2_large_dft_size
+ SOXR_MIN_DFT_SIZE log2_min_dft_size
+ SOXR_NUM_THREADS num_threads
+
+Additionally, the SOXR_USE_SIMD32 and SOXR_USE_SIMD64 boolean environment
+variables can be used to override automatic selection (or to provide manual
+selection where automatic selection is not available) between SIMD and
+non-SIMD engine variants.
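
The table above pairs each environment variable with a soxr_runtime_spec_t
item. As a minimal C sketch of the programmatic side — assuming soxr.h's
NULL-means-default convention for the io and quality specs, with arbitrary
rates and channel count — setting, say, SOXR_NUM_THREADS in the environment
would override the equivalent item below at run time:

  #include <soxr.h>

  int main(void)
  {
    soxr_error_t error;
    /* Programmatic equivalent of SOXR_NUM_THREADS=1: */
    soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(1);
    soxr_t soxr = soxr_create(48000, 44100, 1, /* irate, orate, channels */
        &error, NULL, NULL, &runtime_spec);    /* NULL => default specs */
    if (!error)
      soxr_delete(soxr);
    return !!error;
  }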
diff --git a/soxr/LICENCE b/soxr/LICENCE
index 1c61878..43e5a71 100644
--- a/soxr/LICENCE
+++ b/soxr/LICENCE
@@ -1,4 +1,4 @@
-SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
@@ -11,8 +11,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
-along with this library; if not, write to the Free Software Foundation,
-Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+along with this library; if not, see <https://www.gnu.org/licenses/>.
Notes
diff --git a/soxr/NEWS b/soxr/NEWS
index f388974..9e7c298 100644
--- a/soxr/NEWS
+++ b/soxr/NEWS
@@ -1,3 +1,12 @@
+Version 0.1.3 (2018-02-24)
+ * SIMD enhancements: SSE, AVX, Neon.
+ * Improve support for clang, ARM, and cross-compilation.
+ * Provide env. var. override of runtime parameters.
+ * Build fix re cmake variables AVCODEC_INCLUDE_DIRS & AVUTIL_INCLUDE_DIRS.
+ * Build options WITH_SINGLE_PRECISION, WITH_DOUBLE_PRECISION & WITH_SIMD have
+ been removed; replacement options are detailed in INSTALL, `Resampling
+ engines'.
+
Version 0.1.2 (2015-09-05)
* Fix conversion failure when I/O types differ but I/O rates don't.
* Fix #defines for interpolation order selection.
diff --git a/soxr/README b/soxr/README
index 06f11e6..7f9a7af 100644
--- a/soxr/README
+++ b/soxr/README
@@ -1,4 +1,4 @@
-SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
The SoX Resampler library `libsoxr' performs one-dimensional sample-rate
conversion -- it may be used, for example, to resample PCM-encoded audio.
@@ -46,7 +46,7 @@ size configuration parameters may be used to reduce this figure).
For build and installation instructions, see the file `INSTALL'; for
copyright and licensing information, see the file `LICENCE'.
-For support and new versions, see http://soxr.sourceforge.net
+For support and new versions, see https://soxr.sourceforge.net
________
¹ For example, multi-channel resampling can utilise multiple CPU-cores.
² Bit-perfect within practical occupied-bandwidth limits.
diff --git a/soxr/TODO b/soxr/TODO
index 1c4a31b..2d1bc19 100644
--- a/soxr/TODO
+++ b/soxr/TODO
@@ -1,3 +1,3 @@
-* SOXR_ALLOW_ALIASING
-* Explicit flush API fn, perhaps.
-* More SIMD.
+* vr32s
+* vr32 with 1-delay-clear
+* fir_to_phase with RDFT32
diff --git a/soxr/cmake/Modules/FindCFlags.cmake b/soxr/cmake/Modules/FindCFlags.cmake
new file mode 100644
index 0000000..f118727
--- /dev/null
+++ b/soxr/cmake/Modules/FindCFlags.cmake
@@ -0,0 +1,35 @@
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Function to find C compiler feature flags
+
+include (CheckCSourceCompiles)
+include (FindPackageHandleStandardArgs)
+
+function (FindCFlags PKG_NAME PKG_DESC TRIAL_C_FLAGS TEST_C_SOURCE)
+
+foreach (TRIAL_C_FLAG ${TRIAL_C_FLAGS})
+ message (STATUS "Trying ${PKG_NAME} C flags: ${TRIAL_C_FLAG}")
+ unset (DETECT_${PKG_NAME}_C_FLAGS CACHE) #displayed by check_c_source_compiles
+
+ set (TMP "${CMAKE_REQUIRED_FLAGS}")
+ set (CMAKE_REQUIRED_FLAGS "${TRIAL_C_FLAG}")
+ check_c_source_compiles ("${TEST_C_SOURCE}" DETECT_${PKG_NAME}_C_FLAGS)
+ set (CMAKE_REQUIRED_FLAGS "${TMP}")
+
+ if (DETECT_${PKG_NAME}_C_FLAGS)
+ set (DETECTED_C_FLAGS "${TRIAL_C_FLAG}")
+ break ()
+ endif ()
+endforeach ()
+
+# N.B. Will not overwrite existing cache variable:
+set (${PKG_NAME}_C_FLAGS "${DETECTED_C_FLAGS}"
+ CACHE STRING "C compiler flags for ${PKG_DESC}")
+
+find_package_handle_standard_args (
+ ${PKG_NAME} DEFAULT_MSG ${PKG_NAME}_C_FLAGS ${PKG_NAME}_C_FLAGS)
+mark_as_advanced (${PKG_NAME}_C_FLAGS)
+set (${PKG_NAME}_FOUND ${${PKG_NAME}_FOUND} PARENT_SCOPE)
+
+endfunction ()
diff --git a/soxr/cmake/Modules/FindLibAVCodec.cmake b/soxr/cmake/Modules/FindLibAVCodec.cmake
index add33c3..f1bbf89 100644
--- a/soxr/cmake/Modules/FindLibAVCodec.cmake
+++ b/soxr/cmake/Modules/FindLibAVCodec.cmake
@@ -1,23 +1,23 @@
-# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
# - Find AVCODEC
-# Find the native installation of this package: includes and libraries.
+# Find the installation of this package: include-dirs and libraries.
#
-# AVCODEC_INCLUDES - where to find headers for this package.
-# AVCODEC_LIBRARIES - List of libraries when using this package.
-# AVCODEC_FOUND - True if this package can be found.
+# AVCODEC_INCLUDE_DIRS - where to find headers for this package.
+# AVCODEC_LIBRARIES - libraries to link to when using this package.
+# AVCODEC_FOUND - true iff this package can be found.
-if (AVCODEC_INCLUDES)
+if (AVCODEC_INCLUDE_DIRS)
set (AVCODEC_FIND_QUIETLY TRUE)
-endif (AVCODEC_INCLUDES)
+endif ()
-find_path (AVCODEC_INCLUDES libavcodec/avcodec.h)
+find_path (AVCODEC_INCLUDE_DIRS libavcodec/avcodec.h)
find_library (AVCODEC_LIBRARIES NAMES avcodec)
include (FindPackageHandleStandardArgs)
find_package_handle_standard_args (
- AVCODEC DEFAULT_MSG AVCODEC_LIBRARIES AVCODEC_INCLUDES)
+ AVCODEC DEFAULT_MSG AVCODEC_LIBRARIES AVCODEC_INCLUDE_DIRS)
-mark_as_advanced (AVCODEC_LIBRARIES AVCODEC_INCLUDES)
+mark_as_advanced (AVCODEC_LIBRARIES AVCODEC_INCLUDE_DIRS)
diff --git a/soxr/cmake/Modules/FindLibAVUtil.cmake b/soxr/cmake/Modules/FindLibAVUtil.cmake
new file mode 100644
index 0000000..464e6cf
--- /dev/null
+++ b/soxr/cmake/Modules/FindLibAVUtil.cmake
@@ -0,0 +1,23 @@
+# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Find AVUTIL
+# Find the installation of this package: includes and libraries.
+#
+# AVUTIL_INCLUDE_DIRS - where to find headers for this package.
+# AVUTIL_LIBRARIES - libraries to link to when using this package.
+# AVUTIL_FOUND - true iff this package can be found.
+
+if (AVUTIL_INCLUDE_DIRS)
+ set (AVUTIL_FIND_QUIETLY TRUE)
+endif ()
+
+find_path (AVUTIL_INCLUDE_DIRS libavutil/cpu.h)
+
+find_library (AVUTIL_LIBRARIES NAMES avutil)
+
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (
+ AVUTIL DEFAULT_MSG AVUTIL_LIBRARIES AVUTIL_INCLUDE_DIRS)
+
+mark_as_advanced (AVUTIL_LIBRARIES AVUTIL_INCLUDE_DIRS)
diff --git a/soxr/cmake/Modules/FindOpenMP.cmake b/soxr/cmake/Modules/FindOpenMP.cmake
deleted file mode 100644
index eef8422..0000000
--- a/soxr/cmake/Modules/FindOpenMP.cmake
+++ /dev/null
@@ -1,115 +0,0 @@
-# - Finds OpenMP support
-# This module can be used to detect OpenMP support in a compiler.
-# If the compiler supports OpenMP, the flags required to compile with
-# openmp support are set.
-#
-# The following variables are set:
-# OpenMP_C_FLAGS - flags to add to the C compiler for OpenMP support
-# OPENMP_FOUND - true if openmp is detected
-#
-# Supported compilers can be found at http://openmp.org/wp/openmp-compilers/
-#
-# Modifications for soxr:
-# * don't rely on presence of C++ compiler
-# * support MINGW
-#
-#=============================================================================
-# Copyright 2009 Kitware, Inc.
-# Copyright 2008-2009 André Rigland Brodtkorb
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# * The names of Kitware, Inc., the Insight Consortium, or the names of
-# any consortium members, or of any contributors, may not be used to
-# endorse or promote products derived from this software without
-# specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-include (CheckCSourceCompiles)
-include (FindPackageHandleStandardArgs)
-
-set (OpenMP_C_FLAG_CANDIDATES
- #Gnu
- "-fopenmp"
- #Microsoft Visual Studio
- "/openmp"
- #Intel windows
- "-Qopenmp"
- #Intel
- "-openmp"
- #Empty, if compiler automatically accepts openmp
- " "
- #Sun
- "-xopenmp"
- #HP
- "+Oopenmp"
- #IBM XL C/c++
- "-qsmp"
- #Portland Group
- "-mp"
-)
-
-# sample openmp source code to test
-set (OpenMP_C_TEST_SOURCE
-"
-#include <omp.h>
-int main() {
-#ifdef _OPENMP
- return 0;
-#else
- breaks_on_purpose
-#endif
-}
-")
-# if these are set then do not try to find them again,
-# by avoiding any try_compiles for the flags
-if (DEFINED OpenMP_C_FLAGS)
- set (OpenMP_C_FLAG_CANDIDATES)
-endif (DEFINED OpenMP_C_FLAGS)
-
-# check c compiler
-foreach (FLAG ${OpenMP_C_FLAG_CANDIDATES})
- set (SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
- set (CMAKE_REQUIRED_FLAGS "${FLAG}")
- unset (OpenMP_FLAG_DETECTED CACHE)
- message (STATUS "Try OpenMP C flag = [${FLAG}]")
- check_c_source_compiles ("${OpenMP_C_TEST_SOURCE}" OpenMP_FLAG_DETECTED)
- set (CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}")
- if (OpenMP_FLAG_DETECTED)
- set (OpenMP_C_FLAGS_INTERNAL "${FLAG}")
- break ()
- endif (OpenMP_FLAG_DETECTED)
-endforeach (FLAG ${OpenMP_C_FLAG_CANDIDATES})
-
-set (OpenMP_C_FLAGS "${OpenMP_C_FLAGS_INTERNAL}"
- CACHE STRING "C compiler flags for OpenMP parallization")
-
-# handle the standard arguments for find_package
-find_package_handle_standard_args (OpenMP DEFAULT_MSG
- OpenMP_C_FLAGS)
-
-if (MINGW)
- set (OpenMP_SHARED_LINKER_FLAGS "${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
- set (OpenMP_EXE_LINKER_FLAGS "${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
-endif ()
-
-mark_as_advanced (OpenMP_C_FLAGS OpenMP_SHARED_LINKER_FLAGS OpenMP_EXE_LINKER_FLAGS)
diff --git a/soxr/cmake/Modules/FindSIMD.cmake b/soxr/cmake/Modules/FindSIMD.cmake
deleted file mode 100644
index 6ac51cb..0000000
--- a/soxr/cmake/Modules/FindSIMD.cmake
+++ /dev/null
@@ -1,94 +0,0 @@
-# - Finds SIMD support
-#
-# The following variables are set:
-# SIMD_C_FLAGS - flags to add to the C compiler for this package.
-# SIMD_FOUND - true if support for this package is found.
-#
-#=============================================================================
-# Based on FindOpenMP.cmake, which is:
-#
-# Copyright 2009 Kitware, Inc.
-# Copyright 2008-2009 André Rigland Brodtkorb
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# * The names of Kitware, Inc., the Insight Consortium, or the names of
-# any consortium members, or of any contributors, may not be used to
-# endorse or promote products derived from this software without
-# specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-include (CheckCSourceCompiles)
-include (FindPackageHandleStandardArgs)
-
-if (WIN32) # Safety for when mixed lib/app compilers (but performance hit)
- set (GCC_WIN32_SIMD_OPTS "-mincoming-stack-boundary=2")
-endif ()
-
-set (SIMD_C_FLAG_CANDIDATES
- # x64
- " "
- # Microsoft Visual Studio x86
- "/arch:SSE /fp:fast -D__SSE__"
- # Gcc x86
- "-msse -mfpmath=sse ${GCC_WIN32_SIMD_OPTS}"
- # Gcc x86 (old versions)
- "-msse -mfpmath=sse"
-)
-
-set (SIMD_C_TEST_SOURCE
-"
-#include <xmmintrin.h>
-int main()
-{
- __m128 a, b;
- float vals[4] = {0};
- a = _mm_loadu_ps (vals);
- b = a;
- b = _mm_add_ps (a,b);
- _mm_storeu_ps (vals,b);
- return 0;
-}
-")
-
-if (DEFINED SIMD_C_FLAGS)
- set (SIMD_C_FLAG_CANDIDATES)
-endif ()
-
-foreach (FLAG ${SIMD_C_FLAG_CANDIDATES})
- set (SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
- set (CMAKE_REQUIRED_FLAGS "${FLAG}")
- unset (SIMD_FLAG_DETECTED CACHE)
- message (STATUS "Try SIMD C flag = [${FLAG}]")
- check_c_source_compiles ("${SIMD_C_TEST_SOURCE}" SIMD_FLAG_DETECTED)
- set (CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}")
- if (SIMD_FLAG_DETECTED)
- set (SIMD_C_FLAGS_INTERNAL "${FLAG}")
- break ()
- endif ()
-endforeach ()
-
-set (SIMD_C_FLAGS "${SIMD_C_FLAGS_INTERNAL}"
- CACHE STRING "C compiler flags for SIMD vectorization")
-
-find_package_handle_standard_args (SIMD DEFAULT_MSG SIMD_C_FLAGS SIMD_C_FLAGS)
-mark_as_advanced (SIMD_C_FLAGS)
diff --git a/soxr/cmake/Modules/FindSIMD32.cmake b/soxr/cmake/Modules/FindSIMD32.cmake
new file mode 100644
index 0000000..9e42373
--- /dev/null
+++ b/soxr/cmake/Modules/FindSIMD32.cmake
@@ -0,0 +1,54 @@
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Finds SIMD32 support
+#
+# The following variables are set:
+# SIMD32_C_FLAGS - flags to add to the C compiler for this package.
+# SIMD32_FOUND - true if support for this package is found.
+
+if (DEFINED SIMD32_C_FLAGS)
+ set (TRIAL_C_FLAGS)
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
+ set (TRIAL_C_FLAGS
+ # Gcc
+ "-mfpu=neon-vfpv4 -mcpu=cortex-a7"
+ "-mfpu=neon -mfloat-abi=hard"
+ "-mfpu=neon -mfloat-abi=softfp"
+ "-mfpu=neon -mfloat-abi=soft"
+ )
+ set (TEST_C_SOURCE "
+ #include <arm_neon.h>
+ int main(int c, char * * v) {
+ float32x4_t a = vdupq_n_f32((float)c), b = vdupq_n_f32((float)!!v);
+ return !vgetq_lane_u32(vceqq_f32(a,b),0);
+ }
+ ")
+else ()
+ if (WIN32) # Safety for when mixed lib/app compilers (but performance hit)
+ set (GCC_WIN32_SIMD32_OPTS "-mincoming-stack-boundary=2")
+ endif ()
+
+ set (TRIAL_C_FLAGS
+ # x64
+ " "
+ # MSVC x86
+ "/arch:SSE /fp:fast -D__SSE__"
+ # Gcc x86
+ "-msse -mfpmath=sse ${GCC_WIN32_SIMD32_OPTS}"
+ # Gcc x86 (old versions)
+ "-msse -mfpmath=sse"
+ )
+ set (TEST_C_SOURCE "
+ #include <xmmintrin.h>
+ int main(int c, char * * v) {
+ __m128 a = _mm_set_ss((float)c), b = _mm_set_ss((float)!!v);
+ return _mm_comineq_ss(a,b);
+ }
+ ")
+endif ()
+
+include (FindCFlags)
+
+FindCFlags ("SIMD32" "FLOAT-32 (single-precision) SIMD vectorization"
+ "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}")
diff --git a/soxr/cmake/Modules/FindSIMD64.cmake b/soxr/cmake/Modules/FindSIMD64.cmake
new file mode 100644
index 0000000..d412644
--- /dev/null
+++ b/soxr/cmake/Modules/FindSIMD64.cmake
@@ -0,0 +1,29 @@
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Finds SIMD64 support
+#
+# The following variables are set:
+# SIMD64_C_FLAGS - flags to add to the C compiler for this package.
+# SIMD64_FOUND - true if support for this package is found.
+
+if (DEFINED SIMD64_C_FLAGS OR CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
+ set (TRIAL_C_FLAGS)
+else ()
+ set (TRIAL_C_FLAGS
+ "-mavx" # Gcc
+ "/arch:AVX" # MSVC
+ )
+ set (TEST_C_SOURCE "
+ #ifndef __AVX__
+ #error
+ #endif
+ #include <immintrin.h>
+ int main() {return 0;}
+ ")
+endif ()
+
+include (FindCFlags)
+
+FindCFlags ("SIMD64" "FLOAT-64 (double-precision) SIMD vectorization"
+ "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}")
diff --git a/soxr/cmake/Modules/SetSystemProcessor.cmake b/soxr/cmake/Modules/SetSystemProcessor.cmake
new file mode 100644
index 0000000..8e2c292
--- /dev/null
+++ b/soxr/cmake/Modules/SetSystemProcessor.cmake
@@ -0,0 +1,37 @@
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# Sets CMAKE_SYSTEM_PROCESSOR for cross-compiling.
+
+macro (set_system_processor)
+ if (CMAKE_CROSSCOMPILING)
+ if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "unknown")
+ unset(CMAKE_SYSTEM_PROCESSOR)
+ endif ()
+ if (NOT DEFINED CMAKE_SYSTEM_PROCESSOR)
+ include (CheckCSourceCompiles)
+ set (CPU_LINES
+ "#if defined __x86_64__ || defined _M_X64 /*\;x86_64\;*/"
+ "#if defined __i386__ || defined _M_IX86 /*\;x86_32\;*/"
+ "#if defined __arm__ || defined _M_ARM /*\;arm\;*/"
+ )
+ foreach (CPU_LINE ${CPU_LINES})
+ string (CONCAT CPU_SOURCE "${CPU_LINE}" "
+ int main() {return 0;}
+ #endif
+ ")
+ unset (SYSTEM_PROCESSOR_DETECTED CACHE)
+ check_c_source_compiles ("${CPU_SOURCE}" SYSTEM_PROCESSOR_DETECTED)
+ if (SYSTEM_PROCESSOR_DETECTED)
+ list (GET CPU_LINE 1 CMAKE_SYSTEM_PROCESSOR)
+ message (STATUS "CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
+ break ()
+ endif ()
+ endforeach ()
+ endif ()
+
+ # N.B. Will not overwrite existing cache variable:
+ set (CMAKE_SYSTEM_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}"
+ CACHE STRING "Target system processor")
+ endif ()
+endmacro ()
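
The CPU_LINES idiom above merits a note: each entry hides the processor name
inside a C comment between escaped semicolons, so the matching CPU_LINE is
itself a three-element CMake list and list (GET CPU_LINE 1 ...) extracts the
name. For the first entry, the probe source handed to check_c_source_compiles
is simply:

  #if defined __x86_64__ || defined _M_X64 /*;x86_64;*/
  int main() {return 0;}
  #endif

This compiles only when the (cross-)compiler targets x86_64, so that is the
name recorded in CMAKE_SYSTEM_PROCESSOR.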
diff --git a/soxr/cmake/Modules/TestBigEndian.cmake b/soxr/cmake/Modules/TestBigEndian.cmake
deleted file mode 100644
index d80df20..0000000
--- a/soxr/cmake/Modules/TestBigEndian.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
-# Licence for this file: LGPL v2.1 See LICENCE for details.
-
-# - Macro to determine endian type
-# test_big_endian (VARIABLE)
-# VARIABLE - variable to store the result to
-
-macro (test_big_endian VARIABLE)
- if ("${HAVE_${VARIABLE}}" MATCHES "^${HAVE_${VARIABLE}}$")
- include (CheckCSourceRuns)
- check_c_source_runs ("int main() {union {long i; char c[sizeof(long)];}
- const u = {1}; return !!u.c[0];}" HAVE_${VARIABLE})
- set (${VARIABLE} "${HAVE_${VARIABLE}}" CACHE INTERNAL "1 if system is big endian" FORCE)
- endif ()
-endmacro ()
diff --git a/soxr/dist b/soxr/dist
new file mode 100644
index 0000000..ee68b30
--- /dev/null
+++ b/soxr/dist
@@ -0,0 +1,12 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# Makes the distribution tarball
+
+test $# = 1 -o `git status -s|wc -c` = 0
+rm -rf Release
+./go -j4
+cd Release
+make package_source
diff --git a/soxr/examples/1-single-block.c b/soxr/examples/1-single-block.c
index 3fb9201..3b919b4 100644
--- a/soxr/examples/1-single-block.c
+++ b/soxr/examples/1-single-block.c
@@ -25,7 +25,7 @@ const float in[] = { /* Input: 12 cycles of a sine wave with freq. = irate/4 */
int main(int argc, char const * arg[])
{
- double irate = argc > 1? atof(arg[1]) : 1; /* Default to upsampling */
+ double irate = argc > 1? atof(arg[1]) : 1; /* Default to interpolation */
double orate = argc > 2? atof(arg[2]) : 2; /* by a factor of 2. */
size_t olen = (size_t)(AL(in) * orate / irate + .5); /* Assay output len. */
diff --git a/soxr/examples/1a-lsr.c b/soxr/examples/1a-lsr.c
index e42e530..6b50a8f 100644
--- a/soxr/examples/1a-lsr.c
+++ b/soxr/examples/1a-lsr.c
@@ -12,7 +12,7 @@ float in[] = { /* Input: 12 cycles of a sine wave with freq. = irate/4 */
int main(int argc, char const * arg[])
{
- double irate = argc > 1? atof(arg[1]) : 1; /* Default to upsampling */
+ double irate = argc > 1? atof(arg[1]) : 1; /* Default to interpolation */
double orate = argc > 2? atof(arg[2]) : 2; /* by a factor of 2. */
size_t olen = (size_t)(AL(in) * orate / irate + .5); /* Assay output len. */
diff --git a/soxr/examples/3-options-input-fn.c b/soxr/examples/3-options-input-fn.c
index 38fbb0d..afd43b9 100644
--- a/soxr/examples/3-options-input-fn.c
+++ b/soxr/examples/3-options-input-fn.c
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
/* Example 3: extends example 2 with multiple channels, multiple datatypes,
@@ -14,7 +14,7 @@
* OUTPUT-RATE Ditto
* NUM-CHANNELS Number of interleaved channels
* IN-DATATYPE# 0:float32 1:float64 2:int32 3:int16
- * OUT-DATATYPE# Ditto
+ * OUT-DATATYPE# Ditto; or 11 for un-dithered int16
* Q-RECIPE Quality recipe (in hex) See soxr.h
* Q-FLAGS Quality flags (in hex) See soxr.h
* PASSBAND-END %
@@ -42,7 +42,7 @@ static size_t input_fn(input_context_t * p, soxr_cbuf_t * buf, size_t len)
int main(int n, char const * arg[])
{
- char const * const arg0 = n? --n, *arg++ : "";
+ char const * const arg0 = n? --n, *arg++ : "", * engine = "";
double const irate = n? --n, atof(*arg++) : 96000.;
double const orate = n? --n, atof(*arg++) : 44100.;
unsigned const chans = n? --n, (unsigned)atoi(*arg++) : 1;
@@ -94,6 +94,7 @@ int main(int n, char const * arg[])
}
if (!error) { /* If all is well, run the resampler: */
+ engine = soxr_engine(soxr);
USE_STD_STDIO;
/* Resample in blocks: */
do odone = soxr_output(soxr, obuf, olen);
@@ -106,8 +107,8 @@ int main(int n, char const * arg[])
soxr_delete(soxr);
free(obuf), free(ibuf);
/* Diagnostics: */
- fprintf(stderr, "%-26s %s; %lu clips; I/O: %s\n",
+ fprintf(stderr, "%-26s %s; %lu clips; I/O: %s (%s)\n",
arg0, soxr_strerror(error), (long unsigned)clips,
- ferror(stdin) || ferror(stdout)? strerror(errno) : "no error");
+ ferror(stdin) || ferror(stdout)? strerror(errno) : "no error", engine);
return !!error;
}
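
INSTALL notes that the engine selected for a given configuration can be
checked with example #3; the hunk above is where that report is wired in,
via soxr_engine(). A standalone C sketch of the same query — the rates and
channel count are arbitrary, and the printed name depends on the build and
on run-time CPU detection:

  #include <stdio.h>
  #include <soxr.h>

  int main(void)
  {
    soxr_error_t error;
    soxr_t soxr = soxr_create(96000, 44100, 2, &error, NULL, NULL, NULL);
    if (!error) {
      printf("engine: %s\n", soxr_engine(soxr)); /* selected engine's name */
      soxr_delete(soxr);
    }
    return !!error;
  }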
diff --git a/soxr/examples/4-split-channels.c b/soxr/examples/4-split-channels.c
index d6448aa..a9022ce 100644
--- a/soxr/examples/4-split-channels.c
+++ b/soxr/examples/4-split-channels.c
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
/* Example 4: variant of examples 2 & 3, demonstrating I/O with split channels.
@@ -13,6 +13,8 @@
*
* Note also (not shown in the examples) that split/interleaved channels may
* be used for input and output independently.
+ *
+ * Arguments are as example 3.
*/
#include
@@ -73,13 +75,17 @@ int main(int n, char const * arg[])
double const orate = n? --n, atof(*arg++) : 44100.;
unsigned const chans = n? --n, (unsigned)atoi(*arg++) : 1;
soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
- soxr_datatype_t const otype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+ unsigned const ospec = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ;
unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0;
+ double const passband_end = n? --n, atof(*arg++) : 0;
+ double const stopband_begin = n? --n, atof(*arg++) : 0;
+ double const phase_response = n? --n, atof(*arg++) : -1;
int const use_threads = n? --n, atoi(*arg++) : 1;
+ soxr_datatype_t const otype = ospec & 3;
- soxr_quality_spec_t const q_spec = soxr_quality_spec(q_recipe, q_flags);
- soxr_io_spec_t const io_spec=soxr_io_spec(itype|SOXR_SPLIT, otype|SOXR_SPLIT);
+ soxr_quality_spec_t q_spec = soxr_quality_spec(q_recipe, q_flags);
+ soxr_io_spec_t io_spec=soxr_io_spec(itype|SOXR_SPLIT, otype|SOXR_SPLIT);
soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads);
/* Allocate resampling input and output buffers in proportion to the input
@@ -102,11 +108,18 @@ int main(int n, char const * arg[])
size_t odone, written, need_input = 1, clips = 0;
soxr_error_t error;
+ soxr_t soxr;
+ unsigned i;
- soxr_t soxr = soxr_create(
+ /* Overrides (if given): */
+ if (passband_end > 0) q_spec.passband_end = passband_end / 100;
+ if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100;
+ if (phase_response >=0) q_spec.phase_response = phase_response;
+ io_spec.flags = ospec & ~7u;
+
+ soxr = soxr_create(
irate, orate, chans, &error, &io_spec, &q_spec, &runtime_spec);
- unsigned i;
for (i = 0; i < chans; ++i) {
ibuf_ptrs[i] = iptr;
obuf_ptrs[i] = optr;
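
The ospec decode added above is what makes the `11 for un-dithered int16'
note in example 3's usage text work: the low two bits select the output
datatype and the remaining bits pass straight through as io-spec flags. A C
sketch of the arithmetic — the presumption that 8 is soxr.h's no-dither flag
fits the `11' note but should be verified against the header:

  #include <stdio.h>

  int main(void)
  {
    unsigned const ospec = 11;          /* OUT-DATATYPE# argument */
    unsigned const otype = ospec & 3;   /* 3: int16, per example 3's table */
    unsigned const flags = ospec & ~7u; /* 8: presumed no-dither flag */
    printf("otype=%u flags=%#x\n", otype, flags);
    return 0;
  }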
diff --git a/soxr/examples/CMakeLists.txt b/soxr/examples/CMakeLists.txt
index 8107a4e..c8c17c9 100644
--- a/soxr/examples/CMakeLists.txt
+++ b/soxr/examples/CMakeLists.txt
@@ -1,25 +1,23 @@
-# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
-if (${BUILD_EXAMPLES})
- project (soxr) # Adds c++ compiler
- file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/[1-9]-*.[cC])
-elseif (${BUILD_TESTS})
- file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/3*.c)
-endif ()
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PROJECT_CXX_FLAGS}")
+link_libraries (${PROJECT_NAME} ${LIBM_LIBRARIES})
if (${BUILD_EXAMPLES} OR ${BUILD_TESTS})
+ set (SOURCES 3-options-input-fn)
if (${WITH_LSR_BINDINGS})
- set (LSR_SOURCES 1a-lsr.c)
+ set (LSR_SOURCES 1a-lsr)
endif ()
endif ()
-if (NOT BUILD_SHARED_LIBS AND OPENMP_FOUND)
- set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_C_FLAGS}")
+if (${BUILD_EXAMPLES})
+ list (APPEND SOURCES 1-single-block 2-stream 4-split-channels)
+ if (${WITH_VR32})
+ list (APPEND SOURCES 5-variable-rate)
+ endif ()
endif ()
-set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}")
-set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PROJECT_CXX_FLAGS}")
-link_libraries (soxr)
foreach (fe ${SOURCES} ${LSR_SOURCES})
get_filename_component (f ${fe} NAME_WE)
@@ -34,4 +32,5 @@ if (${BUILD_TESTS} AND ${WITH_LSR_BINDINGS})
endif ()
file (GLOB INSTALL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.[cCh])
-install (FILES ${INSTALL_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/README DESTINATION ${DOC_INSTALL_DIR}/examples)
+install (FILES ${INSTALL_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/README
+ DESTINATION ${DOC_INSTALL_DIR}/examples)
diff --git a/soxr/examples/examples-common.h b/soxr/examples/examples-common.h
index 585fac3..fc8ed82 100644
--- a/soxr/examples/examples-common.h
+++ b/soxr/examples/examples-common.h
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
/* Common includes etc. for the examples. */
@@ -17,10 +17,7 @@
#include <io.h>
#include <fcntl.h>
#define USE_STD_STDIO _setmode(_fileno(stdout), _O_BINARY), \
- _setmode(_fileno(stdin ), _O_BINARY);
- /* Sometimes missing, so ensure that it is defined: */
- #undef M_PI
- #define M_PI 3.14159265358979323846
+ _setmode(_fileno(stdin ), _O_BINARY)
#else
#define USE_STD_STDIO
#endif
@@ -38,8 +35,13 @@
#endif
#undef min
-#undef max
#define min(x,y) ((x)<(y)?(x):(y))
+
+#undef max
#define max(x,y) ((x)>(y)?(x):(y))
+#undef AL
#define AL(a) (sizeof(a)/sizeof((a)[0])) /* Array Length */
+
+#undef M_PI /* Sometimes missing, so ensure that it is defined: */
+#define M_PI 3.14159265358979323846
diff --git a/soxr/go b/soxr/go
new file mode 100644
index 0000000..7fba810
--- /dev/null
+++ b/soxr/go
@@ -0,0 +1,18 @@
+#!/bin/sh
+set -e
+
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+case "$1" in -j*) j="$1"; shift;; esac # Support -jX for parallel build/test
+
+test x"$1" = x && build=Release || build="$1"
+
+rm -f CMakeCache.txt # Prevent interference from any in-tree build
+
+mkdir -p "$build"
+cd "$build"
+
+cmake -Wno-dev -DCMAKE_BUILD_TYPE="$build" ..
+make $j
+ctest $j || echo "FAILURE details in $build/Testing/Temporary/LastTest.log"
diff --git a/soxr/go.bat b/soxr/go.bat
new file mode 100644
index 0000000..aabff75
--- /dev/null
+++ b/soxr/go.bat
@@ -0,0 +1,27 @@
+@echo off
+rem SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+rem Licence for this file: LGPL v2.1 See LICENCE for details.
+
+set build=%1
+if x%build% == x set build=Release
+
+rem Prevent interference from any in-tree build
+del/f CMakeCache.txt
+
+mkdir %build%
+cd %build%
+
+cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=%build% -Wno-dev ..
+if errorlevel 1 goto end
+
+nmake
+if errorlevel 1 goto end
+
+nmake test
+if errorlevel 1 goto error
+goto end
+
+:error
+echo FAILURE details in Testing\Temporary\LastTest.log
+
+:end
diff --git a/soxr/inst-check b/soxr/inst-check
new file mode 100644
index 0000000..8cf64b7
--- /dev/null
+++ b/soxr/inst-check
@@ -0,0 +1,25 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# Sanity-check of library installed on unix-like system
+
+# This script checks the installation of the entire library (including lsr).
+#
+# Distros using three separate packages can do the following (in order):
+#
+# * Install soxr pkg (i.e. basically, just the shared object)
+# * ./inst-check-soxr
+# * Install soxr-lsr pkg (i.e. basically, just the shared object)
+# * ./inst-check-soxr-lsr
+# * Install the -dev pkg (i.e. examples, headers, & pkg-config)
+# * ./inst-check PATH-OF-INSTALLED-EXAMPLES-DIR (e.g. /usr/share/doc/libsoxr/examples)
+
+# Where are the example source files:
+src=$1
+test x$src = x && src=/usr/local/share/doc/libsoxr/examples
+
+dir="$(dirname $(readlink -f $0))"
+$dir/inst-check-soxr $src
+$dir/inst-check-soxr-lsr $src
diff --git a/soxr/inst-check-soxr b/soxr/inst-check-soxr
new file mode 100644
index 0000000..5f923b8
--- /dev/null
+++ b/soxr/inst-check-soxr
@@ -0,0 +1,52 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# Sanity-check of sub-library installed on unix-like system
+
+arg="$1" # path to installed examples (if dev pkg installed); otherwise omitted
+dir="$(dirname $(readlink -f $0))"
+
+# Find the examples:
+src="$arg"
+test x"$src" = x && src="$dir/examples"
+cd $src
+
+# Somewhere to put the binaries:
+tmp=`mktemp -d`
+
+build_examples() {
+ if [ x"$arg" = x ]; then
+ echo "Examples in `pwd`; using local headers:" # for when dev pkg not installed
+ libs=-l$1
+ cflags=-I$dir/src
+ else
+ echo "Examples in `pwd`; using pkg-config:"
+ libs=$(pkg-config --libs $1)
+ cflags=$(pkg-config --cflags $1)
+ fi
+ for f in ?$2-*.[cC]; do
+ cc=cc; echo $f | grep -q C$ && cc=c++
+ out=$tmp/`echo $f | sed "s/.[cC]$//"`
+ cmd="$cc $cflags -o $out $f $libs -lm"
+ echo $cmd; $cmd
+ done
+}
+
+# Determine library:
+if [ `basename $0` = inst-check-soxr ]; then
+ build_examples soxr
+ gen="dd if=/dev/urandom count=1000"
+ $tmp/1-single-block 1 2 .
+ $gen 2> /dev/null | $tmp/2-stream 2>&1 >$tmp/stdout
+ $gen 2> /dev/null | $tmp/3-options-input-fn 6 7 2 2 0 2>&1 >$tmp/stdout
+ $gen 2> /dev/null | $tmp/4-split-channels 7 6 2 2 3 2>&1 >$tmp/stdout # Clipping expected here
+ $gen 2> /dev/null | $tmp/5-variable-rate 2>&1 >$tmp/stdout
+else
+ build_examples soxr-lsr a # lsr has 'a' suffix on example number.
+ $tmp/1a-lsr 1 2 .
+fi
+
+# Tidy up:
+rm -rf $tmp
diff --git a/soxr/inst-check-soxr-lsr b/soxr/inst-check-soxr-lsr
new file mode 100644
index 0000000..5f923b8
--- /dev/null
+++ b/soxr/inst-check-soxr-lsr
@@ -0,0 +1,52 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# Sanity-check of sub-library installed on unix-like system
+
+arg="$1" # path to installed examples (if dev pkg installed); otherwise omitted
+dir="$(dirname $(readlink -f $0))"
+
+# Find the examples:
+src="$arg"
+test x"$src" = x && src="$dir/examples"
+cd $src
+
+# Somewhere to put the binaries:
+tmp=`mktemp -d`
+
+build_examples() {
+ if [ x"$arg" = x ]; then
+ echo "Examples in `pwd`; using local headers:" # for when dev pkg not installed
+ libs=-l$1
+ cflags=-I$dir/src
+ else
+ echo "Examples in `pwd`; using pkg-config:"
+ libs=$(pkg-config --libs $1)
+ cflags=$(pkg-config --cflags $1)
+ fi
+ for f in ?$2-*.[cC]; do
+ cc=cc; echo $f | grep -q C$ && cc=c++
+ out=$tmp/`echo $f | sed "s/.[cC]$//"`
+ cmd="$cc $cflags -o $out $f $libs -lm"
+ echo $cmd; $cmd
+ done
+}
+
+# Determine library:
+if [ `basename $0` = inst-check-soxr ]; then
+ build_examples soxr
+ gen="dd if=/dev/urandom count=1000"
+ $tmp/1-single-block 1 2 .
+ $gen 2> /dev/null | $tmp/2-stream 2>&1 >$tmp/stdout
+ $gen 2> /dev/null | $tmp/3-options-input-fn 6 7 2 2 0 2>&1 >$tmp/stdout
+ $gen 2> /dev/null | $tmp/4-split-channels 7 6 2 2 3 2>&1 >$tmp/stdout # Clipping expected here
+ $gen 2> /dev/null | $tmp/5-variable-rate 2>&1 >$tmp/stdout
+else
+ build_examples soxr-lsr a # lsr has 'a' suffix on example number.
+ $tmp/1a-lsr 1 2 .
+fi
+
+# Tidy up:
+rm -rf $tmp
diff --git a/soxr/lsr-tests/CMakeLists.txt b/soxr/lsr-tests/CMakeLists.txt
new file mode 100644
index 0000000..4f718f7
--- /dev/null
+++ b/soxr/lsr-tests/CMakeLists.txt
@@ -0,0 +1,50 @@
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+list (APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules)
+
+find_package (FFTW)
+if (FFTW_FOUND)
+ include_directories (${FFTW_INCLUDE_DIRS})
+ link_libraries (${FFTW_LIBRARIES})
+ set (HAVE_FFTW3 1)
+endif ()
+
+find_package (sndfile)
+if (SNDFILE_FOUND)
+ include_directories (${SNDFILE_INCLUDE_DIRS})
+ link_libraries (${SNDFILE_LIBRARIES})
+ set (HAVE_SNDFILE 1)
+endif ()
+
+check_function_exists (lrintf HAVE_LRINTF)
+check_function_exists (alarm HAVE_ALARM)
+check_function_exists (signal HAVE_SIGNAL)
+check_include_files (sys/times.h HAVE_SYS_TIMES_H)
+
+configure_file (${CMAKE_CURRENT_SOURCE_DIR}/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/config.h)
+include_directories (${CMAKE_CURRENT_BINARY_DIR})
+
+add_library (tests_lib STATIC util calc_snr)
+
+link_libraries (tests_lib ${PROJECT_NAME}-lsr ${LIBM_LIBRARIES})
+
+enable_testing ()
+
+set (tests
+ callback_hang_test callback_test downsample_test
+ float_short_test misc_test multi_channel_test
+ reset_test simple_test termination_test varispeed_test)
+if (WITH_CR64 OR WITH_CR64S)
+ set (tests ${tests} snr_bw_test)
+endif ()
+
+foreach (test ${tests})
+ add_executable (${test} ${test})
+ add_test (lsr-${test} ${BIN}${test})
+ set_property (TEST lsr-${test} PROPERTY ENVIRONMENT "SOXR_LSR_STRICT=1")
+endforeach ()
+
+add_executable (multichan_throughput_test multichan_throughput_test)
+add_executable (throughput_test throughput_test )
+add_executable (sndfile-resample sndfile-resample)
diff --git a/soxr/lsr-tests/COPYING b/soxr/lsr-tests/COPYING
new file mode 100644
index 0000000..d60c31a
--- /dev/null
+++ b/soxr/lsr-tests/COPYING
@@ -0,0 +1,340 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/soxr/lsr-tests/README b/soxr/lsr-tests/README
new file mode 100644
index 0000000..f468446
--- /dev/null
+++ b/soxr/lsr-tests/README
@@ -0,0 +1,8 @@
+The C source and header files in this directory have been copied from
+the `libsamplerate' project and are copyrighted by its authors -- see
+the notices within the files and the file `COPYING' for details.
+
+They are used here to test libsoxr's optional libsamplerate-like
+wrapper. The only modifications made are to the file `snr_bw_test.c' to
+remove reliance on certain frequency response troughs that are specific
+to libsamplerate.
diff --git a/soxr/lsr-tests/calc_snr.c b/soxr/lsr-tests/calc_snr.c
new file mode 100644
index 0000000..ddfc04c
--- /dev/null
+++ b/soxr/lsr-tests/calc_snr.c
@@ -0,0 +1,242 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include "util.h"
+
+#if (HAVE_FFTW3 == 1)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <fftw3.h>
+
+#define MAX_SPEC_LEN (1<<18)
+#define MAX_PEAKS 10
+
+static void log_mag_spectrum (double *input, int len, double *magnitude) ;
+static void smooth_mag_spectrum (double *magnitude, int len) ;
+static double find_snr (const double *magnitude, int len, int expected_peaks) ;
+
+typedef struct
+{ double peak ;
+ int index ;
+} PEAK_DATA ;
+
+double
+calculate_snr (float *data, int len, int expected_peaks)
+{ static double magnitude [MAX_SPEC_LEN] ;
+ static double datacopy [MAX_SPEC_LEN] ;
+
+ double snr = 200.0 ;
+ int k ;
+
+ if (len > MAX_SPEC_LEN)
+ { printf ("%s : line %d : data length too large.\n", __FILE__, __LINE__) ;
+ exit (1) ;
+ } ;
+
+ for (k = 0 ; k < len ; k++)
+ datacopy [k] = data [k] ;
+
+ /* Pad the data just a little to speed up the FFT. */
+ while ((len & 0x1F) && len < MAX_SPEC_LEN)
+ { datacopy [len] = 0.0 ;
+ len ++ ;
+ } ;
+
+ log_mag_spectrum (datacopy, len, magnitude) ;
+ smooth_mag_spectrum (magnitude, len / 2) ;
+
+ snr = find_snr (magnitude, len, expected_peaks) ;
+
+ return snr ;
+} /* calculate_snr */
+
+/*==============================================================================
+** There is a slight problem with trying to measure SNR with the method used
+** here; the side lobes of the windowed FFT can look like a noise/aliasing peak.
+** The solution is to smooth the magnitude spectrum by wiping out troughs
+** between adjacent peaks as done here.
+** This removes side lobe peaks without affecting noise/aliasing peaks.
+*/
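+/* Concretely: once a larger and a smaller adjacent peak have been found,
+** linear_smooth() walks from the smaller peak towards the larger one and
+** replaces every sample that is lower than its predecessor with 0.999
+** times that predecessor, filling in the trough between the two peaks.
+*/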
+
+static void linear_smooth (double *mag, PEAK_DATA *larger, PEAK_DATA *smaller) ;
+
+static void
+smooth_mag_spectrum (double *mag, int len)
+{ PEAK_DATA peaks [2] ;
+
+ int k ;
+
+ memset (peaks, 0, sizeof (peaks)) ;
+
+ /* Find first peak. */
+ for (k = 1 ; k < len - 1 ; k++)
+ { if (mag [k - 1] < mag [k] && mag [k] >= mag [k + 1])
+ { peaks [0].peak = mag [k] ;
+ peaks [0].index = k ;
+ break ;
+ } ;
+ } ;
+
+	/* Find subsequent peaks and smooth between peaks. */
+ for (k = peaks [0].index + 1 ; k < len - 1 ; k++)
+ { if (mag [k - 1] < mag [k] && mag [k] >= mag [k + 1])
+ { peaks [1].peak = mag [k] ;
+ peaks [1].index = k ;
+
+ if (peaks [1].peak > peaks [0].peak)
+ linear_smooth (mag, &peaks [1], &peaks [0]) ;
+ else
+ linear_smooth (mag, &peaks [0], &peaks [1]) ;
+ peaks [0] = peaks [1] ;
+ } ;
+ } ;
+
+} /* smooth_mag_spectrum */
+
+static void
+linear_smooth (double *mag, PEAK_DATA *larger, PEAK_DATA *smaller)
+{ int k ;
+
+ if (smaller->index < larger->index)
+ { for (k = smaller->index + 1 ; k < larger->index ; k++)
+ mag [k] = (mag [k] < mag [k - 1]) ? 0.999 * mag [k - 1] : mag [k] ;
+ }
+ else
+ { for (k = smaller->index - 1 ; k >= larger->index ; k--)
+ mag [k] = (mag [k] < mag [k + 1]) ? 0.999 * mag [k + 1] : mag [k] ;
+ } ;
+
+} /* linear_smooth */
+
+/*==============================================================================
+*/
+
+static int
+peak_compare (const void *vp1, const void *vp2)
+{ const PEAK_DATA *peak1, *peak2 ;
+
+ peak1 = (const PEAK_DATA*) vp1 ;
+ peak2 = (const PEAK_DATA*) vp2 ;
+
+ return (peak1->peak < peak2->peak) ? 1 : -1 ;
+} /* peak_compare */
+
+static double
+find_snr (const double *magnitude, int len, int expected_peaks)
+{ PEAK_DATA peaks [MAX_PEAKS] ;
+
+ int k, peak_count = 0 ;
+ double snr ;
+
+ memset (peaks, 0, sizeof (peaks)) ;
+
+ /* Find the MAX_PEAKS largest peaks. */
+ for (k = 1 ; k < len - 1 ; k++)
+ { if (magnitude [k - 1] < magnitude [k] && magnitude [k] >= magnitude [k + 1])
+ { if (peak_count < MAX_PEAKS)
+ { peaks [peak_count].peak = magnitude [k] ;
+ peaks [peak_count].index = k ;
+ peak_count ++ ;
+ qsort (peaks, peak_count, sizeof (PEAK_DATA), peak_compare) ;
+ }
+ else if (magnitude [k] > peaks [MAX_PEAKS - 1].peak)
+ { peaks [MAX_PEAKS - 1].peak = magnitude [k] ;
+ peaks [MAX_PEAKS - 1].index = k ;
+ qsort (peaks, MAX_PEAKS, sizeof (PEAK_DATA), peak_compare) ;
+ } ;
+ } ;
+ } ;
+
+ if (peak_count < expected_peaks)
+ { printf ("\n%s : line %d : bad peak_count (%d), expected %d.\n\n", __FILE__, __LINE__, peak_count, expected_peaks) ;
+ return -1.0 ;
+ } ;
+
+ /* Sort the peaks. */
+ qsort (peaks, peak_count, sizeof (PEAK_DATA), peak_compare) ;
+
+ snr = peaks [0].peak ;
+ for (k = 1 ; k < peak_count ; k++)
+ if (fabs (snr - peaks [k].peak) > 10.0)
+ return fabs (peaks [k].peak) ;
+
+ return snr ;
+} /* find_snr */
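+/* Note: log_mag_spectrum() below normalises the spectrum so that its
+** largest peak sits at 0 dB; the value returned by find_snr() is therefore
+** the absolute level, in dB, of the strongest peak lying more than 10 dB
+** below the signal peak, i.e. the worst alias/noise component.
+*/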
+
+static void
+log_mag_spectrum (double *input, int len, double *magnitude)
+{ fftw_plan plan = NULL ;
+
+ double maxval ;
+ int k ;
+
+ if (input == NULL || magnitude == NULL)
+ return ;
+
+ plan = fftw_plan_r2r_1d (len, input, magnitude, FFTW_R2HC, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT) ;
+ if (plan == NULL)
+ { printf ("%s : line %d : create plan failed.\n", __FILE__, __LINE__) ;
+ exit (1) ;
+ } ;
+
+ fftw_execute (plan) ;
+
+ fftw_destroy_plan (plan) ;
+
+ /* (k < N/2 rounded up) */
+ maxval = 0.0 ;
+ for (k = 1 ; k < len / 2 ; k++)
+ { magnitude [k] = sqrt (magnitude [k] * magnitude [k] + magnitude [len - k - 1] * magnitude [len - k - 1]) ;
+ maxval = (maxval < magnitude [k]) ? magnitude [k] : maxval ;
+ } ;
+
+ memset (magnitude + len / 2, 0, len / 2 * sizeof (magnitude [0])) ;
+
+ /* Don't care about DC component. Make it zero. */
+ magnitude [0] = 0.0 ;
+
+ /* log magnitude. */
+ for (k = 0 ; k < len ; k++)
+ { magnitude [k] = magnitude [k] / maxval ;
+ magnitude [k] = (magnitude [k] < 1e-15) ? -200.0 : 20.0 * log10 (magnitude [k]) ;
+ } ;
+
+ return ;
+} /* log_mag_spectrum */
+
+#else /* ! (HAVE_FFTW3 == 1) */
+
+double
+calculate_snr (float *data, int len, int expected_peaks)
+{ double snr = 200.0 ;
+
+ data = data ;
+ len = len ;
+ expected_peaks = expected_peaks ;
+
+ return snr ;
+} /* calculate_snr */
+
+#endif
+
diff --git a/soxr/lsr-tests/callback_hang_test.c b/soxr/lsr-tests/callback_hang_test.c
new file mode 100644
index 0000000..be89369
--- /dev/null
+++ b/soxr/lsr-tests/callback_hang_test.c
@@ -0,0 +1,131 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#if HAVE_ALARM && HAVE_SIGNAL && HAVE_SIGALRM
+
+#include <signal.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define SHORT_BUFFER_LEN 512
+#define LONG_BUFFER_LEN (1 << 14)
+
+typedef struct
+{ double ratio ;
+ int count ;
+} SRC_PAIR ;
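+/* Each pair is a conversion ratio plus a number of output frames to
+** request.  Ratio changes combined with tiny or zero read counts are the
+** conditions under which src_callback_read() has been known to hang; the
+** one-second alarm() below turns any such hang into a test failure.
+*/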
+
+static void callback_hang_test (int converter) ;
+
+static void alarm_handler (int number) ;
+static long input_callback (void *cb_data, float **data) ;
+
+
+int
+main (void)
+{
+ /* Set up SIGALRM handler. */
+ signal (SIGALRM, alarm_handler) ;
+
+ puts ("") ;
+ callback_hang_test (SRC_ZERO_ORDER_HOLD) ;
+ callback_hang_test (SRC_LINEAR) ;
+ callback_hang_test (SRC_SINC_FASTEST) ;
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+
+static void
+callback_hang_test (int converter)
+{ static float output [LONG_BUFFER_LEN] ;
+ static SRC_PAIR pairs [] =
+ {
+ { 1.2, 5 }, { 1.1, 1 }, { 1.0, 1 }, { 3.0, 1 }, { 2.0, 1 }, { 0.3, 1 },
+ { 1.2, 0 }, { 1.1, 10 }, { 1.0, 1 }
+ } ;
+
+
+ SRC_STATE *src_state ;
+
+ double src_ratio = 1.0 ;
+ int k, error ;
+
+ printf ("\tcallback_hang_test (%-28s) ....... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ /* Perform sample rate conversion. */
+ src_state = src_callback_new (input_callback, converter, 1, &error, NULL) ;
+ if (src_state == NULL)
+ { printf ("\n\nLine %d : src_callback_new () failed : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ for (k = 0 ; k < ARRAY_LEN (pairs) ; k++)
+ { alarm (1) ;
+ src_ratio = pairs [k].ratio ;
+ src_callback_read (src_state, src_ratio, pairs [k].count, output) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ alarm (0) ;
+ puts ("ok") ;
+
+ return ;
+} /* callback_hang_test */
+
+static void
+alarm_handler (int number)
+{
+ (void) number ;
+ printf ("\n\n Error : Hang inside src_callback_read() detected. Exiting!\n\n") ;
+ exit (1) ;
+} /* alarm_handler */
+
+static long
+input_callback (void *cb_data, float **data)
+{
+ static float buffer [20] ;
+
+ (void) cb_data ;
+ *data = buffer ;
+
+ return ARRAY_LEN (buffer) ;
+} /* input_callback */
+
+#else
+
+int
+main (void)
+{
+ puts ("\tCan't run this test on this platform.") ;
+ return 0 ;
+} /* main */
+
+#endif
diff --git a/soxr/lsr-tests/callback_test.c b/soxr/lsr-tests/callback_test.c
new file mode 100644
index 0000000..0854d64
--- /dev/null
+++ b/soxr/lsr-tests/callback_test.c
@@ -0,0 +1,243 @@
+/*
+** Copyright (C) 2003-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 10000
+#define CB_READ_LEN 256
+
+static void callback_test (int converter, double ratio) ;
+static void end_of_stream_test (int converter) ;
+
+int
+main (void)
+{ static double src_ratios [] =
+ { 1.0, 0.099, 0.1, 0.33333333, 0.789, 1.0001, 1.9, 3.1, 9.9
+ } ;
+
+ int k ;
+
+ puts ("") ;
+
+ puts (" Zero Order Hold interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ callback_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ;
+
+ puts (" Linear interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ callback_test (SRC_LINEAR, src_ratios [k]) ;
+
+ puts (" Sinc interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ callback_test (SRC_SINC_FASTEST, src_ratios [k]) ;
+
+ puts ("") ;
+
+ puts (" End of stream test :") ;
+ end_of_stream_test (SRC_ZERO_ORDER_HOLD) ;
+ end_of_stream_test (SRC_LINEAR) ;
+ end_of_stream_test (SRC_SINC_FASTEST) ;
+
+ puts ("") ;
+ return 0 ;
+} /* main */
+
+/*=====================================================================================
+*/
+
+typedef struct
+{ int channels ;
+ long count, total ;
+ int end_of_data ;
+ float data [BUFFER_LEN] ;
+} TEST_CB_DATA ;
+
+static long
+test_callback_func (void *cb_data, float **data)
+{ TEST_CB_DATA *pcb_data ;
+
+ long frames ;
+
+ if ((pcb_data = cb_data) == NULL)
+ return 0 ;
+
+ if (data == NULL)
+ return 0 ;
+
+ if (pcb_data->total - pcb_data->count > CB_READ_LEN)
+ frames = CB_READ_LEN / pcb_data->channels ;
+ else
+ frames = (pcb_data->total - pcb_data->count) / pcb_data->channels ;
+
+ *data = pcb_data->data + pcb_data->count ;
+ pcb_data->count += frames ;
+
+ return frames ;
+} /* test_callback_func */
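+/* The callback API is pull-based: each src_callback_read() call requests up
+** to a given number of output frames and the library invokes
+** test_callback_func() whenever it needs more input; returning 0 frames
+** signals end of input.
+*/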
+
+
+static void
+callback_test (int converter, double src_ratio)
+{ static TEST_CB_DATA test_callback_data ;
+ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+
+ long read_count, read_total ;
+ int error ;
+
+ printf ("\tcallback_test (SRC ratio = %6.4f) ........... ", src_ratio) ;
+ fflush (stdout) ;
+
+ test_callback_data.channels = 2 ;
+ test_callback_data.count = 0 ;
+ test_callback_data.end_of_data = 0 ;
+ test_callback_data.total = ARRAY_LEN (test_callback_data.data) ;
+
+ if ((src_state = src_callback_new (test_callback_func, converter, test_callback_data.channels, &error, &test_callback_data)) == NULL)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ read_total = 0 ;
+ do
+ { /* We will be throwing away output data, so just grab as much as possible. */
+ read_count = ARRAY_LEN (output) / test_callback_data.channels ;
+ read_count = src_callback_read (src_state, src_ratio, read_count, output) ;
+ read_total += read_count ;
+ }
+ while (read_count > 0) ;
+
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (fabs (read_total / src_ratio - ARRAY_LEN (test_callback_data.data)) > 2.0)
+ { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ;
+ printf (" input len : %d\n", ARRAY_LEN (test_callback_data.data)) ;
+ printf (" output len : %ld (should be %g +/- 2)\n\n", read_total,
+ floor (0.5 + src_ratio * ARRAY_LEN (test_callback_data.data))) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* callback_test */
+
+/*=====================================================================================
+*/
+
+static long
+eos_callback_func (void *cb_data, float **data)
+{
+ TEST_CB_DATA *pcb_data ;
+ long frames ;
+
+ if (data == NULL)
+ return 0 ;
+
+ if ((pcb_data = cb_data) == NULL)
+ return 0 ;
+
+ /*
+ ** Return immediately if there is no more data.
+ ** In this case, the output pointer 'data' will not be set and
+ ** valgrind should not warn about it.
+ */
+ if (pcb_data->end_of_data)
+ return 0 ;
+
+ if (pcb_data->total - pcb_data->count > CB_READ_LEN)
+ frames = CB_READ_LEN / pcb_data->channels ;
+ else
+ frames = (pcb_data->total - pcb_data->count) / pcb_data->channels ;
+
+ *data = pcb_data->data + pcb_data->count ;
+ pcb_data->count += frames ;
+
+ /*
+ ** Set end_of_data so that the next call to the callback function will
+	** return a zero count without setting the 'data' pointer.
+ */
+ if (pcb_data->total < 2 * pcb_data->count)
+ pcb_data->end_of_data = 1 ;
+
+ return frames ;
+} /* eos_callback_func */
+
+
+static void
+end_of_stream_test (int converter)
+{ static TEST_CB_DATA test_callback_data ;
+ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+
+ double src_ratio = 0.3 ;
+ long read_count, read_total ;
+ int error ;
+
+ printf ("\t%-30s ........... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ test_callback_data.channels = 2 ;
+ test_callback_data.count = 0 ;
+ test_callback_data.end_of_data = 0 ;
+ test_callback_data.total = ARRAY_LEN (test_callback_data.data) ;
+
+ if ((src_state = src_callback_new (eos_callback_func, converter, test_callback_data.channels, &error, &test_callback_data)) == NULL)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ read_total = 0 ;
+ do
+ { /* We will be throwing away output data, so just grab as much as possible. */
+ read_count = ARRAY_LEN (output) / test_callback_data.channels ;
+ read_count = src_callback_read (src_state, src_ratio, read_count, output) ;
+ read_total += read_count ;
+ }
+ while (read_count > 0) ;
+
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (test_callback_data.end_of_data == 0)
+ { printf ("\n\nLine %d : test_callback_data.end_of_data should not be 0."
+ " This is a bug in the test.\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+ return ;
+} /* end_of_stream_test */
diff --git a/soxr/lsr-tests/cmake/Modules/FindFFTW.cmake b/soxr/lsr-tests/cmake/Modules/FindFFTW.cmake
new file mode 100644
index 0000000..409268e
--- /dev/null
+++ b/soxr/lsr-tests/cmake/Modules/FindFFTW.cmake
@@ -0,0 +1,23 @@
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Find FFTW
+# Find the native installation of this package: includes and libraries.
+#
+# FFTW_INCLUDES - where to find headers for this package.
+# FFTW_LIBRARIES - List of libraries when using this package.
+# FFTW_FOUND - True if this package can be found.
+
+if (FFTW_INCLUDES)
+ set (FFTW_FIND_QUIETLY TRUE)
+endif (FFTW_INCLUDES)
+
+find_path (FFTW_INCLUDES fftw3.h)
+
+find_library (FFTW_LIBRARIES NAMES fftw3)
+
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (
+ FFTW DEFAULT_MSG FFTW_LIBRARIES FFTW_INCLUDES)
+
+mark_as_advanced (FFTW_LIBRARIES FFTW_INCLUDES)
diff --git a/soxr/lsr-tests/cmake/Modules/Findsndfile.cmake b/soxr/lsr-tests/cmake/Modules/Findsndfile.cmake
new file mode 100644
index 0000000..b2fd725
--- /dev/null
+++ b/soxr/lsr-tests/cmake/Modules/Findsndfile.cmake
@@ -0,0 +1,23 @@
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Find SNDFILE
+# Find the native installation of this package: includes and libraries.
+#
+# SNDFILE_INCLUDES - where to find headers for this package.
+# SNDFILE_LIBRARIES - List of libraries when using this package.
+# SNDFILE_FOUND - True if this package can be found.
+
+if (SNDFILE_INCLUDES)
+ set (SNDFILE_FIND_QUIETLY TRUE)
+endif (SNDFILE_INCLUDES)
+
+find_path (SNDFILE_INCLUDES sndfile.h)
+
+find_library (SNDFILE_LIBRARIES NAMES sndfile)
+
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (
+ SNDFILE DEFAULT_MSG SNDFILE_LIBRARIES SNDFILE_INCLUDES)
+
+mark_as_advanced (SNDFILE_LIBRARIES SNDFILE_INCLUDES)
diff --git a/soxr/lsr-tests/config.h.in b/soxr/lsr-tests/config.h.in
new file mode 100644
index 0000000..1095e00
--- /dev/null
+++ b/soxr/lsr-tests/config.h.in
@@ -0,0 +1,24 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxsrc_lsr_tests_config_included
+#define soxsrc_lsr_tests_config_included
+
+#cmakedefine01 HAVE_ALARM
+#cmakedefine01 HAVE_FFTW3
+#cmakedefine01 HAVE_LRINTF
+#cmakedefine01 HAVE_LRINT
+#cmakedefine01 HAVE_SIGNAL
+#cmakedefine01 HAVE_SNDFILE
+#cmakedefine01 HAVE_SYS_TIMES_H
+
+#if HAVE_SIGNAL
+  #include <signal.h>
+ #if defined SIGALRM
+ #define HAVE_SIGALRM 1
+ #else
+ #define HAVE_SIGALRM 0
+ #endif
+#endif
+
+#endif
diff --git a/soxr/lsr-tests/downsample_test.c b/soxr/lsr-tests/downsample_test.c
new file mode 100644
index 0000000..87243e7
--- /dev/null
+++ b/soxr/lsr-tests/downsample_test.c
@@ -0,0 +1,61 @@
+/*
+** Copyright (C) 2008-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <samplerate.h>
+
+#include "util.h"
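+
+/* Extreme downsampling (a ratio of 1/255 into a 10-frame output buffer) is
+** the case exercised here; the test only requires src_simple() to succeed,
+** and the converted values themselves are not checked.
+*/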
+
+static void
+downsample_test (int converter)
+{ static float in [1000], out [10] ;
+ SRC_DATA data ;
+
+ printf (" downsample_test (%-28s) ....... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ data.src_ratio = 1.0 / 255.0 ;
+ data.input_frames = ARRAY_LEN (in) ;
+ data.output_frames = ARRAY_LEN (out) ;
+ data.data_in = in ;
+ data.data_out = out ;
+
+ if (src_simple (&data, converter, 1))
+ { puts ("src_simple failed.") ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+} /* downsample_test */
+
+int
+main (void)
+{
+ puts ("") ;
+
+ downsample_test (SRC_ZERO_ORDER_HOLD) ;
+ downsample_test (SRC_LINEAR) ;
+ downsample_test (SRC_SINC_FASTEST) ;
+ downsample_test (SRC_SINC_MEDIUM_QUALITY) ;
+ downsample_test (SRC_SINC_BEST_QUALITY) ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
diff --git a/soxr/lsr-tests/float_cast.h b/soxr/lsr-tests/float_cast.h
new file mode 100644
index 0000000..77ad5b4
--- /dev/null
+++ b/soxr/lsr-tests/float_cast.h
@@ -0,0 +1,281 @@
+/*
+** Copyright (C) 2001-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU Lesser General Public License as published by
+** the Free Software Foundation; either version 2.1 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU Lesser General Public License for more details.
+**
+** You should have received a copy of the GNU Lesser General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+/* Version 1.5 */
+
+#ifndef FLOAT_CAST_HEADER
+#define FLOAT_CAST_HEADER
+
+/*============================================================================
+** On Intel Pentium processors (especially PIII and probably P4), converting
+** from float to int is very slow. To meet the C specs, the code produced by
+** most C compilers targeting Pentium needs to change the FPU rounding mode
+** before the float to int conversion is performed.
+**
+** Changing the FPU rounding mode causes the FPU pipeline to be flushed. It
+** is this flushing of the pipeline which is so slow.
+**
+** Fortunately the ISO C99 specifications define the functions lrint, lrintf,
+** llrint and llrintf which fix this problem as a side effect.
+**
+** On Unix-like systems, the configure process should have detected the
+** presence of these functions. If they weren't found we have to replace them
+** here with a standard C cast.
+*/
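+
+/* Illustrative sketch only: with lrintf() available (or replaced below), a
+** normalised float sample can be converted with, e.g.,
+**
+**     short s = (short) lrintf (32767.0 * f) ;
+**
+** which rounds in the FPU's current rounding mode rather than truncating,
+** avoiding the costly rounding-mode change that a plain cast can incur.
+*/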
+
+/*
+** The C99 prototypes for lrint and lrintf are as follows:
+**
+** long int lrintf (float x) ;
+** long int lrint (double x) ;
+*/
+
+#include "config.h"
+
+/*
+** The presence of the required functions are detected during the configure
+** process and the values HAVE_LRINT and HAVE_LRINTF are set accordingly in
+** the config.h file.
+*/
+
+#define HAVE_LRINT_REPLACEMENT 0
+
+#if (HAVE_LRINT && HAVE_LRINTF)
+
+ /*
+ ** These defines enable functionality introduced with the 1999 ISO C
+ ** standard. They must be defined before the inclusion of math.h to
+ ** engage them. If optimisation is enabled, these functions will be
+ ** inlined. With optimisation switched off, you have to link in the
+ ** maths library using -lm.
+ */
+
+ #define _ISOC9X_SOURCE 1
+ #define _ISOC99_SOURCE 1
+
+ #define __USE_ISOC9X 1
+ #define __USE_ISOC99 1
+
+	#include <math.h>
+
+#elif (defined (__CYGWIN__))
+
+	#include <math.h>
+
+ #undef HAVE_LRINT_REPLACEMENT
+ #define HAVE_LRINT_REPLACEMENT 1
+
+ #undef lrint
+ #undef lrintf
+
+ #define lrint double2int
+ #define lrintf float2int
+
+ /*
+ ** The native CYGWIN lrint and lrintf functions are buggy:
+ ** http://sourceware.org/ml/cygwin/2005-06/msg00153.html
+ ** http://sourceware.org/ml/cygwin/2005-09/msg00047.html
+ ** and slow.
+ ** These functions (pulled from the Public Domain MinGW math.h header)
+ ** replace the native versions.
+ */
+
+ static inline long double2int (double in)
+ { long retval ;
+
+ __asm__ __volatile__
+ ( "fistpl %0"
+ : "=m" (retval)
+ : "t" (in)
+ : "st"
+ ) ;
+
+ return retval ;
+ } /* double2int */
+
+ static inline long float2int (float in)
+ { long retval ;
+
+ __asm__ __volatile__
+ ( "fistpl %0"
+ : "=m" (retval)
+ : "t" (in)
+ : "st"
+ ) ;
+
+ return retval ;
+ } /* float2int */
+
+#elif (defined (WIN64) || defined(_WIN64))
+
+	/* The Win64 section should be placed before the Win32 one, because
+	** most likely both WIN32 and WIN64 will be defined in the 64-bit case.
+ */
+
+	#include <math.h>
+
+ /* Win64 doesn't seem to have these functions, nor inline assembly.
+ ** Therefore implement inline versions of these functions here.
+ */
+	#include <emmintrin.h>
+	#include <mmintrin.h>
+
+ __inline long int
+ lrint(double flt)
+ {
+ return _mm_cvtsd_si32(_mm_load_sd(&flt));
+ }
+
+ __inline long int
+ lrintf(float flt)
+ {
+ return _mm_cvtss_si32(_mm_load_ss(&flt));
+ }
+
+#elif (defined (WIN32) || defined (_WIN32))
+
+ #undef HAVE_LRINT_REPLACEMENT
+ #define HAVE_LRINT_REPLACEMENT 1
+
+	#include <math.h>
+
+ /*
+ ** Win32 doesn't seem to have these functions.
+ ** Therefore implement inline versions of these functions here.
+ */
+
+ __inline long int
+ lrint (double flt)
+ { int intgr ;
+
+ _asm
+ { fld flt
+ fistp intgr
+ } ;
+
+ return intgr ;
+ }
+
+ __inline long int
+ lrintf (float flt)
+ { int intgr ;
+
+ _asm
+ { fld flt
+ fistp intgr
+ } ;
+
+ return intgr ;
+ }
+
+#elif (defined (__MWERKS__) && defined (macintosh))
+
+ /* This MacOS 9 solution was provided by Stephane Letz */
+
+ #undef HAVE_LRINT_REPLACEMENT
+ #define HAVE_LRINT_REPLACEMENT 1
+	#include <math.h>
+
+ #undef lrint
+ #undef lrintf
+
+ #define lrint double2int
+ #define lrintf float2int
+
+ inline int
+ float2int (register float in)
+ { long res [2] ;
+
+ asm
+ { fctiw in, in
+ stfd in, res
+ }
+ return res [1] ;
+ } /* float2int */
+
+ inline int
+ double2int (register double in)
+ { long res [2] ;
+
+ asm
+ { fctiw in, in
+ stfd in, res
+ }
+ return res [1] ;
+ } /* double2int */
+
+#elif (defined (__MACH__) && defined (__APPLE__))
+
+ /* For Apple MacOSX. */
+
+ #undef HAVE_LRINT_REPLACEMENT
+ #define HAVE_LRINT_REPLACEMENT 1
+	#include <math.h>
+
+ #undef lrint
+ #undef lrintf
+
+ #define lrint double2int
+ #define lrintf float2int
+
+ inline static long
+ float2int (register float in)
+ { int res [2] ;
+
+ __asm__ __volatile__
+ ( "fctiw %1, %1\n\t"
+ "stfd %1, %0"
+ : "=m" (res) /* Output */
+ : "f" (in) /* Input */
+ : "memory"
+ ) ;
+
+ return res [1] ;
+ } /* lrintf */
+
+ inline static long
+ double2int (register double in)
+ { int res [2] ;
+
+ __asm__ __volatile__
+ ( "fctiw %1, %1\n\t"
+ "stfd %1, %0"
+ : "=m" (res) /* Output */
+ : "f" (in) /* Input */
+ : "memory"
+ ) ;
+
+ return res [1] ;
+ } /* lrint */
+
+#else
+ #ifndef __sgi
+ #warning "Don't have the functions lrint() and lrintf()."
+ #warning "Replacing these functions with a standard C cast."
+ #endif
+
+	#include <math.h>
+
+ #define lrint(dbl) ((long) (dbl))
+ #define lrintf(flt) ((long) (flt))
+
+#endif
+
+
+#endif /* FLOAT_CAST_HEADER */
+
diff --git a/soxr/lsr-tests/float_short_test.c b/soxr/lsr-tests/float_short_test.c
new file mode 100644
index 0000000..6664a3b
--- /dev/null
+++ b/soxr/lsr-tests/float_short_test.c
@@ -0,0 +1,192 @@
+/*
+** Copyright (C) 2003-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 10000
+
+static void float_to_short_test (void) ;
+static void short_to_float_test (void) ;
+
+static void float_to_int_test (void) ;
+static void int_to_float_test (void) ;
+
+int
+main (void)
+{
+ puts ("") ;
+
+ float_to_short_test () ;
+ short_to_float_test () ;
+
+ float_to_int_test () ;
+ int_to_float_test () ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+/*=====================================================================================
+*/
+
+static void
+float_to_short_test (void)
+{
+ static float fpos [] =
+ { 0.95, 0.99, 1.0, 1.01, 1.1, 2.0, 11.1, 111.1, 2222.2, 33333.3
+ } ;
+ static float fneg [] =
+ { -0.95, -0.99, -1.0, -1.01, -1.1, -2.0, -11.1, -111.1, -2222.2, -33333.3
+ } ;
+
+ static short out [MAX (ARRAY_LEN (fpos), ARRAY_LEN (fneg))] ;
+
+ int k ;
+
+ printf ("\tfloat_to_short_test ............................. ") ;
+
+ src_float_to_short_array (fpos, out, ARRAY_LEN (fpos)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (fpos) ; k++)
+ if (out [k] < 30000)
+ { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ;
+ exit (1) ;
+ } ;
+
+ src_float_to_short_array (fneg, out, ARRAY_LEN (fneg)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (fneg) ; k++)
+ if (out [k] > -30000)
+ { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* float_to_short_test */
+
+/*-------------------------------------------------------------------------------------
+*/
+
+static void
+short_to_float_test (void)
+{
+ static short input [BUFFER_LEN] ;
+ static short output [BUFFER_LEN] ;
+ static float temp [BUFFER_LEN] ;
+
+ int k ;
+
+ printf ("\tshort_to_float_test ............................. ") ;
+
+ for (k = 0 ; k < ARRAY_LEN (input) ; k++)
+ input [k] = (k * 0x8000) / ARRAY_LEN (input) ;
+
+ src_short_to_float_array (input, temp, ARRAY_LEN (temp)) ;
+ src_float_to_short_array (temp, output, ARRAY_LEN (output)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (input) ; k++)
+ if (ABS (input [k] - output [k]) > 0)
+ { printf ("\n\n\tLine %d : index %d %d -> %d\n", __LINE__, k, input [k], output [k]) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* short_to_float_test */
+
+/*=====================================================================================
+*/
+
+static void
+float_to_int_test (void)
+{
+ static float fpos [] =
+ { 0.95, 0.99, 1.0, 1.01, 1.1, 2.0, 11.1, 111.1, 2222.2, 33333.3
+ } ;
+ static float fneg [] =
+ { -0.95, -0.99, -1.0, -1.01, -1.1, -2.0, -11.1, -111.1, -2222.2, -33333.3
+ } ;
+
+ static int out [MAX (ARRAY_LEN (fpos), ARRAY_LEN (fneg))] ;
+
+ int k ;
+
+ printf ("\tfloat_to_int_test ............................... ") ;
+
+ src_float_to_int_array (fpos, out, ARRAY_LEN (fpos)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (fpos) ; k++)
+ if (out [k] < 30000 * 0x10000)
+ { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ;
+ exit (1) ;
+ } ;
+
+ src_float_to_int_array (fneg, out, ARRAY_LEN (fneg)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (fneg) ; k++)
+ if (out [k] > -30000 * 0x1000)
+ { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* float_to_int_test */
+
+/*-------------------------------------------------------------------------------------
+*/
+
+static void
+int_to_float_test (void)
+{
+ static int input [BUFFER_LEN] ;
+ static int output [BUFFER_LEN] ;
+ static float temp [BUFFER_LEN] ;
+
+ int k ;
+
+ printf ("\tint_to_float_test ............................... ") ;
+
+ for (k = 0 ; k < ARRAY_LEN (input) ; k++)
+ input [k] = (k * 0x80000000) / ARRAY_LEN (input) ;
+
+ src_int_to_float_array (input, temp, ARRAY_LEN (temp)) ;
+ src_float_to_int_array (temp, output, ARRAY_LEN (output)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (input) ; k++)
+ if (ABS (input [k] - output [k]) > 0)
+ { printf ("\n\n\tLine %d : index %d %d -> %d\n", __LINE__, k, input [k], output [k]) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* int_to_float_test */
+
diff --git a/soxr/lsr-tests/misc_test.c b/soxr/lsr-tests/misc_test.c
new file mode 100644
index 0000000..4baa334
--- /dev/null
+++ b/soxr/lsr-tests/misc_test.c
@@ -0,0 +1,175 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+static void name_test (void) ;
+static void error_test (void) ;
+static void src_ratio_test (void) ;
+static void zero_input_test (int converter) ;
+
+int
+main (void)
+{
+ puts ("") ;
+
+ printf (" version : %s\n\n", src_get_version ()) ;
+
+ /* Current max converter is SRC_LINEAR. */
+ name_test () ;
+
+ error_test () ;
+
+ src_ratio_test () ;
+
+ zero_input_test (SRC_ZERO_ORDER_HOLD) ;
+ zero_input_test (SRC_LINEAR) ;
+ zero_input_test (SRC_SINC_FASTEST) ;
+
+ puts ("") ;
+ return 0 ;
+} /* main */
+
+static void
+name_test (void)
+{ const char *name ;
+ int k = 0 ;
+
+ puts (" name_test :") ;
+
+ while (1)
+ { name = src_get_name (k) ;
+ if (name == NULL)
+ break ;
+ printf ("\tName %d : %s\n", k, name) ;
+ printf ("\tDesc %d : %s\n", k, src_get_description (k)) ;
+ k ++ ;
+ } ;
+
+ puts ("") ;
+
+ return ;
+} /* name_test */
+
+/*------------------------------------------------------------------------------
+*/
+
+typedef struct
+{ double ratio ;
+ int should_pass ;
+} RATIO_TEST ;
+
+static RATIO_TEST ratio_test [] =
+{ { 1.0 / 256.1, 0 },
+ { 1.0 / 256.0, 1 },
+ { 1.0, 1 },
+ { 256.0, 1 },
+ { 256.1, 0 },
+ { -1.0, 0 }
+} ;
+
+static void
+src_ratio_test (void)
+{ int k ;
+
+ puts (" src_ratio_test (SRC ratio must be in range [1/256, 256]):" ) ;
+
+
+ for (k = 0 ; k < ARRAY_LEN (ratio_test) ; k++)
+ { if (ratio_test [k].should_pass && src_is_valid_ratio (ratio_test [k].ratio) == 0)
+ { printf ("\n\nLine %d : SRC ratio %f should have passed.\n\n", __LINE__, ratio_test [k].ratio) ;
+ exit (1) ;
+ } ;
+ if (! ratio_test [k].should_pass && src_is_valid_ratio (ratio_test [k].ratio) != 0)
+ { printf ("\n\nLine %d : SRC ratio %f should not have passed.\n\n", __LINE__, ratio_test [k].ratio) ;
+ exit (1) ;
+ } ;
+ printf ("\t SRC ratio (%9.5f) : %s ................... ok\n", ratio_test [k].ratio,
+ (ratio_test [k].should_pass ? "pass" : "fail")) ;
+ } ;
+
+ puts ("") ;
+
+ return ;
+} /* src_ratio_test */
+
+static void
+error_test (void)
+{ const char *errorstr ;
+ int k, errors = 0 ;
+
+ puts (" error_test :") ;
+
+ for (k = 0 ; 1 ; k++)
+ { errorstr = src_strerror (k) ;
+ printf ("\t%-2d : %s\n", k, errorstr) ;
+ if (errorstr == NULL)
+ { errors ++ ;
+ continue ;
+ } ;
+ if (strstr (errorstr, "Placeholder.") == errorstr)
+ break ;
+ } ;
+
+ if (errors != 0)
+ { printf ("\n\nLine %d : Missing error numbers above.\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
+ puts ("") ;
+
+ return ;
+} /* error_test */
+
+static void
+zero_input_test (int converter)
+{ SRC_DATA data ;
+ SRC_STATE *state ;
+ float out [100] ;
+ int error ;
+
+ printf (" %s (%-26s) ........ ", __func__, src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ if ((state = src_new (converter, 1, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new failed : %s.\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ data.data_in = (float *) 0xdeadbeef ;
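+	/* data_in is deliberately an invalid pointer: with input_frames set to
+	** zero below, a correct converter must never dereference it. */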
+ data.input_frames = 0 ;
+ data.data_out = out ;
+ data.output_frames = ARRAY_LEN (out) ;
+ data.end_of_input = 0 ;
+ data.src_ratio = 1.0 ;
+
+ if ((error = src_process (state, &data)))
+	{	printf ("\n\nLine %d : src_process failed : %s.\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ state = src_delete (state) ;
+
+ puts ("ok") ;
+} /* zero_input_test */
diff --git a/soxr/lsr-tests/multi_channel_test.c b/soxr/lsr-tests/multi_channel_test.c
new file mode 100644
index 0000000..1ad9ced
--- /dev/null
+++ b/soxr/lsr-tests/multi_channel_test.c
@@ -0,0 +1,364 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+#define BUFFER_LEN 50000
+#define BLOCK_LEN (12)
+
+#define MAX_CHANNELS 10
+
+static void simple_test (int converter, int channel_count, double target_snr) ;
+static void process_test (int converter, int channel_count, double target_snr) ;
+static void callback_test (int converter, int channel_count, double target_snr) ;
+
+int
+main (void)
+{ double target ;
+ int k ;
+
+ puts ("\n Zero Order Hold interpolator :") ;
+ target = 38.0 ;
+ for (k = 1 ; k <= 3 ; k++)
+ { simple_test (SRC_ZERO_ORDER_HOLD, k, target) ;
+ process_test (SRC_ZERO_ORDER_HOLD, k, target) ;
+ callback_test (SRC_ZERO_ORDER_HOLD, k, target) ;
+ } ;
+
+ puts ("\n Linear interpolator :") ;
+ target = 79.0 ;
+ for (k = 1 ; k <= 3 ; k++)
+ { simple_test (SRC_LINEAR, k, target) ;
+ process_test (SRC_LINEAR, k, target) ;
+ callback_test (SRC_LINEAR, k, target) ;
+ } ;
+
+ puts ("\n Sinc interpolator :") ;
+ target = 100.0 ;
+ for (k = 1 ; k <= MAX_CHANNELS ; k++)
+ { simple_test (SRC_SINC_FASTEST, k, target) ;
+ process_test (SRC_SINC_FASTEST, k, target) ;
+ callback_test (SRC_SINC_FASTEST, k, target) ;
+ } ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+/*==============================================================================
+*/
+
+static float input_serial [BUFFER_LEN * MAX_CHANNELS] ;
+static float input_interleaved [BUFFER_LEN * MAX_CHANNELS] ;
+static float output_interleaved [BUFFER_LEN * MAX_CHANNELS] ;
+static float output_serial [BUFFER_LEN * MAX_CHANNELS] ;
+
+static void
+simple_test (int converter, int channel_count, double target_snr)
+{ SRC_DATA src_data ;
+
+ double freq, snr ;
+ int ch, error, frames ;
+
+ printf ("\t%-22s (%2d channel%c) ............ ", "simple_test", channel_count, channel_count > 1 ? 's' : ' ') ;
+ fflush (stdout) ;
+
+ assert (channel_count <= MAX_CHANNELS) ;
+
+ memset (input_serial, 0, sizeof (input_serial)) ;
+ memset (input_interleaved, 0, sizeof (input_interleaved)) ;
+ memset (output_interleaved, 0, sizeof (output_interleaved)) ;
+ memset (output_serial, 0, sizeof (output_serial)) ;
+
+ frames = BUFFER_LEN ;
+
+ /* Calculate channel_count separate windowed sine waves. */
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { freq = (200.0 + 33.333333333 * ch) / 44100.0 ;
+ gen_windowed_sines (1, &freq, 1.0, input_serial + ch * frames, frames) ;
+ } ;
+
+ /* Interleave the data in preparation for SRC. */
+ interleave_data (input_serial, input_interleaved, frames, channel_count) ;
+
+	/* Choose a conversion ratio <= 1.0. */
+ src_data.src_ratio = 0.95 ;
+
+ src_data.data_in = input_interleaved ;
+ src_data.input_frames = frames ;
+
+ src_data.data_out = output_interleaved ;
+ src_data.output_frames = frames ;
+
+ if ((error = src_simple (&src_data, converter, channel_count)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
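+	/* The converter is allowed to generate up to 2 frames more or fewer than ratio * input. */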
+ if (fabs (src_data.output_frames_gen - src_data.src_ratio * src_data.input_frames) > 2)
+ { printf ("\n\nLine %d : bad output data length %ld should be %d.\n", __LINE__,
+ src_data.output_frames_gen, (int) floor (src_data.src_ratio * src_data.input_frames)) ;
+ printf ("\tsrc_ratio : %.4f\n", src_data.src_ratio) ;
+ printf ("\tinput_len : %ld\n", src_data.input_frames) ;
+ printf ("\toutput_len : %ld\n\n", src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ /* De-interleave data so SNR can be calculated for each channel. */
+ deinterleave_data (output_interleaved, output_serial, frames, channel_count) ;
+
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { snr = calculate_snr (output_serial + ch * frames, frames, 1) ;
+ if (snr < target_snr)
+ { printf ("\n\nLine %d: channel %d snr %f should be %f\n", __LINE__, ch, snr, target_snr) ;
+ save_oct_float ("output.dat", input_serial, channel_count * frames, output_serial, channel_count * frames) ;
+ exit (1) ;
+ } ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* simple_test */
+
+/*==============================================================================
+*/
+
+static void
+process_test (int converter, int channel_count, double target_snr)
+{ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+
+ double freq, snr ;
+ int ch, error, frames, current_in, current_out ;
+
+ printf ("\t%-22s (%2d channel%c) ............ ", "process_test", channel_count, channel_count > 1 ? 's' : ' ') ;
+ fflush (stdout) ;
+
+ assert (channel_count <= MAX_CHANNELS) ;
+
+ memset (input_serial, 0, sizeof (input_serial)) ;
+ memset (input_interleaved, 0, sizeof (input_interleaved)) ;
+ memset (output_interleaved, 0, sizeof (output_interleaved)) ;
+ memset (output_serial, 0, sizeof (output_serial)) ;
+
+ frames = BUFFER_LEN ;
+
+ /* Calculate channel_count separate windowed sine waves. */
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { freq = (400.0 + 11.333333333 * ch) / 44100.0 ;
+ gen_windowed_sines (1, &freq, 1.0, input_serial + ch * frames, frames) ;
+ } ;
+
+ /* Interleave the data in preparation for SRC. */
+ interleave_data (input_serial, input_interleaved, frames, channel_count) ;
+
+ /* Perform sample rate conversion. */
+ if ((src_state = src_new (converter, channel_count, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new() failed : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 0 ; /* Set this later. */
+
+	/* Choose a conversion ratio < 1.0. */
+ src_data.src_ratio = 0.95 ;
+
+ src_data.data_in = input_interleaved ;
+ src_data.data_out = output_interleaved ;
+
+ current_in = current_out = 0 ;
+
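+	/* Feed and drain the converter in BLOCK_LEN sized chunks until all input is consumed. */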
+ while (1)
+ { src_data.input_frames = MAX (MIN (BLOCK_LEN, frames - current_in), 0) ;
+ src_data.output_frames = MAX (MIN (BLOCK_LEN, frames - current_out), 0) ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.end_of_input && src_data.output_frames_gen == 0)
+ break ;
+
+ current_in += src_data.input_frames_used ;
+ current_out += src_data.output_frames_gen ;
+
+ src_data.data_in += src_data.input_frames_used * channel_count ;
+ src_data.data_out += src_data.output_frames_gen * channel_count ;
+
+ src_data.end_of_input = (current_in >= frames) ? 1 : 0 ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (fabs (current_out - src_data.src_ratio * current_in) > 2)
+ { printf ("\n\nLine %d : bad output data length %d should be %d.\n", __LINE__,
+ current_out, (int) floor (src_data.src_ratio * current_in)) ;
+ printf ("\tsrc_ratio : %.4f\n", src_data.src_ratio) ;
+ printf ("\tinput_len : %d\n", frames) ;
+ printf ("\toutput_len : %d\n\n", current_out) ;
+ exit (1) ;
+ } ;
+
+ /* De-interleave data so SNR can be calculated for each channel. */
+ deinterleave_data (output_interleaved, output_serial, frames, channel_count) ;
+
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { snr = calculate_snr (output_serial + ch * frames, frames, 1) ;
+ if (snr < target_snr)
+ { printf ("\n\nLine %d: channel %d snr %f should be %f\n", __LINE__, ch, snr, target_snr) ;
+ save_oct_float ("output.dat", input_serial, channel_count * frames, output_serial, channel_count * frames) ;
+ exit (1) ;
+ } ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* process_test */
+
+/*==============================================================================
+*/
+
+typedef struct
+{ int channels ;
+ long total_frames ;
+ long current_frame ;
+ float *data ;
+} TEST_CB_DATA ;
+
+static long
+test_callback_func (void *cb_data, float **data)
+{ TEST_CB_DATA *pcb_data ;
+
+ long frames ;
+
+ if ((pcb_data = cb_data) == NULL)
+ return 0 ;
+
+ if (data == NULL)
+ return 0 ;
+
+ *data = pcb_data->data + (pcb_data->current_frame * pcb_data->channels) ;
+
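+	/* Supply at most BLOCK_LEN frames per callback invocation. */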
+ if (pcb_data->total_frames - pcb_data->current_frame < BLOCK_LEN)
+ frames = pcb_data->total_frames - pcb_data->current_frame ;
+ else
+ frames = BLOCK_LEN ;
+
+ pcb_data->current_frame += frames ;
+
+ return frames ;
+} /* test_callback_func */
+
+static void
+callback_test (int converter, int channel_count, double target_snr)
+{ TEST_CB_DATA test_callback_data ;
+ SRC_STATE *src_state = NULL ;
+
+ double freq, snr, src_ratio ;
+ int ch, error, frames, read_total, read_count ;
+
+ printf ("\t%-22s (%2d channel%c) ............ ", "callback_test", channel_count, channel_count > 1 ? 's' : ' ') ;
+ fflush (stdout) ;
+
+ assert (channel_count <= MAX_CHANNELS) ;
+
+ memset (input_serial, 0, sizeof (input_serial)) ;
+ memset (input_interleaved, 0, sizeof (input_interleaved)) ;
+ memset (output_interleaved, 0, sizeof (output_interleaved)) ;
+ memset (output_serial, 0, sizeof (output_serial)) ;
+ memset (&test_callback_data, 0, sizeof (test_callback_data)) ;
+
+ frames = BUFFER_LEN ;
+
+ /* Calculate channel_count separate windowed sine waves. */
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { freq = (200.0 + 33.333333333 * ch) / 44100.0 ;
+ gen_windowed_sines (1, &freq, 1.0, input_serial + ch * frames, frames) ;
+ } ;
+
+ /* Interleave the data in preparation for SRC. */
+ interleave_data (input_serial, input_interleaved, frames, channel_count) ;
+
+ /* Perform sample rate conversion. */
+ src_ratio = 0.95 ;
+ test_callback_data.channels = channel_count ;
+ test_callback_data.total_frames = frames ;
+ test_callback_data.current_frame = 0 ;
+ test_callback_data.data = input_interleaved ;
+
+ if ((src_state = src_callback_new (test_callback_func, converter, channel_count, &error, &test_callback_data)) == NULL)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
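+	/* Pull converted output ; src_callback_read () invokes the callback whenever it needs more input. */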
+ read_total = 0 ;
+ while (read_total < frames)
+ { read_count = src_callback_read (src_state, src_ratio, frames - read_total, output_interleaved + read_total * channel_count) ;
+
+ if (read_count <= 0)
+ break ;
+
+ read_total += read_count ;
+ } ;
+
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (fabs (read_total - src_ratio * frames) > 2)
+ { printf ("\n\nLine %d : bad output data length %d should be %d.\n", __LINE__,
+ read_total, (int) floor (src_ratio * frames)) ;
+ printf ("\tsrc_ratio : %.4f\n", src_ratio) ;
+ printf ("\tinput_len : %d\n", frames) ;
+ printf ("\toutput_len : %d\n\n", read_total) ;
+ exit (1) ;
+ } ;
+
+ /* De-interleave data so SNR can be calculated for each channel. */
+ deinterleave_data (output_interleaved, output_serial, frames, channel_count) ;
+
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { snr = calculate_snr (output_serial + ch * frames, frames, 1) ;
+ if (snr < target_snr)
+ { printf ("\n\nLine %d: channel %d snr %f should be %f\n", __LINE__, ch, snr, target_snr) ;
+ save_oct_float ("output.dat", input_serial, channel_count * frames, output_serial, channel_count * frames) ;
+ exit (1) ;
+ } ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* callback_test */
+
diff --git a/soxr/lsr-tests/multichan_throughput_test.c b/soxr/lsr-tests/multichan_throughput_test.c
new file mode 100644
index 0000000..523139e
--- /dev/null
+++ b/soxr/lsr-tests/multichan_throughput_test.c
@@ -0,0 +1,216 @@
+/*
+** Copyright (C) 2008-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "config.h"
+
+#include "util.h"
+#include "float_cast.h"
+
+#define BUFFER_LEN (1<<17)
+
+static float input [BUFFER_LEN] ;
+static float output [BUFFER_LEN] ;
+
+static long
+throughput_test (int converter, int channels, long best_throughput)
+{ SRC_DATA src_data ;
+ clock_t start_time, clock_time ;
+ double duration ;
+ long total_frames = 0, throughput ;
+ int error ;
+
+ printf (" %-30s %2d ", src_get_name (converter), channels) ;
+ fflush (stdout) ;
+
+ src_data.data_in = input ;
+ src_data.input_frames = ARRAY_LEN (input) / channels ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) / channels ;
+
+ src_data.src_ratio = 0.99 ;
+
+ sleep (2) ;
+
+ start_time = clock () ;
+
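+	/* Keep converting the same buffer until at least 5 seconds of CPU time have been consumed. */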
+ do
+ {
+ if ((error = src_simple (&src_data, converter, channels)) != 0)
+ { puts (src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ total_frames += src_data.output_frames_gen ;
+
+ clock_time = clock () - start_time ;
+ duration = (1.0 * clock_time) / CLOCKS_PER_SEC ;
+ }
+ while (duration < 5.0) ;
+
+ if (src_data.input_frames_used != src_data.input_frames)
+ { printf ("\n\nLine %d : input frames used %ld should be %ld\n", __LINE__, src_data.input_frames_used, src_data.input_frames) ;
+ exit (1) ;
+ } ;
+
+ if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2)
+ { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ;
+ printf (" input len : %d\n", ARRAY_LEN (input) / channels) ;
+ printf (" output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen,
+ floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ;
+ exit (1) ;
+ } ;
+
+ throughput = lrint (floor (total_frames / duration)) ;
+
+ if (best_throughput == 0)
+ { best_throughput = MAX (throughput, best_throughput) ;
+ printf ("%5.2f %10ld\n", duration, throughput) ;
+ }
+ else
+ { best_throughput = MAX (throughput, best_throughput) ;
+ printf ("%5.2f %10ld %10ld\n", duration, throughput, best_throughput) ;
+ }
+
+ return best_throughput ;
+} /* throughput_test */
+
+static void
+single_run (void)
+{ const int max_channels = 10 ;
+ int k ;
+
+ printf ("\n CPU name : %s\n", get_cpu_name ()) ;
+
+ puts (
+ "\n"
+ " Converter Channels Duration Throughput\n"
+ " ---------------------------------------------------------------------"
+ ) ;
+
+ for (k = 1 ; k <= max_channels / 2 ; k++)
+ throughput_test (SRC_SINC_FASTEST, k, 0) ;
+
+ puts ("") ;
+ for (k = 1 ; k <= max_channels / 2 ; k++)
+ throughput_test (SRC_SINC_MEDIUM_QUALITY, k, 0) ;
+
+ puts ("") ;
+ for (k = 1 ; k <= max_channels ; k++)
+ throughput_test (SRC_SINC_BEST_QUALITY, k, 0) ;
+
+ puts ("") ;
+ return ;
+} /* single_run */
+
+static void
+multi_run (int run_count)
+{ int k, ch ;
+
+ printf ("\n CPU name : %s\n", get_cpu_name ()) ;
+
+ puts (
+ "\n"
+ " Converter Channels Duration Throughput Best Throughput\n"
+ " ----------------------------------------------------------------------------------------"
+ ) ;
+
+ for (ch = 1 ; ch <= 5 ; ch++)
+ { long sinc_fastest = 0, sinc_medium = 0, sinc_best = 0 ;
+
+ for (k = 0 ; k < run_count ; k++)
+ { sinc_fastest = throughput_test (SRC_SINC_FASTEST, ch, sinc_fastest) ;
+ sinc_medium = throughput_test (SRC_SINC_MEDIUM_QUALITY, ch, sinc_medium) ;
+ sinc_best = throughput_test (SRC_SINC_BEST_QUALITY, ch, sinc_best) ;
+
+ puts ("") ;
+
+ /* Let the CPU cool down. We might be running on a laptop. */
+ sleep (10) ;
+ } ;
+
+ puts (
+ "\n"
+ " Converter Best Throughput\n"
+ " ------------------------------------------------"
+ ) ;
+
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_FASTEST), sinc_fastest) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_MEDIUM_QUALITY), sinc_medium) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_BEST_QUALITY), sinc_best) ;
+ } ;
+
+ puts ("") ;
+} /* multi_run */
+
+static void
+usage_exit (const char * argv0)
+{ const char * cptr ;
+
+ if ((cptr = strrchr (argv0, '/')) != NULL)
+		argv0 = cptr + 1 ;
+
+ printf (
+ "Usage :\n"
+ " %s - Single run of the throughput test.\n"
+		"    %s --best-of N  - Do N runs of the test and print the best result.\n"
+ "\n",
+ argv0, argv0) ;
+
+ exit (0) ;
+} /* usage_exit */
+
+int
+main (int argc, char ** argv)
+{ double freq ;
+
+ memset (input, 0, sizeof (input)) ;
+ freq = 0.01 ;
+ gen_windowed_sines (1, &freq, 1.0, input, BUFFER_LEN) ;
+
+ if (argc == 1)
+ single_run () ;
+ else if (argc == 3 && strcmp (argv [1], "--best-of") == 0)
+ { int run_count = atoi (argv [2]) ;
+
+ if (run_count < 1 || run_count > 20)
+		{	printf ("Please be sensible. Run count should be in the range [1, 20].\n") ;
+ exit (1) ;
+ } ;
+
+ multi_run (run_count) ;
+ }
+ else
+ usage_exit (argv [0]) ;
+
+ puts (
+ " Duration is in seconds.\n"
+ " Throughput is in frames/sec (more is better).\n"
+ ) ;
+
+ return 0 ;
+} /* main */
+
diff --git a/soxr/lsr-tests/reset_test.c b/soxr/lsr-tests/reset_test.c
new file mode 100644
index 0000000..40485c2
--- /dev/null
+++ b/soxr/lsr-tests/reset_test.c
@@ -0,0 +1,238 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 2048
+#define CB_READ_LEN 256
+
+static void process_reset_test (int converter) ;
+static void callback_reset_test (int converter) ;
+
+static float data_one [BUFFER_LEN] ;
+static float data_zero [BUFFER_LEN] ;
+
+int
+main (void)
+{
+ puts ("") ;
+
+ process_reset_test (SRC_ZERO_ORDER_HOLD) ;
+ process_reset_test (SRC_LINEAR) ;
+ process_reset_test (SRC_SINC_FASTEST) ;
+
+ callback_reset_test (SRC_ZERO_ORDER_HOLD) ;
+ callback_reset_test (SRC_LINEAR) ;
+ callback_reset_test (SRC_SINC_FASTEST) ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+static void
+process_reset_test (int converter)
+{ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+ int k, error ;
+
+ printf ("\tprocess_reset_test (%-28s) ....... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ for (k = 0 ; k < BUFFER_LEN ; k++)
+ { data_one [k] = 1.0 ;
+ data_zero [k] = 0.0 ;
+ } ;
+
+ /* Get a converter. */
+ if ((src_state = src_new (converter, 1, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new() failed : %s.\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ /* Process a bunch of 1.0 valued samples. */
+ src_data.data_in = data_one ;
+ src_data.data_out = output ;
+ src_data.input_frames = BUFFER_LEN ;
+ src_data.output_frames = BUFFER_LEN ;
+ src_data.src_ratio = 0.9 ;
+ src_data.end_of_input = 1 ;
+
+ if ((error = src_process (src_state, &src_data)) != 0)
+	{	printf ("\n\nLine %d : src_process () returned error : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ /* Reset the state of the converter.*/
+ src_reset (src_state) ;
+
+ /* Now process some zero data. */
+ src_data.data_in = data_zero ;
+ src_data.data_out = output ;
+ src_data.input_frames = BUFFER_LEN ;
+ src_data.output_frames = BUFFER_LEN ;
+ src_data.src_ratio = 0.9 ;
+ src_data.end_of_input = 1 ;
+
+ if ((error = src_process (src_state, &src_data)) != 0)
+	{	printf ("\n\nLine %d : src_process () returned error : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+	/* Finally make sure that the output data is zero, i.e. the reset was successful. */
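+	/* Only the first half is checked ; if the reset failed, leftover 1.0 valued data would appear at the start of the output. */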
+ for (k = 0 ; k < BUFFER_LEN / 2 ; k++)
+ if (output [k] != 0.0)
+ { printf ("\n\nLine %d : output [%d] should be 0.0, is %f.\n", __LINE__, k, output [k]) ;
+ exit (1) ;
+ } ;
+
+ /* Make sure that this function has been exported. */
+ src_set_ratio (src_state, 1.0) ;
+
+ /* Delete converter. */
+ src_state = src_delete (src_state) ;
+
+ puts ("ok") ;
+} /* process_reset_test */
+
+/*==============================================================================
+*/
+
+typedef struct
+{ int channels ;
+ long count, total ;
+ float *data ;
+} TEST_CB_DATA ;
+
+static long
+test_callback_func (void *cb_data, float **data)
+{ TEST_CB_DATA *pcb_data ;
+
+ long frames ;
+
+ if ((pcb_data = cb_data) == NULL)
+ return 0 ;
+
+ if (data == NULL)
+ return 0 ;
+
+ if (pcb_data->total - pcb_data->count > 0)
+ frames = pcb_data->total - pcb_data->count ;
+ else
+ frames = 0 ;
+
+ *data = pcb_data->data + pcb_data->count ;
+ pcb_data->count += frames ;
+
+ return frames ;
+} /* test_callback_func */
+
+static void
+callback_reset_test (int converter)
+{ static TEST_CB_DATA test_callback_data ;
+
+ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+
+ double src_ratio = 1.1 ;
+ long read_count, read_total ;
+ int k, error ;
+
+ printf ("\tcallback_reset_test (%-28s) ....... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ for (k = 0 ; k < ARRAY_LEN (data_one) ; k++)
+ { data_one [k] = 1.0 ;
+ data_zero [k] = 0.0 ;
+ } ;
+
+ if ((src_state = src_callback_new (test_callback_func, converter, 1, &error, &test_callback_data)) == NULL)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ /* Process a bunch of 1.0 valued samples. */
+ test_callback_data.channels = 1 ;
+ test_callback_data.count = 0 ;
+ test_callback_data.total = ARRAY_LEN (data_one) ;
+ test_callback_data.data = data_one ;
+
+ read_total = 0 ;
+ do
+ { read_count = (ARRAY_LEN (output) - read_total > CB_READ_LEN) ? CB_READ_LEN : ARRAY_LEN (output) - read_total ;
+ read_count = src_callback_read (src_state, src_ratio, read_count, output + read_total) ;
+ read_total += read_count ;
+ }
+ while (read_count > 0) ;
+
+ /* Check for errors. */
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ /* Reset the state of the converter.*/
+ src_reset (src_state) ;
+
+ /* Process a bunch of 0.0 valued samples. */
+ test_callback_data.channels = 1 ;
+ test_callback_data.count = 0 ;
+ test_callback_data.total = ARRAY_LEN (data_zero) ;
+ test_callback_data.data = data_zero ;
+
+ /* Now process some zero data. */
+ read_total = 0 ;
+ do
+ { read_count = (ARRAY_LEN (output) - read_total > CB_READ_LEN) ? CB_READ_LEN : ARRAY_LEN (output) - read_total ;
+ read_count = src_callback_read (src_state, src_ratio, read_count, output + read_total) ;
+ read_total += read_count ;
+ }
+ while (read_count > 0) ;
+
+ /* Check for errors. */
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+	/* Finally make sure that the output data is zero, i.e. the reset was successful. */
+ for (k = 0 ; k < BUFFER_LEN / 2 ; k++)
+ if (output [k] != 0.0)
+ { printf ("\n\nLine %d : output [%d] should be 0.0, is %f.\n\n", __LINE__, k, output [k]) ;
+ save_oct_float ("output.dat", data_one, ARRAY_LEN (data_one), output, ARRAY_LEN (output)) ;
+ exit (1) ;
+ } ;
+
+ /* Make sure that this function has been exported. */
+ src_set_ratio (src_state, 1.0) ;
+
+ /* Delete converter. */
+ src_state = src_delete (src_state) ;
+
+ puts ("ok") ;
+} /* callback_reset_test */
+
+
diff --git a/soxr/lsr-tests/simple_test.c b/soxr/lsr-tests/simple_test.c
new file mode 100644
index 0000000..91dcde3
--- /dev/null
+++ b/soxr/lsr-tests/simple_test.c
@@ -0,0 +1,117 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 2048
+
+static void simple_test (int converter, double ratio) ;
+
+int
+main (void)
+{ static double src_ratios [] =
+ { 1.0001, 0.099, 0.1, 0.33333333, 0.789, 1.9, 3.1, 9.9
+ } ;
+
+ int k ;
+
+ puts ("") ;
+
+ puts (" Zero Order Hold interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ simple_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ;
+
+ puts (" Linear interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ simple_test (SRC_LINEAR, src_ratios [k]) ;
+
+ puts (" Sinc interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ simple_test (SRC_SINC_FASTEST, src_ratios [k]) ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+static void
+simple_test (int converter, double src_ratio)
+{ static float input [BUFFER_LEN], output [BUFFER_LEN] ;
+
+ SRC_DATA src_data ;
+
+ int input_len, output_len, error, terminate ;
+
+ printf ("\tsimple_test (SRC ratio = %6.4f) ........... ", src_ratio) ;
+ fflush (stdout) ;
+
+	/* Calculate maximum input and output lengths. */
+ if (src_ratio >= 1.0)
+ { output_len = BUFFER_LEN ;
+ input_len = (int) floor (BUFFER_LEN / src_ratio) ;
+ }
+ else
+ { input_len = BUFFER_LEN ;
+ output_len = (int) floor (BUFFER_LEN * src_ratio) ;
+ } ;
+
+ /* Reduce input_len by 10 so output is longer than necessary. */
+ input_len -= 10 ;
+
+ if (output_len > BUFFER_LEN)
+ { printf ("\n\nLine %d : output_len > BUFFER_LEN\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
+ memset (&src_data, 0, sizeof (src_data)) ;
+
+ src_data.data_in = input ;
+ src_data.input_frames = input_len ;
+
+ src_data.src_ratio = src_ratio ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = BUFFER_LEN ;
+
+ if ((error = src_simple (&src_data, converter, 1)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
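+	/* Allow slack of 2 * terminate frames ; the worst case grows with the conversion ratio. */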
+ terminate = (int) ceil ((src_ratio >= 1.0) ? src_ratio : 1.0 / src_ratio) ;
+
+ if (fabs (src_data.output_frames_gen - src_ratio * input_len) > 2 * terminate)
+ { printf ("\n\nLine %d : bad output data length %ld should be %d.\n", __LINE__,
+ src_data.output_frames_gen, (int) floor (src_ratio * input_len)) ;
+ printf ("\tsrc_ratio : %.4f\n", src_ratio) ;
+ printf ("\tinput_len : %d\n\toutput_len : %d\n\n", input_len, output_len) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* simple_test */
+
diff --git a/soxr/lsr-tests/sndfile-resample.c b/soxr/lsr-tests/sndfile-resample.c
new file mode 100644
index 0000000..63d179c
--- /dev/null
+++ b/soxr/lsr-tests/sndfile-resample.c
@@ -0,0 +1,332 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+
+#if (HAVE_SNDFILE)
+
+#include <samplerate.h>
+#include <sndfile.h>
+
+#define DEFAULT_CONVERTER SRC_SINC_MEDIUM_QUALITY
+
+#define BUFFER_LEN 4096 /*-(1<<16)-*/
+
+static void usage_exit (const char *progname) ;
+static sf_count_t sample_rate_convert (SNDFILE *infile, SNDFILE *outfile, int converter, double src_ratio, int channels, double * gain) ;
+static double apply_gain (float * data, long frames, int channels, double max, double gain) ;
+
+int
+main (int argc, char *argv [])
+{ SNDFILE *infile, *outfile = NULL ;
+ SF_INFO sfinfo ;
+
+ sf_count_t count ;
+ double src_ratio = -1.0, gain = 1.0 ;
+ int new_sample_rate = -1, k, converter, max_speed = SF_FALSE ;
+
+ if (argc == 2 && strcmp (argv [1], "--version") == 0)
+ { char buffer [64], *cptr ;
+
+ if ((cptr = strrchr (argv [0], '/')) != NULL)
+ argv [0] = cptr + 1 ;
+ if ((cptr = strrchr (argv [0], '\\')) != NULL)
+ argv [0] = cptr + 1 ;
+
+ sf_command (NULL, SFC_GET_LIB_VERSION, buffer, sizeof (buffer)) ;
+
+ printf ("%s (%s,%s)\n", argv [0], src_get_version (), buffer) ;
+ exit (0) ;
+ } ;
+
+ if (argc != 5 && argc != 7 && argc != 8)
+ usage_exit (argv [0]) ;
+
+ /* Set default converter. */
+ converter = DEFAULT_CONVERTER ;
+
+ for (k = 1 ; k < argc - 2 ; k++)
+ { if (strcmp (argv [k], "--max-speed") == 0)
+ max_speed = SF_TRUE ;
+ else if (strcmp (argv [k], "-to") == 0)
+ { k ++ ;
+ new_sample_rate = atoi (argv [k]) ;
+ }
+ else if (strcmp (argv [k], "-by") == 0)
+ { k ++ ;
+ src_ratio = atof (argv [k]) ;
+ }
+ else if (strcmp (argv [k], "-c") == 0)
+ { k ++ ;
+ converter = atoi (argv [k]) ;
+ }
+ else
+ usage_exit (argv [0]) ;
+ } ;
+
+ if (new_sample_rate <= 0 && src_ratio <= 0.0)
+ usage_exit (argv [0]) ;
+
+ if (src_get_name (converter) == NULL)
+ { printf ("Error : bad converter number.\n") ;
+ usage_exit (argv [0]) ;
+ } ;
+
+ if (strcmp (argv [argc - 2], argv [argc - 1]) == 0)
+ { printf ("Error : input and output file names are the same.\n") ;
+ exit (1) ;
+ } ;
+
+ if ((infile = sf_open (argv [argc - 2], SFM_READ, &sfinfo)) == NULL)
+ { printf ("Error : Not able to open input file '%s'\n", argv [argc - 2]) ;
+ exit (1) ;
+ } ;
+
+ printf ("Input File : %s\n", argv [argc - 2]) ;
+ printf ("Sample Rate : %d\n", sfinfo.samplerate) ;
+ printf ("Input Frames : %ld\n\n", (long) sfinfo.frames) ;
+
+ if (new_sample_rate > 0)
+ { src_ratio = (1.0 * new_sample_rate) / sfinfo.samplerate ;
+ sfinfo.samplerate = new_sample_rate ;
+ }
+ else if (src_is_valid_ratio (src_ratio))
+ sfinfo.samplerate = (int) floor (sfinfo.samplerate * src_ratio) ;
+ else
+ { printf ("Not able to determine new sample rate. Exiting.\n") ;
+ sf_close (infile) ;
+ exit (1) ;
+ } ;
+
+ if (fabs (src_ratio - 1.0) < 1e-20)
+ { printf ("Target samplerate and input samplerate are the same. Exiting.\n") ;
+ sf_close (infile) ;
+ exit (0) ;
+ } ;
+
+ printf ("SRC Ratio : %f\n", src_ratio) ;
+ printf ("Converter : %s\n\n", src_get_name (converter)) ;
+
+ if (src_is_valid_ratio (src_ratio) == 0)
+ { printf ("Error : Sample rate change out of valid range.\n") ;
+ sf_close (infile) ;
+ exit (1) ;
+ } ;
+
+	/* Delete the output file if it already exists. */
+ remove (argv [argc - 1]) ;
+
+ printf ("Output file : %s\n", argv [argc - 1]) ;
+ printf ("Sample Rate : %d\n", sfinfo.samplerate) ;
+
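+	/* sample_rate_convert () returns a negative count if the output clipped ; the gain is then reduced and the whole conversion redone. */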
+ do
+ { sf_close (outfile) ;
+
+ if ((outfile = sf_open (argv [argc - 1], SFM_WRITE, &sfinfo)) == NULL)
+ { printf ("Error : Not able to open output file '%s'\n", argv [argc - 1]) ;
+ sf_close (infile) ;
+ exit (1) ;
+ } ;
+
+ if (max_speed)
+ { /* This is mainly for the comparison program tests/src-evaluate.c */
+ sf_command (outfile, SFC_SET_ADD_PEAK_CHUNK, NULL, SF_FALSE) ;
+ }
+ else
+ { /* Update the file header after every write. */
+ sf_command (outfile, SFC_SET_UPDATE_HEADER_AUTO, NULL, SF_TRUE) ;
+ } ;
+
+ sf_command (outfile, SFC_SET_CLIPPING, NULL, SF_TRUE) ;
+
+ count = sample_rate_convert (infile, outfile, converter, src_ratio, sfinfo.channels, &gain) ;
+ }
+ while (count < 0) ;
+
+ printf ("Output Frames : %ld\n\n", (long) count) ;
+
+ sf_close (infile) ;
+ sf_close (outfile) ;
+
+ return 0 ;
+} /* main */
+
+/*==============================================================================
+*/
+
+static sf_count_t
+sample_rate_convert (SNDFILE *infile, SNDFILE *outfile, int converter, double src_ratio, int channels, double * gain)
+{ static float input [BUFFER_LEN] ;
+ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+ int error ;
+ double max = 0.0 ;
+ sf_count_t output_count = 0 ;
+
+ sf_seek (infile, 0, SEEK_SET) ;
+ sf_seek (outfile, 0, SEEK_SET) ;
+
+ /* Initialize the sample rate converter. */
+ if ((src_state = src_new (converter, channels, &error)) == NULL)
+ { printf ("\n\nError : src_new() failed : %s.\n\n", src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 0 ; /* Set this later. */
+
+ /* Start with zero to force load in while loop. */
+ src_data.input_frames = 0 ;
+ src_data.data_in = input ;
+
+ src_data.src_ratio = src_ratio ;
+
+ src_data.data_out = output ;
+	src_data.output_frames = BUFFER_LEN / channels ;
+
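+	/* Refill the input buffer whenever it runs dry, convert, then write out whatever was generated. */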
+ while (1)
+ {
+ /* If the input buffer is empty, refill it. */
+ if (src_data.input_frames == 0)
+ { src_data.input_frames = sf_readf_float (infile, input, BUFFER_LEN / channels) ;
+ src_data.data_in = input ;
+
+			/* The last read will not be a full buffer, so set end_of_input. */
+ if (src_data.input_frames < BUFFER_LEN / channels)
+ src_data.end_of_input = SF_TRUE ;
+ } ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\nError : %s\n", src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ /* Terminate if done. */
+ if (src_data.end_of_input && src_data.output_frames_gen == 0)
+ break ;
+
+ max = apply_gain (src_data.data_out, src_data.output_frames_gen, channels, max, *gain) ;
+
+ /* Write output. */
+ sf_writef_float (outfile, output, src_data.output_frames_gen) ;
+ output_count += src_data.output_frames_gen ;
+
+ src_data.data_in += src_data.input_frames_used * channels ;
+ src_data.input_frames -= src_data.input_frames_used ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (max > 1.0)
+ { *gain = 1.0 / max ;
+ printf ("\nOutput has clipped. Restarting conversion to prevent clipping.\n\n") ;
+ return -1 ;
+ } ;
+
+ return output_count ;
+} /* sample_rate_convert */
+
+static double
+apply_gain (float * data, long frames, int channels, double max, double gain)
+{
+ long k ;
+
+ for (k = 0 ; k < frames * channels ; k++)
+ { data [k] *= gain ;
+
+ if (fabs (data [k]) > max)
+ max = fabs (data [k]) ;
+ } ;
+
+ return max ;
+} /* apply_gain */
+
+static void
+usage_exit (const char *progname)
+{ char lsf_ver [128] ;
+ const char *cptr ;
+ int k ;
+
+ if ((cptr = strrchr (progname, '/')) != NULL)
+ progname = cptr + 1 ;
+
+ if ((cptr = strrchr (progname, '\\')) != NULL)
+ progname = cptr + 1 ;
+
+
+ sf_command (NULL, SFC_GET_LIB_VERSION, lsf_ver, sizeof (lsf_ver)) ;
+
+ printf ("\n"
+ " A Sample Rate Converter using libsndfile for file I/O and Secret \n"
+ " Rabbit Code (aka libsamplerate) for performing the conversion.\n"
+ " It works on any file format supported by libsndfile with any \n"
+ " number of channels (limited only by host memory).\n"
+ "\n"
+ " %s\n"
+ " %s\n"
+ "\n"
+ " Usage : \n"
+		"       %s -to <new sample rate> [-c <number>] <input file> <output file>\n"
+		"       %s -by <amount> [-c <number>] <input file> <output file>\n"
+ "\n", src_get_version (), lsf_ver, progname, progname) ;
+
+ puts (
+ " The optional -c argument allows the converter type to be chosen from\n"
+ " the following list :"
+ "\n"
+ ) ;
+
+ for (k = 0 ; (cptr = src_get_name (k)) != NULL ; k++)
+ printf (" %d : %s%s\n", k, cptr, k == DEFAULT_CONVERTER ? " (default)" : "") ;
+
+ puts ("") ;
+
+ exit (1) ;
+} /* usage_exit */
+
+/*==============================================================================
+*/
+
+#else /* (HAVE_SNDFILE == 0) */
+
+/* Alternative main function when libsndfile is not available. */
+
+int
+main (void)
+{ puts (
+ "\n"
+ "****************************************************************\n"
+ " This example program was compiled without libsndfile \n"
+ " (http://www.mega-nerd.com/libsndfile/).\n"
+ " It is therefore completely broken and non-functional.\n"
+ "****************************************************************\n"
+ "\n"
+ ) ;
+
+ return 0 ;
+} /* main */
+
+#endif
+
diff --git a/soxr/lsr-tests/snr_bw_test.c b/soxr/lsr-tests/snr_bw_test.c
new file mode 100644
index 0000000..55130b4
--- /dev/null
+++ b/soxr/lsr-tests/snr_bw_test.c
@@ -0,0 +1,401 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+
+#if (HAVE_FFTW3)
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 50000
+#define MAX_FREQS 4
+#define MAX_RATIOS 6
+#define MAX_SPEC_LEN (1<<15)
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846264338
+#endif
+
+enum
+{ BOOLEAN_FALSE = 0,
+ BOOLEAN_TRUE = 1
+} ;
+
+typedef struct
+{ int freq_count ;
+ double freqs [MAX_FREQS] ;
+
+ double src_ratio ;
+ int pass_band_peaks ;
+
+ double snr ;
+ double peak_value ;
+} SINGLE_TEST ;
+
+typedef struct
+{ int converter ;
+ int tests ;
+ int do_bandwidth_test ;
+ SINGLE_TEST test_data [10] ;
+} CONVERTER_TEST ;
+
+static double snr_test (SINGLE_TEST *snr_test_data, int number, int converter, int verbose) ;
+static double find_peak (float *output, int output_len) ;
+static double bandwidth_test (int converter, int verbose) ;
+
+int
+main (int argc, char *argv [])
+{ CONVERTER_TEST snr_test_data [] =
+ {
+ { SRC_ZERO_ORDER_HOLD,
+ 8,
+ BOOLEAN_FALSE,
+ { { 1, { 0.01111111111 }, 3.0, 1, 28.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.6, 1, 36.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.3, 1, 36.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.0, 1, 150.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.001, 1, 38.0, 1.0 },
+ { 2, { 0.011111, 0.324 }, 1.9999, 2, 14.0, .96 },
+ { 2, { 0.012345, 0.457 }, 0.456789, 1, 12.0, .96 },
+ { 1, { 0.3511111111 }, 1.33, 1, 10.0, 1.0 }
+ }
+ },
+
+ { SRC_LINEAR,
+ 8,
+ BOOLEAN_FALSE,
+ { { 1, { 0.01111111111 }, 3.0, 1, 73.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.6, 1, 73.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.3, 1, 73.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.0, 1, 150.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.001, 1, 77.0, 1.0 },
+ { 2, { 0.011111, 0.324 }, 1.9999, 2, 16.0, 0.96 },
+ { 2, { 0.012345, 0.457 }, 0.456789, 1, 26.0, 0.96 },
+ { 1, { 0.3511111111 }, 1.33, 1, 14.4, 0.99 }
+ }
+ },
+
+ { SRC_SINC_FASTEST,
+ 9,
+ BOOLEAN_TRUE,
+ { { 1, { 0.01111111111 }, 3.0, 1, 100.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.6, 1, 99.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.3, 1, 100.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.0, 1, 150.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.001, 1, 100.0, 1.0 },
+ { 2, { 0.011111, 0.324 }, 1.9999, 2, 97.0, 1.0 },
+ { 2, { 0.012345, 0.457 }, 0.456789, 1, 100.0, 0.5 },
+ { 2, { 0.011111, 0.45 }, 0.6, 1, 97.0, 0.5 },
+ { 1, { 0.3511111111 }, 1.33, 1, 97.0, 1.0 }
+ }
+ },
+
+ { SRC_SINC_MEDIUM_QUALITY,
+ 9,
+ BOOLEAN_TRUE,
+ { { 1, { 0.01111111111 }, 3.0, 1, 130.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.6, 1, 132.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.3, 1, 135.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.0, 1, 155.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.001, 1, 133.0, 1.0 },
+ { 2, { 0.011111, 0.324 }, 1.9999, 2, 127.0, 1.0 },
+ { 2, { 0.012345, 0.457 }, 0.456789, 1, 124.0, 0.5 },
+ { 2, { 0.011111, 0.45 }, 0.6, 1, 126.0, 0.5 },
+ { 1, { 0.43111111111 }, 1.33, 1, 121.0, 1.0 }
+ }
+ },
+
+ { SRC_SINC_BEST_QUALITY,
+ 9,
+ BOOLEAN_TRUE,
+ { { 1, { 0.01111111111 }, 3.0, 1, 147.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.6, 1, 147.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.3, 1, 147.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.0, 1, 155.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.001, 1, 146.0, 1.0 },
+ { 2, { 0.011111, 0.324 }, 1.9999, 2, 147.0, 1.0 },
+ { 2, { 0.012345, 0.457 }, 0.456789, 1, 148.0, 0.5 },
+ { 2, { 0.011111, 0.45 }, 0.6, 1, 145.0, 0.5 },
+ { 1, { 0.43111111111 }, 1.33, 1, 145.0, 1.0 }
+ }
+ },
+ } ; /* snr_test_data */
+
+ double best_snr, snr, freq3dB ;
+ int j, k, converter, verbose = 0 ;
+
+ if (argc == 2 && strcmp (argv [1], "--verbose") == 0)
+ verbose = 1 ;
+
+ puts ("") ;
+
+ for (j = 0 ; j < ARRAY_LEN (snr_test_data) ; j++)
+ { best_snr = 5000.0 ;
+
+ converter = snr_test_data [j].converter ;
+
+ printf (" Converter %d : %s\n", converter, src_get_name (converter)) ;
+ printf (" %s\n", src_get_description (converter)) ;
+
+ for (k = 0 ; k < snr_test_data [j].tests ; k++)
+ { snr = snr_test (&(snr_test_data [j].test_data [k]), k, converter, verbose) ;
+ if (best_snr > snr)
+ best_snr = snr ;
+ } ;
+
+ printf (" Worst case Signal-to-Noise Ratio : %.2f dB.\n", best_snr) ;
+
+ if (snr_test_data [j].do_bandwidth_test == BOOLEAN_FALSE)
+		{	puts ("    Bandwidth test not performed on this converter.\n") ;
+ continue ;
+ }
+
+ freq3dB = bandwidth_test (converter, verbose) ;
+
+ printf (" Measured -3dB rolloff point : %5.2f %%.\n\n", freq3dB) ;
+ } ;
+
+ return 0 ;
+} /* main */
+
+/*==============================================================================
+*/
+
+static double
+snr_test (SINGLE_TEST *test_data, int number, int converter, int verbose)
+{ static float data [BUFFER_LEN + 1] ;
+ static float output [MAX_SPEC_LEN] ;
+
+ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+
+ double output_peak, snr ;
+ int k, output_len, input_len, error ;
+
+ if (verbose != 0)
+ { printf ("\tSignal-to-Noise Ratio Test %d.\n"
+ "\t=====================================\n", number) ;
+ printf ("\tFrequencies : [ ") ;
+ for (k = 0 ; k < test_data->freq_count ; k++)
+ printf ("%6.4f ", test_data->freqs [k]) ;
+
+ printf ("]\n\tSRC Ratio : %8.4f\n", test_data->src_ratio) ;
+ }
+ else
+ { printf ("\tSignal-to-Noise Ratio Test %d : ", number) ;
+ fflush (stdout) ;
+ } ;
+
+ /* Set up the output array. */
+ if (test_data->src_ratio >= 1.0)
+ { output_len = MAX_SPEC_LEN ;
+ input_len = (int) ceil (MAX_SPEC_LEN / test_data->src_ratio) ;
+ if (input_len > BUFFER_LEN)
+ input_len = BUFFER_LEN ;
+ }
+ else
+ { input_len = BUFFER_LEN ;
+ output_len = (int) ceil (BUFFER_LEN * test_data->src_ratio) ;
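+		/* Round the output length down to a multiple of 16. */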
+ output_len &= ((-1) << 4) ;
+ if (output_len > MAX_SPEC_LEN)
+ output_len = MAX_SPEC_LEN ;
+ input_len = (int) ceil (output_len / test_data->src_ratio) ;
+ } ;
+
+ memset (output, 0, sizeof (output)) ;
+
+ /* Generate input data array. */
+ gen_windowed_sines (test_data->freq_count, test_data->freqs, 1.0, data, input_len) ;
+
+ /* Perform sample rate conversion. */
+ if ((src_state = src_new (converter, 1, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new() failed : %s.\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 1 ; /* Only one buffer worth of input. */
+
+ src_data.data_in = data ;
+ src_data.input_frames = input_len ;
+
+ src_data.src_ratio = test_data->src_ratio ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = output_len ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (verbose != 0)
+ printf ("\tOutput Len : %ld\n", src_data.output_frames_gen) ;
+
+ if (abs (src_data.output_frames_gen - output_len) > 4)
+ { printf ("\n\nLine %d : output data length should be %d.\n\n", __LINE__, output_len) ;
+ exit (1) ;
+ } ;
+
+ /* Check output peak. */
+ output_peak = find_peak (output, src_data.output_frames_gen) ;
+
+ if (verbose != 0)
+ printf ("\tOutput Peak : %6.4f\n", output_peak) ;
+
+ if (fabs (output_peak - test_data->peak_value) > 0.01)
+ { printf ("\n\nLine %d : output peak (%6.4f) should be %6.4f\n\n", __LINE__, output_peak, test_data->peak_value) ;
+ save_oct_float ("snr_test.dat", data, BUFFER_LEN, output, output_len) ;
+ exit (1) ;
+ } ;
+
+ /* Calculate signal-to-noise ratio. */
+ snr = calculate_snr (output, src_data.output_frames_gen, test_data->pass_band_peaks) ;
+
+ if (snr < 0.0)
+ { /* An error occurred. */
+ save_oct_float ("snr_test.dat", data, BUFFER_LEN, output, src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ if (verbose != 0)
+ printf ("\tSNR Ratio : %.2f dB\n", snr) ;
+
+ if (snr < test_data->snr)
+ { printf ("\n\nLine %d : SNR (%5.2f) should be > %6.2f dB\n\n", __LINE__, snr, test_data->snr) ;
+ exit (1) ;
+ } ;
+
+ if (verbose != 0)
+ puts ("\t-------------------------------------\n\tPass\n") ;
+ else
+ puts ("Pass") ;
+
+ return snr ;
+} /* snr_test */
+
+static double
+find_peak (float *data, int len)
+{ double peak = 0.0 ;
+ int k = 0 ;
+
+ for (k = 0 ; k < len ; k++)
+ if (fabs (data [k]) > peak)
+ peak = fabs (data [k]) ;
+
+ return peak ;
+} /* find_peak */
+
+
+static double
+find_attenuation (double freq, int converter, int verbose)
+{ static float input [BUFFER_LEN] ;
+ static float output [2 * BUFFER_LEN] ;
+
+ SRC_DATA src_data ;
+ double output_peak ;
+ int error ;
+
+ gen_windowed_sines (1, &freq, 1.0, input, BUFFER_LEN) ;
+
+ src_data.end_of_input = 1 ; /* Only one buffer worth of input. */
+
+ src_data.data_in = input ;
+ src_data.input_frames = BUFFER_LEN ;
+
+ src_data.src_ratio = 1.999 ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) ;
+
+ if ((error = src_simple (&src_data, converter, 1)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ output_peak = find_peak (output, ARRAY_LEN (output)) ;
+
+ if (verbose)
+ printf ("\tFreq : %6f InPeak : %6f OutPeak : %6f Atten : %6.2f dB\n",
+ freq, 1.0, output_peak, 20.0 * log10 (1.0 / output_peak)) ;
+
+ return 20.0 * log10 (1.0 / output_peak) ;
+} /* find_attenuation */
+
+static double
+bandwidth_test (int converter, int verbose)
+{ double f1, f2, a1, a2 ;
+ double freq, atten ;
+
+ f1 = 0.35 ;
+ a1 = find_attenuation (f1, converter, verbose) ;
+
+ f2 = 0.495 ;
+ a2 = find_attenuation (f2, converter, verbose) ;
+
+ if (a1 > 3.0 || a2 < 3.0)
+ { printf ("\n\nLine %d : cannot bracket 3dB point.\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
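+	/* Bisect the bracketing interval until the two attenuations are within 1 dB of each other. */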
+ while (a2 - a1 > 1.0)
+ { freq = f1 + 0.5 * (f2 - f1) ;
+ atten = find_attenuation (freq, converter, verbose) ;
+
+ if (atten < 3.0)
+ { f1 = freq ;
+ a1 = atten ;
+ }
+ else
+ { f2 = freq ;
+ a2 = atten ;
+ } ;
+ } ;
+
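+	/* Linearly interpolate to the 3 dB point, then scale so Nyquist (freq == 0.5) maps to 100 %. */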
+ freq = f1 + (3.0 - a1) * (f2 - f1) / (a2 - a1) ;
+
+ return 200.0 * freq ;
+} /* bandwidth_test */
+
+#else /* (HAVE_FFTW3) == 0 */
+
+/* Alternative main function when FFTW is not available. */
+
+int
+main (void)
+{ puts ("\n"
+ "****************************************************************\n"
+ " This test cannot be run without FFTW (http://www.fftw.org/).\n"
+ " Both the real and the complex versions of the library are\n"
+ " required.") ;
+ puts ("****************************************************************\n") ;
+
+ return 0 ;
+} /* main */
+
+#endif
+
diff --git a/soxr/lsr-tests/termination_test.c b/soxr/lsr-tests/termination_test.c
new file mode 100644
index 0000000..6bb0fc0
--- /dev/null
+++ b/soxr/lsr-tests/termination_test.c
@@ -0,0 +1,339 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define SHORT_BUFFER_LEN 2048
+#define LONG_BUFFER_LEN ((1 << 16) - 20)
+
+static void simple_test (int converter) ;
+static void stream_test (int converter, double ratio) ;
+static void init_term_test (int converter, double ratio) ;
+
+static int next_block_length (int reset) ;
+
+int
+main (void)
+{ static double src_ratios [] =
+ { 0.999900, 1.000100, 0.789012, 1.200000, 0.333333, 3.100000,
+ 0.125000, 8.000000, 0.099900, 9.990000, 0.100000, 10.00000
+ } ;
+
+ int k ;
+
+ puts ("\n Zero Order Hold interpolator:") ;
+
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ init_term_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ;
+ puts ("") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ stream_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ;
+
+
+ puts ("\n Linear interpolator:") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ init_term_test (SRC_LINEAR, src_ratios [k]) ;
+ puts ("") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ stream_test (SRC_LINEAR, src_ratios [k]) ;
+
+
+ puts ("\n Sinc interpolator:") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ init_term_test (SRC_SINC_FASTEST, src_ratios [k]) ;
+ puts ("") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ stream_test (SRC_SINC_FASTEST, src_ratios [k]) ;
+
+ puts ("") ;
+
+ simple_test (SRC_SINC_FASTEST) ;
+
+ return 0 ;
+} /* main */
+
+static void
+simple_test (int converter)
+{
+ int ilen = 199030, olen = 1000, error ;
+
+ {
+ float in [ilen] ;
+ float out [olen] ;
+ double ratio = (1.0 * olen) / ilen ;
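+	/* Positional initialiser ; SRC_DATA field order is data_in, data_out, input_frames, output_frames, input_frames_used, output_frames_gen, end_of_input, src_ratio. */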
+ SRC_DATA src_data =
+ { in, out,
+ ilen, olen,
+ 0, 0, 0,
+ ratio
+ } ;
+
+ error = src_simple (&src_data, converter, 1) ;
+ if (error)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+ } ;
+
+ return ;
+} /* simple_test */
+
+static void
+init_term_test (int converter, double src_ratio)
+{ static float input [SHORT_BUFFER_LEN], output [SHORT_BUFFER_LEN] ;
+
+ SRC_DATA src_data ;
+
+ int k, input_len, output_len, error, terminate ;
+
+ printf ("\tinit_term_test (SRC ratio = %7.4f) .......... ", src_ratio) ;
+ fflush (stdout) ;
+
+	/* Calculate maximum input and output lengths. */
+ if (src_ratio >= 1.0)
+ { output_len = SHORT_BUFFER_LEN ;
+ input_len = (int) floor (SHORT_BUFFER_LEN / src_ratio) ;
+ }
+ else
+ { input_len = SHORT_BUFFER_LEN ;
+ output_len = (int) floor (SHORT_BUFFER_LEN * src_ratio) ;
+ } ;
+
+ /* Reduce input_len by 10 so output is longer than necessary. */
+ input_len -= 10 ;
+
+ for (k = 0 ; k < ARRAY_LEN (input) ; k++)
+ input [k] = 1.0 ;
+
+ if (output_len > SHORT_BUFFER_LEN)
+ { printf ("\n\nLine %d : output_len > SHORT_BUFFER_LEN\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
+ src_data.data_in = input ;
+ src_data.input_frames = input_len ;
+
+ src_data.src_ratio = src_ratio ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = SHORT_BUFFER_LEN ;
+
+ if ((error = src_simple (&src_data, converter, 1)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ terminate = (int) ceil ((src_ratio >= 1.0) ? 1 : 1.0 / src_ratio) ;
+
+ if (fabs (src_ratio * input_len - src_data.output_frames_gen) > terminate)
+ { printf ("\n\nLine %d : Bad output frame count.\n\n", __LINE__) ;
+ printf ("\tterminate : %d\n", terminate) ;
+ printf ("\tsrc_ratio : %.4f\n", src_ratio) ;
+ printf ("\tinput_len : %d\n"
+ "\tinput_len * src_ratio : %f\n", input_len, input_len * src_ratio) ;
+ printf ("\toutput_frames_gen : %ld\n\n", src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ if (abs (src_data.input_frames_used - input_len) > 1)
+ { printf ("\n\nLine %d : input_frames_used should be %d, is %ld.\n\n",
+ __LINE__, input_len, src_data.input_frames_used) ;
+ printf ("\tsrc_ratio : %.4f\n", src_ratio) ;
+ printf ("\tinput_len : %d\n\tinput_used : %ld\n\n", input_len, src_data.input_frames_used) ;
+ exit (1) ;
+ } ;
+
+ if (fabs (output [0]) < 0.1)
+ { printf ("\n\nLine %d : First output sample is bad.\n\n", __LINE__) ;
+ printf ("\toutput [0] == %f\n\n", output [0]) ;
+ exit (1) ;
+ }
+
+ puts ("ok") ;
+
+ return ;
+} /* init_term_test */
+
+static void
+stream_test (int converter, double src_ratio)
+{ static float input [LONG_BUFFER_LEN], output [LONG_BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+
+ int input_len, output_len, current_in, current_out ;
+ int k, error, terminate ;
+
+ printf ("\tstream_test (SRC ratio = %7.4f) .......... ", src_ratio) ;
+ fflush (stdout) ;
+
+	/* Fill the input with a linear ramp so any dropped or repeated samples are easy to spot. */
+	for (k = 0 ; k < LONG_BUFFER_LEN ; k++)
+		input [k] = k * 1.0 ;
+
+	/* Calculate maximum input and output lengths. */
+ if (src_ratio >= 1.0)
+ { output_len = LONG_BUFFER_LEN ;
+ input_len = (int) floor (LONG_BUFFER_LEN / src_ratio) ;
+ }
+ else
+ { input_len = LONG_BUFFER_LEN ;
+ output_len = (int) floor (LONG_BUFFER_LEN * src_ratio) ;
+ } ;
+
+ /* Reduce input_len by 10 so output is longer than necessary. */
+ input_len -= 20 ;
+
+ if (output_len > LONG_BUFFER_LEN)
+ { printf ("\n\nLine %d : output_len > LONG_BUFFER_LEN\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
+ current_in = current_out = 0 ;
+
+ /* Perform sample rate conversion. */
+ if ((src_state = src_new (converter, 1, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new() failed : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 0 ; /* Set this later. */
+
+ src_data.data_in = input ;
+
+ src_data.src_ratio = src_ratio ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) / 10 ;
+
+ terminate = 1 + (int) ceil ((src_ratio >= 1.0) ? src_ratio : 1.0 / src_ratio) ;
+
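+	/* Stream variable-sized blocks through the converter, checking the frame-count invariants after every call. */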
+ while (1)
+ {
+ src_data.input_frames = next_block_length (0) ;
+ src_data.input_frames = MIN (src_data.input_frames, input_len - current_in) ;
+
+ src_data.output_frames = ARRAY_LEN (output) - current_out ;
+ /*-Erik MIN (src_data.output_frames, output_len - current_out) ;-*/
+
+ src_data.end_of_input = (current_in >= input_len) ? 1 : 0 ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ printf (" src_data.input_frames : %ld\n", src_data.input_frames) ;
+ printf (" src_data.output_frames : %ld\n\n", src_data.output_frames) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.end_of_input && src_data.output_frames_gen == 0)
+ break ;
+
+ if (src_data.input_frames_used > src_data.input_frames)
+ { printf ("\n\nLine %d : input_frames_used > input_frames\n\n", __LINE__) ;
+ printf (" src_data.input_frames : %ld\n", src_data.input_frames) ;
+ printf (" src_data.input_frames_used : %ld\n", src_data.input_frames_used) ;
+ printf (" src_data.output_frames : %ld\n", src_data.output_frames) ;
+ printf (" src_data.output_frames_gen : %ld\n\n", src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.input_frames_used < 0)
+ { printf ("\n\nLine %d : input_frames_used (%ld) < 0\n\n", __LINE__, src_data.input_frames_used) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.output_frames_gen < 0)
+ { printf ("\n\nLine %d : output_frames_gen (%ld) < 0\n\n", __LINE__, src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ current_in += src_data.input_frames_used ;
+ current_out += src_data.output_frames_gen ;
+
+ if (current_in > input_len + terminate)
+ { printf ("\n\nLine %d : current_in (%d) > input_len (%d + %d)\n\n", __LINE__, current_in, input_len, terminate) ;
+ exit (1) ;
+ } ;
+
+ if (current_out > output_len)
+ { printf ("\n\nLine %d : current_out (%d) > output_len (%d)\n\n", __LINE__, current_out, output_len) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.input_frames_used > input_len)
+ { printf ("\n\nLine %d : input_frames_used (%ld) > %d\n\n", __LINE__, src_data.input_frames_used, input_len) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.output_frames_gen > output_len)
+ { printf ("\n\nLine %d : output_frames_gen (%ld) > %d\n\n", __LINE__, src_data.output_frames_gen, output_len) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.data_in == NULL && src_data.output_frames_gen == 0)
+ break ;
+
+
+ src_data.data_in += src_data.input_frames_used ;
+ src_data.data_out += src_data.output_frames_gen ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (fabs (current_out - src_ratio * input_len) > terminate)
+ { printf ("\n\nLine %d : bad output data length %d should be %2.1f +/- %d.\n", __LINE__,
+ current_out, src_ratio * input_len, terminate) ;
+ printf ("\tsrc_ratio : %.4f\n", src_ratio) ;
+ printf ("\tinput_len : %d\n\tinput_used : %d\n", input_len, current_in) ;
+ printf ("\toutput_len : %d\n\toutput_gen : %d\n\n", output_len, current_out) ;
+ exit (1) ;
+ } ;
+
+ if (current_in != input_len)
+ { printf ("\n\nLine %d : unused input.\n", __LINE__) ;
+ printf ("\tinput_len : %d\n", input_len) ;
+ printf ("\tinput_frames_used : %d\n\n", current_in) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* stream_test */
+
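+/* Cycle through a table of block lengths so successive calls return irregular chunk sizes. */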
+static int
+next_block_length (int reset)
+{ static int block_lengths [] = /* Should be an odd length. */
+ { /*-2, 500, 5, 400, 10, 300, 20, 200, 50, 100, 70 -*/
+ 5, 400, 10, 300, 20, 200, 50, 100, 70
+ } ;
+ static int block_len_index = 0 ;
+
+ if (reset)
+ block_len_index = 0 ;
+ else
+ block_len_index = (block_len_index + 1) % ARRAY_LEN (block_lengths) ;
+
+ return block_lengths [block_len_index] ;
+} /* next_block_length */
+
diff --git a/soxr/lsr-tests/throughput_test.c b/soxr/lsr-tests/throughput_test.c
new file mode 100644
index 0000000..28b6fe5
--- /dev/null
+++ b/soxr/lsr-tests/throughput_test.c
@@ -0,0 +1,212 @@
+/*
+** Copyright (C) 2004-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "config.h"
+
+#include "util.h"
+#include "float_cast.h"
+
+#define BUFFER_LEN (1<<16)
+
+static float input [BUFFER_LEN] ;
+static float output [BUFFER_LEN] ;
+
+static long
+throughput_test (int converter, long best_throughput)
+{ SRC_DATA src_data ;
+ clock_t start_time, clock_time ;
+ double duration ;
+ long total_frames = 0, throughput ;
+ int error ;
+
+ printf (" %-30s ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ src_data.data_in = input ;
+ src_data.input_frames = ARRAY_LEN (input) ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) ;
+
+ src_data.src_ratio = 0.99 ;
+
+ sleep (2) ;
+
+ start_time = clock () ;
+
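+	/* Keep converting the same buffer until at least 3 seconds of CPU
+	** time have elapsed; throughput is then total frames generated
+	** divided by the elapsed CPU time. */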
+ do
+ {
+ if ((error = src_simple (&src_data, converter, 1)) != 0)
+ { puts (src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ total_frames += src_data.output_frames_gen ;
+
+ clock_time = clock () - start_time ;
+ duration = (1.0 * clock_time) / CLOCKS_PER_SEC ;
+ }
+ while (duration < 3.0) ;
+
+ if (src_data.input_frames_used != ARRAY_LEN (input))
+ { printf ("\n\nLine %d : input frames used %ld should be %d\n", __LINE__, src_data.input_frames_used, ARRAY_LEN (input)) ;
+ exit (1) ;
+ } ;
+
+ if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2)
+ { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ;
+ printf (" input len : %d\n", ARRAY_LEN (input)) ;
+ printf (" output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen,
+ floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ;
+ exit (1) ;
+ } ;
+
+ throughput = lrint (floor (total_frames / duration)) ;
+
+ if (best_throughput == 0)
+ { best_throughput = MAX (throughput, best_throughput) ;
+ printf ("%5.2f %10ld\n", duration, throughput) ;
+ }
+ else
+ { best_throughput = MAX (throughput, best_throughput) ;
+ printf ("%5.2f %10ld %10ld\n", duration, throughput, best_throughput) ;
+ }
+
+
+ return best_throughput ;
+} /* throughput_test */
+
+static void
+single_run (void)
+{
+
+ printf ("\n CPU name : %s\n", get_cpu_name ()) ;
+
+ puts (
+ "\n"
+ " Converter Duration Throughput\n"
+ " -----------------------------------------------------------"
+ ) ;
+
+ throughput_test (SRC_ZERO_ORDER_HOLD, 0) ;
+ throughput_test (SRC_LINEAR, 0) ;
+ throughput_test (SRC_SINC_FASTEST, 0) ;
+ throughput_test (SRC_SINC_MEDIUM_QUALITY, 0) ;
+ throughput_test (SRC_SINC_BEST_QUALITY, 0) ;
+
+ puts ("") ;
+ return ;
+} /* single_run */
+
+static void
+multi_run (int run_count)
+{ long zero_order_hold = 0, linear = 0 ;
+ long sinc_fastest = 0, sinc_medium = 0, sinc_best = 0 ;
+ int k ;
+
+ puts (
+ "\n"
+ " Converter Duration Throughput Best Throughput\n"
+ " --------------------------------------------------------------------------------"
+ ) ;
+
+ for (k = 0 ; k < run_count ; k++)
+ { zero_order_hold = throughput_test (SRC_ZERO_ORDER_HOLD, zero_order_hold) ;
+ linear = throughput_test (SRC_LINEAR, linear) ;
+ sinc_fastest = throughput_test (SRC_SINC_FASTEST, sinc_fastest) ;
+ sinc_medium = throughput_test (SRC_SINC_MEDIUM_QUALITY, sinc_medium) ;
+ sinc_best = throughput_test (SRC_SINC_BEST_QUALITY, sinc_best) ;
+
+ puts ("") ;
+
+ /* Let the CPU cool down. We might be running on a laptop. */
+ sleep (10) ;
+ } ;
+
+ printf ("\n CPU name : %s\n", get_cpu_name ()) ;
+
+ puts (
+ "\n"
+ " Converter Best Throughput\n"
+ " ------------------------------------------------"
+ ) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_ZERO_ORDER_HOLD), zero_order_hold) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_LINEAR), linear) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_FASTEST), sinc_fastest) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_MEDIUM_QUALITY), sinc_medium) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_BEST_QUALITY), sinc_best) ;
+
+ puts ("") ;
+} /* multi_run */
+
+static void
+usage_exit (const char * argv0)
+{ const char * cptr ;
+
+ if ((cptr = strrchr (argv0, '/')) != NULL)
+ argv0 = cptr ;
+
+ printf (
+ "Usage :\n"
+ " %s - Single run of the throughput test.\n"
+ " %s --best-of N - Do N runs of test a print bext result.\n"
+ "\n",
+ argv0, argv0) ;
+
+ exit (0) ;
+} /* usage_exit */
+
+int
+main (int argc, char ** argv)
+{ double freq ;
+
+ memset (input, 0, sizeof (input)) ;
+ freq = 0.01 ;
+ gen_windowed_sines (1, &freq, 1.0, input, BUFFER_LEN) ;
+
+ if (argc == 1)
+ single_run () ;
+ else if (argc == 3 && strcmp (argv [1], "--best-of") == 0)
+ { int run_count = atoi (argv [2]) ;
+
+ if (run_count < 1 || run_count > 20)
+ { printf ("Please be sensible. Run count should be in range (1, 10].\n") ;
+ exit (1) ;
+ } ;
+
+ multi_run (run_count) ;
+ }
+ else
+ usage_exit (argv [0]) ;
+
+ puts (
+ " Duration is in seconds.\n"
+ " Throughput is in samples/sec (more is better).\n"
+ ) ;
+
+ return 0 ;
+} /* main */
+
diff --git a/soxr/lsr-tests/util.c b/soxr/lsr-tests/util.c
new file mode 100644
index 0000000..fefcaf2
--- /dev/null
+++ b/soxr/lsr-tests/util.c
@@ -0,0 +1,230 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "util.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846264338
+#endif
+
+void
+gen_windowed_sines (int freq_count, const double *freqs, double max, float *output, int output_len)
+{ int k, freq ;
+ double amplitude, phase ;
+
+ amplitude = max / freq_count ;
+
+ for (k = 0 ; k < output_len ; k++)
+ output [k] = 0.0 ;
+
+ for (freq = 0 ; freq < freq_count ; freq++)
+ { phase = 0.9 * M_PI / freq_count ;
+
+ if (freqs [freq] <= 0.0 || freqs [freq] >= 0.5)
+ { printf ("\n%s : Error : freq [%d] == %g is out of range. Should be < 0.5.\n", __FILE__, freq, freqs [freq]) ;
+ exit (1) ;
+ } ;
+
+ for (k = 0 ; k < output_len ; k++)
+ output [k] += amplitude * sin (freqs [freq] * (2 * k) * M_PI + phase) ;
+ } ;
+
+ /* Apply Hanning Window. */
+ for (k = 0 ; k < output_len ; k++)
+ output [k] *= 0.5 - 0.5 * cos ((2 * k) * M_PI / (output_len - 1)) ;
+
+ /* data [k] *= 0.3635819 - 0.4891775 * cos ((2 * k) * M_PI / (output_len - 1))
+ + 0.1365995 * cos ((4 * k) * M_PI / (output_len - 1))
+ - 0.0106411 * cos ((6 * k) * M_PI / (output_len - 1)) ;
+ */
+
+ return ;
+} /* gen_windowed_sines */
+
+void
+save_oct_float (char *filename, float *input, int in_len, float *output, int out_len)
+{ FILE *file ;
+ int k ;
+
+ printf ("Dumping input and output data to file : %s.\n\n", filename) ;
+
+ if (! (file = fopen (filename, "w")))
+ return ;
+
+ fprintf (file, "# Not created by Octave\n") ;
+
+ fprintf (file, "# name: input\n") ;
+ fprintf (file, "# type: matrix\n") ;
+ fprintf (file, "# rows: %d\n", in_len) ;
+ fprintf (file, "# columns: 1\n") ;
+
+ for (k = 0 ; k < in_len ; k++)
+ fprintf (file, "% g\n", input [k]) ;
+
+ fprintf (file, "# name: output\n") ;
+ fprintf (file, "# type: matrix\n") ;
+ fprintf (file, "# rows: %d\n", out_len) ;
+ fprintf (file, "# columns: 1\n") ;
+
+ for (k = 0 ; k < out_len ; k++)
+ fprintf (file, "% g\n", output [k]) ;
+
+ fclose (file) ;
+ return ;
+} /* save_oct_float */
+
+void
+save_oct_double (char *filename, double *input, int in_len, double *output, int out_len)
+{ FILE *file ;
+ int k ;
+
+ printf ("Dumping input and output data to file : %s.\n\n", filename) ;
+
+ if (! (file = fopen (filename, "w")))
+ return ;
+
+ fprintf (file, "# Not created by Octave\n") ;
+
+ fprintf (file, "# name: input\n") ;
+ fprintf (file, "# type: matrix\n") ;
+ fprintf (file, "# rows: %d\n", in_len) ;
+ fprintf (file, "# columns: 1\n") ;
+
+ for (k = 0 ; k < in_len ; k++)
+ fprintf (file, "% g\n", input [k]) ;
+
+ fprintf (file, "# name: output\n") ;
+ fprintf (file, "# type: matrix\n") ;
+ fprintf (file, "# rows: %d\n", out_len) ;
+ fprintf (file, "# columns: 1\n") ;
+
+ for (k = 0 ; k < out_len ; k++)
+ fprintf (file, "% g\n", output [k]) ;
+
+ fclose (file) ;
+ return ;
+} /* save_oct_double */
+
+void
+interleave_data (const float *in, float *out, int frames, int channels)
+{ int fr, ch ;
+
+ for (fr = 0 ; fr < frames ; fr++)
+ for (ch = 0 ; ch < channels ; ch++)
+ out [ch + channels * fr] = in [fr + frames * ch] ;
+
+ return ;
+} /* interleave_data */
+
+void
+deinterleave_data (const float *in, float *out, int frames, int channels)
+{ int fr, ch ;
+
+ for (ch = 0 ; ch < channels ; ch++)
+ for (fr = 0 ; fr < frames ; fr++)
+ out [fr + frames * ch] = in [ch + channels * fr] ;
+
+ return ;
+} /* deinterleave_data */
+
+void
+reverse_data (float *data, int datalen)
+{ int left, right ;
+ float temp ;
+
+ left = 0 ;
+ right = datalen - 1 ;
+
+ while (left < right)
+ { temp = data [left] ;
+ data [left] = data [right] ;
+ data [right] = temp ;
+ left ++ ;
+ right -- ;
+ } ;
+
+} /* reverse_data */
+
+const char *
+get_cpu_name (void)
+{
+ const char *name = "Unknown", *search = NULL ;
+ static char buffer [512] ;
+ FILE * file = NULL ;
+ int is_pipe = 0 ;
+
+#if defined (__linux__)
+ file = fopen ("/proc/cpuinfo", "r") ;
+ search = "model name" ;
+#elif defined (__APPLE__)
+ file = popen ("/usr/sbin/system_profiler -detailLevel full SPHardwareDataType", "r") ;
+ search = "Processor Name" ;
+ is_pipe = 1 ;
+#elif defined (__FreeBSD__)
+ file = popen ("sysctl -a", "r") ;
+ search = "hw.model" ;
+ is_pipe = 1 ;
+#else
+ file = NULL ;
+#endif
+
+ if (file == NULL)
+ return name ;
+
+ if (search == NULL)
+ { printf ("Error : search is NULL in function %s.\n", __func__) ;
+ return name ;
+ } ;
+
+ while (fgets (buffer, sizeof (buffer), file) != NULL)
+ if (strstr (buffer, search))
+ { char *src, *dest ;
+
+ if ((src = strchr (buffer, ':')) != NULL)
+ { src ++ ;
+ while (isspace (src [0]))
+ src ++ ;
+ name = src ;
+
+ /* Remove consecutive spaces. */
+ src ++ ;
+ for (dest = src ; src [0] ; src ++)
+ { if (isspace (src [0]) && isspace (dest [-1]))
+ continue ;
+ dest [0] = src [0] ;
+ dest ++ ;
+ } ;
+ dest [0] = 0 ;
+ break ;
+ } ;
+ } ;
+
+ if (is_pipe)
+ pclose (file) ;
+ else
+ fclose (file) ;
+
+ return name ;
+} /* get_cpu_name */
+
diff --git a/soxr/lsr-tests/util.h b/soxr/lsr-tests/util.h
new file mode 100644
index 0000000..80b1b49
--- /dev/null
+++ b/soxr/lsr-tests/util.h
@@ -0,0 +1,50 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#define ABS(a) (((a) < 0) ? - (a) : (a))
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+#define MAX(a,b) (((a) >= (b)) ? (a) : (b))
+
+#define ARRAY_LEN(x) ((int) (sizeof (x) / sizeof ((x) [0])))
+
+void gen_windowed_sines (int freq_count, const double *freqs, double max, float *output, int output_len) ;
+
+void save_oct_float (char *filename, float *input, int in_len, float *output, int out_len) ;
+void save_oct_double (char *filename, double *input, int in_len, double *output, int out_len) ;
+
+void interleave_data (const float *in, float *out, int frames, int channels) ;
+
+void deinterleave_data (const float *in, float *out, int frames, int channels) ;
+
+void reverse_data (float *data, int datalen) ;
+
+double calculate_snr (float *data, int len, int expected_peaks) ;
+
+const char * get_cpu_name (void) ;
+
+#if OS_IS_WIN32
+/*
+** Extra Win32 hacks.
+**
+** Despite Microsoft's claim of Windows being POSIX compatible, it has '_sleep'
+** instead of 'sleep'.
+*/
+
+#define sleep _sleep
+#endif
+
diff --git a/soxr/lsr-tests/varispeed_test.c b/soxr/lsr-tests/varispeed_test.c
new file mode 100644
index 0000000..52b2f43
--- /dev/null
+++ b/soxr/lsr-tests/varispeed_test.c
@@ -0,0 +1,152 @@
+/*
+** Copyright (C) 2006-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN (1 << 16)
+
+static void varispeed_test (int converter, double target_snr) ;
+
+int
+main (void)
+{
+ puts ("") ;
+ printf (" Zero Order Hold interpolator : ") ;
+ varispeed_test (SRC_ZERO_ORDER_HOLD, 10.0) ;
+
+ printf (" Linear interpolator : ") ;
+ varispeed_test (SRC_LINEAR, 10.0) ;
+
+ printf (" Sinc interpolator : ") ;
+ varispeed_test (SRC_SINC_FASTEST, 115.0) ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+static void
+varispeed_test (int converter, double target_snr)
+{ static float input [BUFFER_LEN], output [BUFFER_LEN] ;
+ double sine_freq, snr ;
+
+ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+
+ int input_len, error ;
+
+ memset (input, 0, sizeof (input)) ;
+
+ input_len = ARRAY_LEN (input) / 2 ;
+
+ sine_freq = 0.0111 ;
+ gen_windowed_sines (1, &sine_freq, 1.0, input, input_len) ;
+
+ /* Perform sample rate conversion. */
+ if ((src_state = src_new (converter, 1, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new() failed : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 1 ;
+
+ src_data.data_in = input ;
+ src_data.input_frames = input_len ;
+
+ src_data.src_ratio = 3.0 ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) ;
+
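+	/* The conversion starts at ratio 1/3 (set just below via
+	** src_set_ratio) and src_process() then sweeps the ratio towards
+	** src_data.src_ratio (3.0) across the input, exercising the
+	** varispeed code path. */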
+ if ((error = src_set_ratio (src_state, 1.0 / src_data.src_ratio)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ printf (" src_data.input_frames : %ld\n", src_data.input_frames) ;
+ printf (" src_data.output_frames : %ld\n\n", src_data.output_frames) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.input_frames_used != input_len)
+ { printf ("\n\nLine %d : unused input.\n", __LINE__) ;
+ printf ("\tinput_len : %d\n", input_len) ;
+ printf ("\tinput_frames_used : %ld\n\n", src_data.input_frames_used) ;
+ exit (1) ;
+ } ;
+
+ /* Copy the last output to the input. */
+ memcpy (input, output, sizeof (input)) ;
+ reverse_data (input, src_data.output_frames_gen) ;
+
+ if ((error = src_reset (src_state)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 1 ;
+
+ src_data.data_in = input ;
+ input_len = src_data.input_frames = src_data.output_frames_gen ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) ;
+
+ if ((error = src_set_ratio (src_state, 1.0 / src_data.src_ratio)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ printf (" src_data.input_frames : %ld\n", src_data.input_frames) ;
+ printf (" src_data.output_frames : %ld\n\n", src_data.output_frames) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.input_frames_used != input_len)
+ { printf ("\n\nLine %d : unused input.\n", __LINE__) ;
+ printf ("\tinput_len : %d\n", input_len) ;
+ printf ("\tinput_frames_used : %ld\n\n", src_data.input_frames_used) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ snr = calculate_snr (output, src_data.output_frames_gen, 1) ;
+
+ if (target_snr > snr)
+ { printf ("\n\nLine %d : snr (%3.1f) does not meet target (%3.1f)\n\n", __LINE__, snr, target_snr) ;
+ save_oct_float ("varispeed.mat", input, src_data.input_frames, output, src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* varispeed_test */
+
diff --git a/soxr/msvc/README b/soxr/msvc/README
new file mode 100644
index 0000000..5b7f60a
--- /dev/null
+++ b/soxr/msvc/README
@@ -0,0 +1,22 @@
+SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+
+CMake is the recommended way to configure, build (as either a DLL or a static
+library), and install libsoxr for general use on MS-Windows, as on other OSs.
+
+However, building within MS Visual Studio is also possible, as exemplified by
+the accompanying files:
+
+ * soxr-config.h Pre-configured for a modern Win32 system.
+
+ * libsoxr.vcproj Builds the library as a DLL, per above.
+
+ * libsoxr.sln, Build an example exe using the above.
+ example1.vcproj
+
+The following notes apply to adaptation of these files:
+
+ * For a system without AVX support, set WITH_CR64S to 0 in
+ soxr-config.h and exclude the three files ...64s.c from the build.
+
+ * If changing libsoxr.vcproj to build a static library, then also
+ remove the preprocessor definition: SOXR_DLL.
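+
+    For example, for a system without AVX, the corresponding line in
+    soxr-config.h would change from
+
+        #define WITH_CR64S 1
+
+    to
+
+        #define WITH_CR64S 0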
diff --git a/soxr/msvc/example1.vcproj b/soxr/msvc/example1.vcproj
new file mode 100644
index 0000000..170a522
--- /dev/null
+++ b/soxr/msvc/example1.vcproj
@@ -0,0 +1,82 @@
+(82 lines of Visual Studio 2008 project XML; the markup was stripped in extraction and is not recoverable.)
diff --git a/soxr/msvc/libsoxr.sln b/soxr/msvc/libsoxr.sln
new file mode 100644
index 0000000..c1a840b
--- /dev/null
+++ b/soxr/msvc/libsoxr.sln
@@ -0,0 +1,29 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual C++ Express 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example1", "example1.vcproj", "{CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}"
+ ProjectSection(ProjectDependencies) = postProject
+ {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB} = {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libsoxr", "libsoxr.vcproj", "{4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Release|Win32 = Release|Win32
+ Debug|Win32 = Debug|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Release|Win32.ActiveCfg = Release|Win32
+ {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Release|Win32.Build.0 = Release|Win32
+ {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Debug|Win32.ActiveCfg = Debug|Win32
+ {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Debug|Win32.Build.0 = Debug|Win32
+ {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Release|Win32.ActiveCfg = Release|Win32
+ {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Release|Win32.Build.0 = Release|Win32
+ {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Debug|Win32.ActiveCfg = Debug|Win32
+ {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Debug|Win32.Build.0 = Debug|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/soxr/msvc/libsoxr.vcproj b/soxr/msvc/libsoxr.vcproj
new file mode 100644
index 0000000..499f895
--- /dev/null
+++ b/soxr/msvc/libsoxr.vcproj
@@ -0,0 +1,97 @@
+(97 lines of Visual Studio 2008 project XML; the markup was stripped in extraction and is not recoverable.)
diff --git a/soxr/msvc/soxr-config.h b/soxr/msvc/soxr-config.h
new file mode 100644
index 0000000..74415e2
--- /dev/null
+++ b/soxr/msvc/soxr-config.h
@@ -0,0 +1,30 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+/* N.B. Pre-configured for modern MS-Windows systems. However, the normal
+ * procedure is to use the cmake configuration and build system. See INSTALL. */
+
+#if !defined soxr_config_included
+#define soxr_config_included
+
+#define AVCODEC_FOUND 0
+#define AVUTIL_FOUND 0
+#define WITH_PFFFT 1
+
+#define HAVE_FENV_H 1
+#define HAVE_STDBOOL_H 1
+#define HAVE_STDINT_H 1
+#define HAVE_LRINT 1
+#define HAVE_BIGENDIAN 0
+
+#define WITH_CR32 1
+#define WITH_CR32S 1
+#define WITH_CR64 1
+#define WITH_CR64S 1
+#define WITH_VR32 1
+
+#define WITH_HI_PREC_CLOCK 1
+#define WITH_FLOAT_STD_PREC_CLOCK 0
+#define WITH_DEV_TRACE 1
+
+#endif
diff --git a/soxr/multi-arch b/soxr/multi-arch
new file mode 100644
index 0000000..288b578
--- /dev/null
+++ b/soxr/multi-arch
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+rm -f CMakeCache.txt # Prevent interference from any in-tree build
+
+j=-j4
+build=Release
+
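+# Each list entry is <compiler>:<system>; an empty <system> denotes a native
+# build, for which ctest is also run (see the [ /$system = / ] test below).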
+for n in \
+ cc: \
+ clang: \
+ arm-linux-gnueabi-gcc:Linux \
+ x86_64-w64-mingw32-gcc:Windows \
+ i686-w64-mingw32-gcc:Windows \
+ ; do
+ compiler=$(echo $n | sed 's/:.*//')
+ system=$(echo $n | sed 's/.*://')
+ dir=$build-$compiler
+  which $compiler > /dev/null || { echo $compiler not found; continue; }
+  (
+ echo "***" $dir
+ mkdir -p $dir
+ cd $dir
+ cmake -DCMAKE_BUILD_TYPE=$build -DCMAKE_C_COMPILER=$compiler -DCMAKE_SYSTEM_NAME="$system" -DBUILD_SHARED_LIBS=OFF -DWITH_OPENMP=OFF ..
+ make $j && [ /$system = / ] && ctest -j || true
+ cd tests
+ ../../tests/throughput-test && SOXR_THROUGHPUT_GAIN=.6 ../../tests/throughput-test 2 3 || true
+ )
+done
diff --git a/soxr/soxr-config.h.in b/soxr/soxr-config.h.in
index 227bcfd..00b3b45 100644
--- a/soxr/soxr-config.h.in
+++ b/soxr/soxr-config.h.in
@@ -1,46 +1,27 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#if !defined soxr_config_included
#define soxr_config_included
-#define HAVE_SINGLE_PRECISION @HAVE_SINGLE_PRECISION@
-#define HAVE_DOUBLE_PRECISION @HAVE_DOUBLE_PRECISION@
-#define HAVE_AVFFT @HAVE_AVFFT@
-#define HAVE_SIMD @HAVE_SIMD@
-#define HAVE_FENV_H @HAVE_FENV_H@
-#define HAVE_LRINT @HAVE_LRINT@
-#define WORDS_BIGENDIAN @WORDS_BIGENDIAN@
+#cmakedefine01 AVCODEC_FOUND
+#cmakedefine01 AVUTIL_FOUND
+#cmakedefine01 WITH_PFFFT
-#include <limits.h>
+#cmakedefine01 HAVE_FENV_H
+#cmakedefine01 HAVE_STDBOOL_H
+#cmakedefine01 HAVE_STDINT_H
+#cmakedefine01 HAVE_LRINT
+#cmakedefine01 HAVE_BIGENDIAN
-#undef bool
-#undef false
-#undef true
-#define bool int
-#define false 0
-#define true 1
+#cmakedefine01 WITH_CR32
+#cmakedefine01 WITH_CR32S
+#cmakedefine01 WITH_CR64
+#cmakedefine01 WITH_CR64S
+#cmakedefine01 WITH_VR32
-#undef int16_t
-#undef int32_t
-#undef int64_t
-#undef uint32_t
-#undef uint64_t
-#define int16_t short
-#if LONG_MAX > 2147483647L
- #define int32_t int
- #define int64_t long
-#elif LONG_MAX < 2147483647L
-#error this library requires that 'long int' has at least 32-bits
-#else
- #define int32_t long
- #if defined _MSC_VER
- #define int64_t __int64
- #else
- #define int64_t long long
- #endif
-#endif
-#define uint32_t unsigned int32_t
-#define uint64_t unsigned int64_t
+#cmakedefine01 WITH_HI_PREC_CLOCK
+#cmakedefine01 WITH_FLOAT_STD_PREC_CLOCK
+#cmakedefine01 WITH_DEV_TRACE
#endif
diff --git a/soxr/src/CMakeLists.txt b/soxr/src/CMakeLists.txt
index cd41aa7..bb01a0d 100644
--- a/soxr/src/CMakeLists.txt
+++ b/soxr/src/CMakeLists.txt
@@ -1,4 +1,4 @@
-# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
@@ -7,90 +7,89 @@
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/vr-coefs.h)
include_directories(${CMAKE_CURRENT_BINARY_DIR})
- set_property(SOURCE vr32.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h)
+ set_property(SOURCE vr32.c
+ APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h)
add_executable (vr-coefs vr-coefs.c)
+ target_link_libraries (vr-coefs ${LIBM_LIBRARIES})
ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h
COMMAND vr-coefs > ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h
DEPENDS vr-coefs)
endif ()
-# Minimalist boo configuration:
-add_definitions (${PROJECT_C_FLAGS} -DSOXR_LIB -DSOXR_SILENT=1)
-include (CheckFunctionExists)
-include (CheckIncludeFiles)
-set(WITH_LSR_BINDINGS OFF)
-set(WITH_SINGLE_PRECISION ON)
-set(WITH_DOUBLE_PRECISION OFF)
-set(WITH_SIMD ON)
-set(HAVE_SINGLE_PRECISION "1")
-set(HAVE_DOUBLE_PRECISION "0")
-set(HAVE_AVFFT "0")
-set(HAVE_SIMD "1")
-check_function_exists (lrint HAVE_LRINT)
-if(NOT HAVE_LRINT)
- set(HAVE_LRINT "0")
-endif()
-check_include_files (fenv.h HAVE_FENV_H)
-if(NOT HAVE_FENV_H)
- set(HAVE_FENV_H "0")
-endif()
-set(WORDS_BIGENDIAN "0")
+add_definitions (${PROJECT_C_FLAGS} -DSOXR_LIB)
+
-configure_file (
- ${CMAKE_CURRENT_SOURCE_DIR}/../soxr-config.h.in
- ${CMAKE_CURRENT_BINARY_DIR}/soxr-config.h)
-include_directories (${CMAKE_CURRENT_BINARY_DIR})
# Libsoxr configuration:
-set (RDFT32 fft4g32.c)
-if (WITH_AVFFT AND AVCODEC_FOUND)
- set (RDFT32 avfft32.c)
- set (RDFT32S avfft32s.c)
+set (RDFT32 fft4g32)
+if (AVCODEC_FOUND)
+ set (RDFT32 avfft32)
+ set (RDFT32S avfft32s)
elseif (WITH_PFFFT)
- #set (RDFT32 pffft32.c)
- set (RDFT32S pffft32s.c)
-elseif (WITH_SIMD)
- set (RDFT32S fft4g32s.c)
+ #set (RDFT32 pffft32)
+ set (RDFT32S pffft32s)
+elseif (WITH_CR32S)
+ set (RDFT32S fft4g32s)
+ if (NOT WITH_CR32)
+ list (APPEND RDFT32S fft4g32)
+ endif ()
endif ()
-if (WITH_DOUBLE_PRECISION)
- set (DP_SOURCES rate64.c)
+set (SOURCES ${PROJECT_NAME}.c data-io)
+
+if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S)
+ list (APPEND SOURCES dbesi0 filter fft4g64 cr)
endif ()
-if (WITH_SINGLE_PRECISION)
- set (SP_SOURCES rate32.c ${RDFT32})
+if (WITH_CR32)
+ list (APPEND SOURCES cr32 ${RDFT32})
endif ()
-if (HAVE_SIMD)
- set (SIMD_SOURCES rate32s.c vr32s.c ${RDFT32S} simd.c)
- foreach (source ${SIMD_SOURCES})
- set_property (SOURCE ${source} PROPERTY COMPILE_FLAGS ${SIMD_C_FLAGS})
+if (WITH_CR64)
+ list (APPEND SOURCES cr64)
+endif ()
+
+if (WITH_VR32)
+ list (APPEND SOURCES vr32)
+endif ()
+
+if (WITH_CR32S)
+ foreach (source cr32s ${RDFT32S} util32s)
+ list (APPEND SOURCES ${source})
+ set_property (SOURCE ${source}
+ APPEND_STRING PROPERTY COMPILE_FLAGS ${SIMD32_C_FLAGS})
+ endforeach ()
+endif ()
+
+if (WITH_CR64S)
+ foreach (source cr64s pffft64s util64s)
+ list (APPEND SOURCES ${source})
+ set_property (SOURCE ${source}
+ APPEND_STRING PROPERTY COMPILE_FLAGS ${SIMD64_C_FLAGS})
endforeach ()
-else ()
- set (SIMD_SOURCES vr32.c)
endif ()
# Libsoxr:
-add_library (soxr ${LIB_TYPE} soxr.c data-io.c dbesi0.c filter.c fft4g64.c
- ${SP_SOURCES} ${DP_SOURCES} ${SIMD_SOURCES})
-set_target_properties (soxr PROPERTIES
+add_library (${PROJECT_NAME} ${LIB_TYPE} ${SOURCES})
+target_link_libraries (${PROJECT_NAME} PRIVATE ${LIBS} ${LIBM_LIBRARIES})
+set_target_properties (${PROJECT_NAME} PROPERTIES
VERSION "${SO_VERSION}"
SOVERSION ${SO_VERSION_MAJOR}
INSTALL_NAME_DIR ${LIB_INSTALL_DIR}
LINK_INTERFACE_LIBRARIES ""
- PUBLIC_HEADER "soxr.h")
+ PUBLIC_HEADER "${PROJECT_NAME}.h")
if (BUILD_FRAMEWORK)
- set_target_properties (soxr PROPERTIES FRAMEWORK TRUE)
+ set_target_properties (${PROJECT_NAME} PROPERTIES FRAMEWORK TRUE)
elseif (NOT WIN32)
-# set (TARGET_PCS ${CMAKE_CURRENT_BINARY_DIR}/soxr.pc)
-# configure_file (${CMAKE_CURRENT_SOURCE_DIR}/soxr.pc.in ${TARGET_PCS})
-# install (FILES ${CMAKE_CURRENT_BINARY_DIR}/soxr.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
+ set (TARGET_PCS ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc)
+ configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}.pc.in ${TARGET_PCS})
+ install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
endif ()
@@ -98,11 +97,11 @@ endif ()
# LSR bindings:
if (WITH_LSR_BINDINGS)
- set (LSR soxr-lsr)
+ set (LSR ${PROJECT_NAME}-lsr)
set (LSR_SO_VERSION 0.1.9)
set (LSR_SO_VERSION_MAJOR 0)
- add_library (${LSR} ${LIB_TYPE} lsr)
- target_link_libraries (${LSR} soxr)
+ add_library (${LSR} ${LIB_TYPE} ${LSR})
+ target_link_libraries (${LSR} ${PROJECT_NAME})
set_target_properties (${LSR} PROPERTIES
VERSION "${LSR_SO_VERSION}"
SOVERSION ${LSR_SO_VERSION_MAJOR}
@@ -112,9 +111,9 @@ if (WITH_LSR_BINDINGS)
if (BUILD_FRAMEWORK)
set_target_properties (${LSR} PROPERTIES FRAMEWORK TRUE)
elseif (NOT WIN32)
-# set (TARGET_PCS "${TARGET_PCS} ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc")
-# configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc)
-# install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
+ set (TARGET_PCS "${TARGET_PCS} ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc")
+ configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc)
+ install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
endif ()
endif ()
@@ -122,29 +121,9 @@ endif ()
# Installation (from build from source):
-#install (TARGETS soxr ${LSR}
-# FRAMEWORK DESTINATION ${FRAMEWORK_INSTALL_DIR}
-# LIBRARY DESTINATION ${LIB_INSTALL_DIR}
-# RUNTIME DESTINATION ${BIN_INSTALL_DIR}
-# ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
-# PUBLIC_HEADER DESTINATION ${INCLUDE_INSTALL_DIR})
-
-
-
-# Packaging (for unix-like distributions):
-
-#get_property (LIB1 TARGET soxr PROPERTY LOCATION)
-#if (BUILD_SHARED_LIBS)
-# set (LIB1 ${LIB1}.${SO_VERSION_MAJOR} ${LIB1}.${SO_VERSION})
-#endif ()
-#list (APPEND TARGET_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/soxr.h")
-#if (WITH_LSR_BINDINGS)
-# get_property (LIB2 TARGET ${LSR} PROPERTY LOCATION)
-# if (BUILD_SHARED_LIBS)
-# set (LIB2 ${LIB2}.${LSR_SO_VERSION_MAJOR} ${LIB2}.${LSR_SO_VERSION})
-# endif ()
-# list (APPEND TARGET_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.h")
-#endif ()
-#set (TARGET_LIBS ${LIB1} ${LIB2})
-#configure_file (${CMAKE_CURRENT_SOURCE_DIR}/libsoxr.src.in ${CMAKE_CURRENT_BINARY_DIR}/libsoxr.src)
-#configure_file (${CMAKE_CURRENT_SOURCE_DIR}/libsoxr-dev.src.in ${CMAKE_CURRENT_BINARY_DIR}/libsoxr-dev.src)
+install (TARGETS ${PROJECT_NAME} ${LSR}
+ FRAMEWORK DESTINATION ${FRAMEWORK_INSTALL_DIR}
+ LIBRARY DESTINATION ${LIB_INSTALL_DIR}
+ RUNTIME DESTINATION ${BIN_INSTALL_DIR}
+ ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
+ PUBLIC_HEADER DESTINATION ${INCLUDE_INSTALL_DIR})
diff --git a/soxr/src/aliases.h b/soxr/src/aliases.h
index eb42bdc..d1a392f 100644
--- a/soxr/src/aliases.h
+++ b/soxr/src/aliases.h
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#if defined SOXR_LIB
@@ -18,8 +18,10 @@
#define lsx_dfst_f _soxr_dfst_f
#define lsx_dfst _soxr_dfst
#define lsx_fir_to_phase _soxr_fir_to_phase
+#define lsx_f_resp _soxr_f_resp
#define lsx_init_fft_cache_f _soxr_init_fft_cache_f
#define lsx_init_fft_cache _soxr_init_fft_cache
+#define lsx_inv_f_resp _soxr_inv_f_resp
#define lsx_kaiser_beta _soxr_kaiser_beta
#define lsx_kaiser_params _soxr_kaiser_params
#define lsx_make_lpf _soxr_make_lpf
diff --git a/soxr/src/avfft32.c b/soxr/src/avfft32.c
index 5be13d2..c3096aa 100644
--- a/soxr/src/avfft32.c
+++ b/soxr/src/avfft32.c
@@ -1,27 +1,33 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#include <math.h>
+#include <stdlib.h>
#include <libavcodec/avfft.h>
#include "filter.h"
+#include "rdft_t.h"
static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);}
static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);}
static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;}
static int multiplier(void) {return 2;}
static void nothing(void) {}
+static int flags(void) {return 0;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32_cb[] = {
- (fn_t)forward_setup,
- (fn_t)backward_setup,
- (fn_t)av_rdft_end,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)_soxr_ordered_convolve_f,
- (fn_t)_soxr_ordered_partial_convolve_f,
- (fn_t)multiplier,
- (fn_t)nothing,
+rdft_cb_table _soxr_rdft32_cb = {
+ forward_setup,
+ backward_setup,
+ av_rdft_end,
+ rdft,
+ rdft,
+ rdft,
+ rdft,
+ _soxr_ordered_convolve_f,
+ _soxr_ordered_partial_convolve_f,
+ multiplier,
+ nothing,
+ malloc,
+ calloc,
+ free,
+ flags,
};
diff --git a/soxr/src/avfft32s.c b/soxr/src/avfft32s.c
index 75e485e..2944144 100644
--- a/soxr/src/avfft32s.c
+++ b/soxr/src/avfft32s.c
@@ -3,25 +3,30 @@
#include <math.h>
#include <libavcodec/avfft.h>
-#include "simd.h"
+#include "util32s.h"
+#include "rdft_t.h"
static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);}
static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);}
-static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;}
+static void rdft(int length, void * setup, void * H, void * scratch) {av_rdft_calc(setup, H); (void)length; (void)scratch;}
static int multiplier(void) {return 2;}
-static void nothing(void) {}
+static void nothing2(int u1, void *u2, void *u3, void *u4) {(void)u1; (void)u2; (void)u3; (void)u4;}
+static int flags(void) {return RDFT_IS_SIMD;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32s_cb[] = {
- (fn_t)forward_setup,
- (fn_t)backward_setup,
- (fn_t)av_rdft_end,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)_soxr_ordered_convolve_simd,
- (fn_t)_soxr_ordered_partial_convolve_simd,
- (fn_t)multiplier,
- (fn_t)nothing,
+rdft_cb_table _soxr_rdft32s_cb = {
+ forward_setup,
+ backward_setup,
+ av_rdft_end,
+ rdft,
+ rdft,
+ rdft,
+ rdft,
+ ORDERED_CONVOLVE_SIMD,
+ ORDERED_PARTIAL_CONVOLVE_SIMD,
+ multiplier,
+ nothing2,
+ SIMD_ALIGNED_MALLOC,
+ SIMD_ALIGNED_CALLOC,
+ SIMD_ALIGNED_FREE,
+ flags,
};
diff --git a/soxr/src/cb_t.h b/soxr/src/cb_t.h
new file mode 100644
index 0000000..d78ebd7
--- /dev/null
+++ b/soxr/src/cb_t.h
@@ -0,0 +1,26 @@
+/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+* Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+typedef struct {
+ void * (*input)(void *, void * samples, size_t n);
+ void (*process)(void *, size_t);
+ void const * (*output)(void *, void * samples, size_t * n);
+ void (*flush)(void *);
+ void (*close)(void *);
+ double (*delay)(void *);
+ void (*sizes)(size_t * shared, size_t * channel);
+ char const * (*create)(void * channel, void * shared, double io_ratio, void * q_spec, void * r_spec, double scale);
+ void (*set_io_ratio)(void *, double io_ratio, size_t len);
+ char const * (*id)(void);
+} control_block_t;
+
+#define resampler_input p->control_block.input
+#define resampler_process p->control_block.process
+#define resampler_output p->control_block.output
+#define resampler_flush p->control_block.flush
+#define resampler_close p->control_block.close
+#define resampler_delay p->control_block.delay
+#define resampler_sizes p->control_block.sizes
+#define resampler_create p->control_block.create
+#define resampler_set_io_ratio p->control_block.set_io_ratio
+#define resampler_id p->control_block.id
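+
+/* Each constant-rate core (e.g. cr-core.c) exports a control_block_t of its
+ * entry points; the resampler_* macros above dispatch through the copy held
+ * in a per-channel struct, assuming a pointer `p` to it is in scope at each
+ * use site. */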
diff --git a/soxr/src/ccrw2.h b/soxr/src/ccrw2.h
index b42185b..09331a4 100644
--- a/soxr/src/ccrw2.h
+++ b/soxr/src/ccrw2.h
@@ -3,8 +3,8 @@
/* Concurrent Control with "Readers" and "Writers", P.J. Courtois et al, 1971 */
-#if !defined ccrw2_included
-#define ccrw2_included
+#if !defined soxr_ccrw2_included
+#define soxr_ccrw2_included
#if defined SOXR_LIB
#include "internal.h"
diff --git a/soxr/src/cr-core.c b/soxr/src/cr-core.c
new file mode 100644
index 0000000..5355de3
--- /dev/null
+++ b/soxr/src/cr-core.c
@@ -0,0 +1,316 @@
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details.
+ *
+ * Constant-rate resampling engine-specific code. */
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "filter.h"
+
+#if defined SOXR_LIB
+ #include "internal.h"
+ #include "cr.h"
+ #if CORE_TYPE & CORE_DBL
+ typedef double sample_t;
+ #if CORE_TYPE & CORE_SIMD_DFT
+ #define RDFT_CB _soxr_rdft64s_cb
+ #else
+ #define RDFT_CB _soxr_rdft64_cb
+ #endif
+ #else
+ typedef float sample_t;
+ #if CORE_TYPE & CORE_SIMD_DFT
+ #define RDFT_CB _soxr_rdft32s_cb
+ #else
+ #define RDFT_CB _soxr_rdft32_cb
+ #endif
+ #endif
+
+ #if CORE_TYPE & (CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT)
+ #if CORE_TYPE & CORE_DBL
+ #include "util64s.h"
+ #include "dev64s.h"
+ #else
+ #include "util32s.h"
+ #include "dev32s.h"
+ #endif
+ #endif
+
+ extern rdft_cb_table RDFT_CB;
+#else
+ #define RDFT_CB 0
+#endif
+
+
+
+static void cubic_stage_fn(stage_t * p, fifo_t * output_fifo)
+{
+ sample_t const * input = stage_read_p(p);
+ int num_in = min(stage_occupancy(p), p->input_size);
+ int i, max_num_out = 1 + (int)(num_in * p->out_in_ratio);
+ sample_t * output = fifo_reserve(output_fifo, max_num_out);
+
+ for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
+ sample_t const * s = input + p->at.integer;
+ double x = p->at.fraction * (1 / MULT32);
+ double b = .5*(s[1]+s[-1])-*s, a = (1/6.)*(s[2]-s[1]+s[-1]-*s-4*b);
+ double c = s[1]-*s-a-b;
+ output[i] = (sample_t)(p->mult * (((a*x + b)*x + c)*x + *s));
+ }
+ assert(max_num_out - i >= 0);
+ fifo_trim_by(output_fifo, max_num_out - i);
+ fifo_read(&p->fifo, p->at.integer, NULL);
+ p->at.integer = 0;
+}
+
+
+
+#if defined __AVX__
+ #define DEFINED_AVX 1
+#else
+ #define DEFINED_AVX 0
+#endif
+
+#if defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86
+ #define DEFINED_X86 1
+#else
+ #define DEFINED_X86 0
+#endif
+
+#if defined __arm__
+ #define DEFINED_ARM 1
+#else
+ #define DEFINED_ARM 0
+#endif
+
+
+
+#if CORE_TYPE & CORE_DBL
+ #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_AVX)
+ #define SIMD_SSE 0
+#else
+ #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_X86)
+ #define SIMD_AVX 0
+#endif
+
+#define SIMD_NEON ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_ARM)
+
+
+
+#include "half-coefs.h"
+
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+#define FUNCTION_H h7
+#define CONVOLVE ____ __ _
+#include "half-fir.h"
+#endif
+
+#define FUNCTION_H h8
+#define CONVOLVE ____ ____
+#include "half-fir.h"
+
+#define FUNCTION_H h9
+#define CONVOLVE ____ ____ _
+#include "half-fir.h"
+
+#if CORE_TYPE & CORE_DBL
+ #define FUNCTION_H h10
+ #define CONVOLVE ____ ____ __
+ #include "half-fir.h"
+
+ #define FUNCTION_H h11
+ #define CONVOLVE ____ ____ __ _
+ #include "half-fir.h"
+
+ #define FUNCTION_H h12
+ #define CONVOLVE ____ ____ ____
+ #include "half-fir.h"
+
+ #define FUNCTION_H h13
+ #define CONVOLVE ____ ____ ____ _
+ #include "half-fir.h"
+#endif
+
+static half_fir_info_t const half_firs[] = {
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+ { 7, half_fir_coefs_7 , h7 , 0 , 120.65f},
+#endif
+ { 8, half_fir_coefs_8 , h8 , 0 , 136.51f},
+ { 9, half_fir_coefs_9 , h9 , 0 , 152.32f},
+#if CORE_TYPE & CORE_DBL
+ {10, half_fir_coefs_10, h10, 0 , 168.08f},
+ {11, half_fir_coefs_11, h11, 0 , 183.79f},
+ {12, half_fir_coefs_12, h12, 0 , 199.46f},
+ {13, half_fir_coefs_13, h13, 0 , 215.12f},
+#endif
+};
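+
+/* find_half_fir() (in cr.c) selects the first entry above whose attenuation
+ * figure (the final column, in dB) meets the requested attenuation. */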
+
+#undef SIMD_AVX
+#undef SIMD_NEON
+#undef SIMD_SSE
+
+
+
+#if CORE_TYPE & CORE_DBL
+ #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_AVX)
+ #define SIMD_SSE 0
+#else
+ #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_X86)
+ #define SIMD_AVX 0
+#endif
+
+#define SIMD_NEON ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_ARM)
+
+
+
+#define COEFS (sample_t * __restrict)p->shared->poly_fir_coefs
+#define VAR_LENGTH p->n
+#define VAR_CONVOLVE(n) while (j < (n)) _
+#define VAR_POLY_PHASE_BITS p->phase_bits
+
+
+
+#define FUNCTION vpoly0
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir0.h"
+
+#define FUNCTION vpoly1
+#define COEF_INTERP 1
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+#define FUNCTION vpoly2
+#define COEF_INTERP 2
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+#define FUNCTION vpoly3
+#define COEF_INTERP 3
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+
+
+#if !(CORE_TYPE & CORE_SIMD_POLY)
+
+#define poly_fir_convolve_U100 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
+#define FUNCTION U100_0
+#define FIR_LENGTH U100_l
+#define CONVOLVE(n) poly_fir_convolve_U100
+#include "poly-fir0.h"
+
+#define u100_l 11
+#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _
+#define FUNCTION u100_0
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir0.h"
+
+#define FUNCTION u100_1
+#define COEF_INTERP 1
+#define PHASE_BITS 8
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir.h"
+
+#define FUNCTION u100_2
+#define COEF_INTERP 2
+#define PHASE_BITS 6
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir.h"
+
+#endif
+
+#define u100_1_b 8
+#define u100_2_b 6
+
+
+
+static poly_fir_t const poly_firs[] = {
+ {-1, {{0, vpoly0}, { 7.2f, vpoly1}, {5.0f, vpoly2}}},
+ {-1, {{0, vpoly0}, { 9.4f, vpoly1}, {6.7f, vpoly2}}},
+ {-1, {{0, vpoly0}, {12.4f, vpoly1}, {7.8f, vpoly2}}},
+ {-1, {{0, vpoly0}, {13.6f, vpoly1}, {9.3f, vpoly2}}},
+ {-1, {{0, vpoly0}, {10.5f, vpoly2}, {8.4f, vpoly3}}},
+ {-1, {{0, vpoly0}, {11.85f,vpoly2}, {9.0f, vpoly3}}},
+
+ {-1, {{0, vpoly0}, { 8.0f, vpoly1}, {5.3f, vpoly2}}},
+ {-1, {{0, vpoly0}, { 8.6f, vpoly1}, {5.7f, vpoly2}}},
+ {-1, {{0, vpoly0}, {10.6f, vpoly1}, {6.75f,vpoly2}}},
+ {-1, {{0, vpoly0}, {12.6f, vpoly1}, {8.6f, vpoly2}}},
+ {-1, {{0, vpoly0}, { 9.6f, vpoly2}, {7.6f, vpoly3}}},
+ {-1, {{0, vpoly0}, {11.4f, vpoly2}, {8.65f,vpoly3}}},
+
+#if CORE_TYPE & CORE_SIMD_POLY
+ {10.62f, {{0, vpoly0}, {0, 0}, {0, 0}}},
+ {-1, {{0, vpoly0}, {u100_1_b, vpoly1}, {u100_2_b, vpoly2}}},
+#else
+ {10.62f, {{U100_l, U100_0}, {0, 0}, {0, 0}}},
+ {11.28f, {{u100_l, u100_0}, {u100_1_b, u100_1}, {u100_2_b, u100_2}}},
+#endif
+ {-1, {{0, vpoly0}, { 9, vpoly1}, { 6, vpoly2}}},
+ {-1, {{0, vpoly0}, { 11, vpoly1}, { 7, vpoly2}}},
+ {-1, {{0, vpoly0}, { 13, vpoly1}, { 8, vpoly2}}},
+ {-1, {{0, vpoly0}, { 10, vpoly2}, { 8, vpoly3}}},
+ {-1, {{0, vpoly0}, { 12, vpoly2}, { 9, vpoly3}}},
+};
+
+
+
+static cr_core_t const cr_core = {
+
+#if CORE_TYPE & CORE_SIMD_POLY
+ {SIMD_ALIGNED_MALLOC, SIMD_ALIGNED_CALLOC, SIMD_ALIGNED_FREE},
+#else
+ {malloc, calloc, free},
+#endif
+ half_firs, array_length(half_firs),
+ 0, 0,
+ cubic_stage_fn,
+ poly_firs, &RDFT_CB
+};
+
+
+
+#if defined SOXR_LIB
+
+#include "soxr.h"
+
+static char const * rate_create(void * channel, void * shared, double io_ratio,
+ void * q_spec, void * r_spec, double scale)
+{
+ return _soxr_init(channel, shared, io_ratio, q_spec, r_spec, scale,
+ &cr_core, CORE_TYPE);
+}
+
+
+
+static char const * id(void) {return CORE_STR;}
+
+#include "cb_t.h"
+
+control_block_t RATE_CB = {
+ _soxr_input,
+ _soxr_process,
+ _soxr_output,
+ _soxr_flush,
+ _soxr_close,
+ _soxr_delay,
+ _soxr_sizes,
+ rate_create,
+ 0,
+ id,
+};
+
+#endif
diff --git a/soxr/src/cr.c b/soxr/src/cr.c
new file mode 100644
index 0000000..eabe700
--- /dev/null
+++ b/soxr/src/cr.c
@@ -0,0 +1,600 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details.
+ *
+ * Constant-rate resampling common code. */
+
+#include <assert.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "filter.h"
+
+#if defined SOXR_LIB
+ #include "internal.h"
+ #define STATIC
+#endif
+
+#include "cr.h"
+
+#define num_coefs4 ((core_flags&CORE_SIMD_POLY)? ((num_coefs+3)&~3) : num_coefs)
+
+#define coef_coef(C,T,x) \
+ C((T*)result, interp_order, num_coefs4, j, x, num_coefs4 - 1 - i)
+
+#define STORE(C,T) { \
+ if (interp_order > 2) coef_coef(C,T,3) = (T)d; \
+ if (interp_order > 1) coef_coef(C,T,2) = (T)c; \
+ if (interp_order > 0) coef_coef(C,T,1) = (T)b; \
+ coef_coef(C,T,0) = (T)f0;}
+
+static real * prepare_poly_fir_coefs(double const * coefs, int num_coefs,
+ int num_phases, int interp_order, double multiplier,
+ core_flags_t core_flags, alloc_t const * mem)
+{
+ int i, j, length = num_coefs4 * num_phases * (interp_order + 1);
+ real * result = mem->calloc(1,(size_t)length << LOG2_SIZEOF_REAL(core_flags));
+ double fm1 = coefs[0], f1 = 0, f2 = 0;
+
+ for (i = num_coefs - 1; i >= 0; --i)
+ for (j = num_phases - 1; j >= 0; --j) {
+ double f0 = fm1, b = 0, c = 0, d = 0; /* = 0 to kill compiler warning */
+ int pos = i * num_phases + j - 1;
+ fm1 = pos > 0 ? coefs[pos - 1] * multiplier : 0;
+ switch (interp_order) {
+ case 1: b = f1 - f0; break;
+ case 2: b = f1 - (.5 * (f2+f0) - f1) - f0; c = .5 * (f2+f0) - f1; break;
+ case 3: c=.5*(f1+fm1)-f0;d=(1/6.)*(f2-f1+fm1-f0-4*c);b=f1-f0-d-c; break;
+ default: assert(!interp_order);
+ }
+ switch (core_flags & 3) {
+ case 0: if (WITH_CR32 ) STORE(coef , float ); break;
+ case 1: if (WITH_CR64 ) STORE(coef , double); break;
+ case 2: if (WITH_CR32S) STORE(coef4, float ); break;
+ default:if (WITH_CR64S) STORE(coef4, double); break;
+ }
+ f2 = f1, f1 = f0;
+ }
+ return result;
+}
+
+#undef STORE
+#undef coef_coef
+
+#define IS_FLOAT32 (WITH_CR32 || WITH_CR32S) && \
+ (!(WITH_CR64 || WITH_CR64S) || sizeof_real == sizeof(float))
+#define WITH_FLOAT64 WITH_CR64 || WITH_CR64S
+
+static void dft_stage_fn(stage_t * p, fifo_t * output_fifo)
+{
+ real * output, * dft_out;
+ int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
+ rate_shared_t const * s = p->shared;
+ dft_filter_t const * f = &s->dft_filter[p->dft_filter_num];
+ int const overlap = f->num_taps - 1;
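+  /* Overlap-save fast convolution: a block of dft_length samples is
+   * transformed, multiplied by the filter's frequency response (f->coefs),
+   * and transformed back; the final num_taps-1 output samples overlap the
+   * next block and are trimmed away. */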
+
+ if (p->at.integer + p->L * num_in >= f->dft_length) {
+ rdft_cb_table const * const RDFT_CB = p->rdft_cb;
+ size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(p->core_flags);
+ div_t divd = div(f->dft_length - overlap - p->at.integer + p->L - 1, p->L);
+ real const * input = fifo_read_ptr(&p->fifo);
+ fifo_read(&p->fifo, divd.quot, NULL);
+ num_in -= divd.quot;
+
+ output = fifo_reserve(output_fifo, f->dft_length);
+ dft_out = (p->core_flags & CORE_SIMD_DFT)? p->dft_out : output;
+
+ if (lsx_is_power_of_2(p->L)) { /* F-domain */
+ int portion = f->dft_length / p->L;
+ memcpy(dft_out, input, (unsigned)portion * sizeof_real);
+ rdft_oforward(portion, f->dft_forward_setup, dft_out, p->dft_scratch);
+ if (IS_FLOAT32) {
+#define dft_out ((float *)dft_out)
+ for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
+ dft_out[i] = dft_out[(portion << 1) - i],
+ dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
+ dft_out[portion] = dft_out[1];
+ dft_out[portion + 1] = 0;
+ dft_out[1] = dft_out[0];
+#undef dft_out
+ }
+ else if (WITH_FLOAT64) {
+#define dft_out ((double *)dft_out)
+ for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
+ dft_out[i] = dft_out[(portion << 1) - i],
+ dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
+ dft_out[portion] = dft_out[1];
+ dft_out[portion + 1] = 0;
+ dft_out[1] = dft_out[0];
+#undef dft_out
+ }
+
+ for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) {
+ memcpy((char *)dft_out + (size_t)i * sizeof_real, dft_out, (size_t)portion * sizeof_real);
+ if (IS_FLOAT32)
+ #define dft_out ((float *)dft_out)
+ dft_out[i + 1] = 0;
+ #undef dft_out
+ else if (WITH_FLOAT64)
+ #define dft_out ((double *)dft_out)
+ dft_out[i + 1] = 0;
+ #undef dft_out
+ }
+
+ if (p->step.integer > 0) {
+ rdft_reorder_back(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
+ }
+ } else {
+ if (p->L == 1)
+ memcpy(dft_out, input, (size_t)f->dft_length * sizeof_real);
+ else {
+
+ memset(dft_out, 0, (size_t)f->dft_length * sizeof_real);
+ if (IS_FLOAT32)
+ for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
+ ((float *)dft_out)[i] = ((float *)input)[j];
+ else if (WITH_FLOAT64)
+ for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
+ ((double *)dft_out)[i] = ((double *)input)[j];
+ p->at.integer = p->L - 1 - divd.rem;
+ }
+ if (p->step.integer > 0)
+ rdft_forward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
+ else
+ rdft_oforward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
+ }
+
+ if (p->step.integer > 0) {
+ rdft_convolve(f->dft_length, f->dft_backward_setup, dft_out, f->coefs);
+ rdft_backward(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
+ if ((p->core_flags & CORE_SIMD_DFT) && p->step.integer == 1)
+ memcpy(output, dft_out, (size_t)f->dft_length * sizeof_real);
+ if (p->step.integer != 1) {
+ if (IS_FLOAT32)
+ for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
+ i += p->step.integer)
+ ((float *)output)[j] = ((float *)dft_out)[i];
+ else if (WITH_FLOAT64)
+ for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
+ i += p->step.integer)
+ ((double *)output)[j] = ((double *)dft_out)[i];
+ p->remM = i - (f->dft_length - overlap);
+ fifo_trim_by(output_fifo, f->dft_length - j);
+ }
+ else fifo_trim_by(output_fifo, overlap);
+ }
+ else { /* F-domain */
+ int m = -p->step.integer;
+ rdft_convolve_portion(f->dft_length >> m, dft_out, f->coefs);
+ rdft_obackward(f->dft_length >> m, f->dft_backward_setup, dft_out, p->dft_scratch);
+ if (p->core_flags & CORE_SIMD_DFT)
+ memcpy(output, dft_out, (size_t)(f->dft_length >> m) * sizeof_real);
+ fifo_trim_by(output_fifo, (((1 << m) - 1) * f->dft_length + overlap) >>m);
+ }
+ (rdft_cb_table const *)RDFT_CB;
+ }
+ p->input_size = (f->dft_length - p->at.integer + p->L - 1) / p->L;
+}
+
+/* Set to 4 x nearest power of 2 or half of that */
+/* if danger of causing too many cache misses. */
+static int set_dft_length(int num_taps, int min, int large)
+{
+ double d = log((double)num_taps) / log(2.);
+ return 1 << range_limit((int)(d + 2.77), min, max((int)(d + 1.77), large));
+}
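+
+/* E.g. num_taps=1000: d = log2(1000) ~ 9.97, so dft_length = 1 << 12 = 4096
+ * (4 x the nearest power of 2), unless clamped by the min/large limits. */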
+
+static void dft_stage_init(
+ unsigned instance, double Fp, double Fs, double Fn, double att,
+ double phase_response, stage_t * p, int L, int M, double * multiplier,
+ unsigned min_dft_size, unsigned large_dft_size, core_flags_t core_flags,
+ rdft_cb_table const * rdft_table)
+{
+ rdft_cb_table const * const RDFT_CB = rdft_table;
+ dft_filter_t * f = &p->shared->dft_filter[instance];
+ int num_taps = 0, dft_length = f->dft_length, i, offset;
+ bool f_domain_m = abs(3-M) == 1 && Fs <= 1;
+ size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(core_flags);
+
+ if (!dft_length) {
+ int k = phase_response == 50 && lsx_is_power_of_2(L) && Fn == L? L << 1 : 4;
+ double m, * h = lsx_design_lpf(Fp, Fs, Fn, att, &num_taps, -k, -1.);
+
+ if (phase_response != 50)
+ lsx_fir_to_phase(&h, &num_taps, &f->post_peak, phase_response);
+ else f->post_peak = num_taps / 2;
+
+ dft_length = set_dft_length(num_taps, (int)min_dft_size, (int)large_dft_size);
+ f->coefs = rdft_calloc((size_t)dft_length, sizeof_real);
+ offset = dft_length - num_taps + 1;
+ m = (1. / dft_length) * rdft_multiplier() * L * *multiplier;
+ if (IS_FLOAT32) for (i = 0; i < num_taps; ++i)
+ ((float *)f->coefs)[(i + offset) & (dft_length - 1)] =(float)(h[i] * m);
+ else if (WITH_FLOAT64) for (i = 0; i < num_taps; ++i)
+ ((double *)f->coefs)[(i + offset) & (dft_length - 1)] = h[i] * m;
+ free(h);
+ }
+
+ if (rdft_flags() & RDFT_IS_SIMD)
+ p->dft_out = rdft_malloc(sizeof_real * (size_t)dft_length);
+ if (rdft_flags() & RDFT_NEEDS_SCRATCH)
+ p->dft_scratch = rdft_malloc(2 * sizeof_real * (size_t)dft_length);
+
+ if (!f->dft_length) {
+ void * coef_setup = rdft_forward_setup(dft_length);
+ int Lp = lsx_is_power_of_2(L)? L : 1;
+ int Mp = f_domain_m? M : 1;
+ f->dft_forward_setup = rdft_forward_setup(dft_length / Lp);
+ f->dft_backward_setup = rdft_backward_setup(dft_length / Mp);
+ if (Mp == 1)
+ rdft_forward(dft_length, coef_setup, f->coefs, p->dft_scratch);
+ else
+ rdft_oforward(dft_length, coef_setup, f->coefs, p->dft_scratch);
+ rdft_delete_setup(coef_setup);
+ f->num_taps = num_taps;
+ f->dft_length = dft_length;
+ lsx_debug("fir_len=%i dft_length=%i Fp=%g Fs=%g Fn=%g att=%g %i/%i",
+ num_taps, dft_length, Fp, Fs, Fn, att, L, M);
+ }
+ *multiplier = 1;
+ p->out_in_ratio = (double)L / M;
+ p->core_flags = core_flags;
+ p->rdft_cb = rdft_table;
+ p->fn = dft_stage_fn;
+ p->preload = f->post_peak / L;
+ p->at.integer = f->post_peak % L;
+ p->L = L;
+ p->step.integer = f_domain_m? -M/2 : M;
+ p->dft_filter_num = instance;
+ p->block_len = f->dft_length - (f->num_taps - 1);
+ p->phase0 = p->at.integer / p->L;
+ p->input_size = (f->dft_length - p->at.integer + p->L - 1) / p->L;
+}
+
+static struct half_fir_info const * find_half_fir(
+ struct half_fir_info const * firs, size_t len, double att)
+{
+ size_t i;
+ for (i = 0; i + 1 < len && att > firs[i].att; ++i);
+ return &firs[i];
+}
+
+#define have_pre_stage (preM * preL != 1)
+#define have_arb_stage (arbM * arbL != 1)
+#define have_post_stage (postM * postL != 1)
+
+#include "soxr.h"
+
+STATIC char const * _soxr_init(
+ rate_t * const p, /* Per audio channel. */
+ rate_shared_t * const shared, /* By channels undergoing same rate change. */
+ double const io_ratio, /* Input rate divided by output rate. */
+ soxr_quality_spec_t const * const q_spec,
+ soxr_runtime_spec_t const * const r_spec,
+ double multiplier, /* Linear gain to apply during conversion. */
+ cr_core_t const * const core,
+ core_flags_t const core_flags)
+{
+ size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(core_flags);
+ double const tolerance = 1 + 1e-5;
+
+ double bits = q_spec->precision;
+ rolloff_t const rolloff = (rolloff_t)(q_spec->flags & 3);
+ int interpolator = (int)(r_spec->flags & 3) - 1;
+ double const Fp0 = q_spec->passband_end, Fs0 = q_spec->stopband_begin;
+ double const phase_response = q_spec->phase_response, tbw0 = Fs0-Fp0;
+
+ bool const maintain_3dB_pt = !!(q_spec->flags & SOXR_MAINTAIN_3DB_PT);
+ double tbw_tighten = 1, alpha;
+ #define tighten(x) (Fs0-(Fs0-(x))*tbw_tighten)
+
+ double arbM = io_ratio, Fn1, Fp1 = Fp0, Fs1 = Fs0, bits1 = min(bits,33);
+ double att = (bits1 + 1) * linear_to_dB(2.), attArb = att; /* +1: pass+stop */
+ int preL = 1, preM = 1, shr = 0, arbL = 1, postL = 1, postM = 1;
+ bool upsample=false, rational=false, iOpt=!(r_spec->flags&SOXR_NOSMALLINTOPT);
+ bool lq_bits= (q_spec->flags & SOXR_PROMOTE_TO_LQ)? bits <= 16 : bits == 16;
+ bool lq_Fp0 = (q_spec->flags & SOXR_PROMOTE_TO_LQ)? Fp0<=lq_bw0 : Fp0==lq_bw0;
+ int n = 0, i, mode = lq_bits && rolloff == rolloff_medium? io_ratio > 1 ||
+ phase_response != 50 || !lq_Fp0 || Fs0 != 1 : ((int)ceil(bits1) - 6) / 4;
+ struct half_fir_info const * half_fir_info;
+ stage_t * s;
+
+ if (io_ratio < 1 && Fs0 - 1 > 1 - Fp0 / tolerance)
+ return "imaging greater than rolloff";
+ if (.002 / tolerance > tbw0 || tbw0 > .5 * tolerance)
+ return "transition bandwidth not in [0.2,50] % of nyquist";
+ if (.5 / tolerance > Fp0 || Fs0 > 1.5 * tolerance)
+ return "transition band not within [50,150] % of nyquist";
+ if (bits!=0 && (15 > bits || bits > 33))
+ return "precision not in [15,33] bits";
+ if (io_ratio <= 0)
+ return "resampling factor not positive";
+ if (0 > phase_response || phase_response > 100)
+ return "phase response not in [0=min-phase,100=max-phase] %";
+
+ p->core = core;
+ p->io_ratio = io_ratio;
+ if (bits!=0) while (!n++) { /* Determine stages: */
+ int try, L, M, x, maxL = interpolator > 0? 1 : mode? 2048 :
+ (int)ceil(r_spec->coef_size_kbytes * 1000. / (U100_l * (int)sizeof_real));
+ double d, epsilon = 0, frac;
+ upsample = arbM < 1;
+  for (i = (int)(.5 * arbM), shr = 0; i >>= 1; arbM *= .5, ++shr); /* Peel off octaves of 2:1 decimation. */
+ preM = upsample || (arbM > 1.5 && arbM < 2);
+ postM = 1 + (arbM > 1 && preM), arbM /= postM;
+ preL = 1 + (!preM && arbM < 2) + (upsample && mode), arbM *= preL;
+ if ((frac = arbM - (int)arbM)!=0)
+ epsilon = fabs(floor(frac * MULT32 + .5) / (frac * MULT32) - 1);
+ for (i = 1, rational = frac==0; i <= maxL && !rational; ++i) {
+ d = frac * i, try = (int)(d + .5);
+ if ((rational = fabs(try / d - 1) <= epsilon)) { /* No long doubles! */
+ if (try == i)
+ arbM = ceil(arbM), shr += x = arbM > 3, arbM /= 1 + x;
+ else arbM = i * (int)arbM + try, arbL = i;
+ }
+ }
+ L = preL * arbL, M = (int)(arbM * postM), x = (L|M)&1, L >>= !x, M >>= !x;
+ if (iOpt && postL == 1 && (d = preL * arbL / arbM) > 4 && d != 5) {
+ for (postL = 4, i = (int)(d / 16); (i >>= 1) && postL < 256; postL <<= 1);
+ arbM = arbM * postL / arbL / preL, arbL = 1, n = 0;
+ } else if (rational && (max(L, M) < 3 + 2 * iOpt || L * M < 6 * iOpt))
+ preL = L, preM = M, arbM = arbL = postM = 1;
+ if (!mode && (!rational || !n))
+ ++mode, n = 0;
+ }
+
+ p->num_stages = shr + have_pre_stage + have_arb_stage + have_post_stage;
+ if (!p->num_stages && multiplier != 1) {
+ bits = arbL = 0; /* Use cubic_stage in this case. */
+ ++p->num_stages;
+ }
+ p->stages = calloc((size_t)p->num_stages + 1, sizeof(*p->stages));
+ if (!p->stages)
+ return "out of memory";
+ for (i = 0; i < p->num_stages; ++i) {
+ p->stages[i].num = i;
+ p->stages[i].shared = shared;
+ p->stages[i].input_size = 8192;
+ }
+ p->stages[0].is_input = true;
+
+ alpha = postM / (io_ratio * (postL << 0));
+
+ if ((n = p->num_stages) > 1) { /* Att. budget: */
+ if (have_arb_stage)
+ att += linear_to_dB(2.), attArb = att, --n;
+ att += linear_to_dB((double)n);
+ }
+
+ half_fir_info = find_half_fir(core->half_firs, core->half_firs_len, att);
+ for (i = 0, s = p->stages; i < shr; ++i, ++s) {
+ s->fn = half_fir_info->fn;
+ s->coefs = half_fir_info->coefs;
+ s->n = half_fir_info->num_coefs;
+ s->pre_post = 4 * s->n;
+ s->preload = s->pre = s->pre_post >> 1;
+ }
+
+ if (have_pre_stage) {
+ if (maintain_3dB_pt && have_post_stage) { /* Trans. bands overlapping. */
+ double x = tbw0 * lsx_inv_f_resp(-3., att);
+ x = -lsx_f_resp(x / (max(2 * alpha - Fs0, alpha) - Fp0), att);
+ if (x > .035) {
+ tbw_tighten = ((4.3074e-3 - 3.9121e-4 * x) * x - .040009) * x + 1.0014;
+ lsx_debug("tbw_tighten=%g (%gdB)", tbw_tighten, x);
+ }
+ }
+ Fn1 = preM? max(preL, preM) : arbM / arbL;
+ dft_stage_init(0, tighten(Fp1), Fs1, Fn1, att, phase_response, s++, preL,
+ max(preM, 1), &multiplier, r_spec->log2_min_dft_size,
+ r_spec->log2_large_dft_size, core_flags, core->rdft_cb);
+ Fp1 /= Fn1, Fs1 /= Fn1;
+ }
+
+ if (bits==0 && have_arb_stage) { /* `Quick' cubic arb stage: */
+ s->fn = core->cubic_stage_fn;
+ s->mult = multiplier, multiplier = 1;
+ s->step.whole = (int64_t)(arbM * MULT32 + .5);
+ s->pre_post = max(3, s->step.integer);
+ s->preload = s->pre = 1;
+ s->out_in_ratio = MULT32 / (double)s->step.whole;
+ }
+ else if (have_arb_stage) { /* Higher quality arb stage: */
+ static const float rolloffs[] = {-.01f, -.3f, 0, -.103f};
+ poly_fir_t const * f = &core->poly_firs[6*(upsample+!!preM)+mode-!upsample];
+ int order, num_coefs = (int)f->interp[0].scalar, phase_bits, phases;
+ size_t coefs_size;
+ double at, Fp = Fp1, Fs, Fn, mult = upsample? 1 : arbM / arbL;
+ poly_fir1_t const * f1;
+
+ if (!upsample && preM)
+ Fn = 2 * mult, Fs = 3 + fabs(Fs1 - 1);
+ else Fn = 1, Fs = 2 - (mode? Fp1 + (Fs1 - Fp1) * .7 : Fs1);
+
+ if (mode)
+ Fp = Fs - (Fs - Fp) / (1 - lsx_inv_f_resp(rolloffs[rolloff], attArb));
+
+ i = (interpolator < 0? !rational : max(interpolator, !rational)) - 1;
+ do {
+ f1 = &f->interp[++i];
+ assert(f1->fn);
+ if (i)
+ arbM /= arbL, arbL = 1, rational = false;
+ phase_bits = (int)ceil(f1->scalar - log(mult)/log(2.));
+ phases = !rational? (1 << phase_bits) : arbL;
+ if (f->interp[0].scalar==0) {
+ int phases0 = max(phases, 19), n0 = 0;
+ lsx_design_lpf(Fp, Fs, -Fn, attArb, &n0, phases0, f->beta);
+ num_coefs = n0 / phases0 + 1, num_coefs += num_coefs & !preM;
+ }
+ if ((num_coefs & 1) && rational && (arbL & 1))
+ phases <<= 1, arbL <<= 1, arbM *= 2;
+ at = arbL * (s->phase0 = .5 * (num_coefs & 1));
+ order = i + (i && mode > 4);
+ coefs_size = (size_t)(num_coefs4 * phases * (order+1)) * sizeof_real;
+ } while (interpolator < 0 && i < 2 && f->interp[i+1].fn &&
+ coefs_size / 1000 > r_spec->coef_size_kbytes);
+
+ if (!s->shared->poly_fir_coefs) {
+ int num_taps = num_coefs * phases - 1;
+ double * coefs = lsx_design_lpf(
+ Fp, Fs, Fn, attArb, &num_taps, phases, f->beta);
+ s->shared->poly_fir_coefs = prepare_poly_fir_coefs(
+ coefs, num_coefs, phases, order, multiplier, core_flags, &core->mem);
+ lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%.3gk",
+ num_coefs, phases, order, (double)coefs_size / 1000.);
+ free(coefs);
+ }
+ multiplier = 1;
+ s->fn = f1->fn;
+ s->pre_post = num_coefs4 - 1;
+ s->preload = ((num_coefs - 1) >> 1) + (num_coefs4 - num_coefs);
+ s->n = num_coefs4;
+ s->phase_bits = phase_bits;
+ s->L = arbL;
+ s->use_hi_prec_clock =
+ mode>1 && (q_spec->flags & SOXR_HI_PREC_CLOCK) && !rational;
+#if WITH_FLOAT_STD_PREC_CLOCK
+ if (order && !s->use_hi_prec_clock) {
+ s->at.flt = at;
+ s->step.flt = arbM;
+ s->out_in_ratio = (double)(arbL / s->step.flt);
+ } else
+#endif
+ {
+ s->at.whole = (int64_t)(at * MULT32 + .5);
+#if WITH_HI_PREC_CLOCK
+ if (s->use_hi_prec_clock) {
+ double M = arbM * MULT32;
+ s->at.fix.ls.parts.ms = 0x80000000ul;
+ s->step.whole = (int64_t)M;
+ M -= (double)s->step.whole;
+ M *= MULT32 * MULT32;
+ s->step.fix.ls.all = (uint64_t)M;
+ } else
+#endif
+ s->step.whole = (int64_t)(arbM * MULT32 + .5);
+ s->out_in_ratio = MULT32 * arbL / (double)s->step.whole;
+ }
+ ++s;
+ }
+
+ if (have_post_stage)
+ dft_stage_init(1, tighten(Fp0 / (upsample? alpha : 1)), upsample? max(2 -
+ Fs0 / alpha, 1) : Fs0, (double)max(postL, postM), att, phase_response,
+ s++, postL, postM, &multiplier, r_spec->log2_min_dft_size,
+ r_spec->log2_large_dft_size, core_flags, core->rdft_cb);
+
+ lsx_debug("%g: >>%i %i/%i %i/%g %i/%i (%x)", 1/io_ratio,
+ shr, preL, preM, arbL, arbM, postL, postM, core_flags);
+
+ for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) {
+ fifo_create(&s->fifo, (int)sizeof_real);
+ memset(fifo_reserve(&s->fifo, s->preload), 0,
+ sizeof_real * (size_t)s->preload);
+ lsx_debug_more("%5i|%-5i preload=%i remL=%i",
+ s->pre, s->pre_post-s->pre, s->preload, s->at.integer);
+ }
+  fifo_create(&s->fifo, (int)sizeof_real); /* Output fifo, drained by _soxr_output. */
+ return 0;
+}
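To make the `Determine stages' loop concrete, here is a minimal standalone
sketch of just its rational-fraction search (assumptions: no pre/post
scaling and maxL fixed at 2048, so the printed ratio is the overall one).
For 44.1 kHz -> 48 kHz the fraction locks at i = 160, giving the familiar
exact ratio 147/160:

#include <math.h>
#include <stdio.h>

#define MULT32 (65536. * 65536.)

int main(void)
{
  double arbM = 44100. / 48000, frac = arbM - (int)arbM; /* io_ratio < 1 */
  double epsilon = fabs(floor(frac * MULT32 + .5) / (frac * MULT32) - 1);
  int i, try = 0, arbL = 0;
  for (i = 1; i <= 2048 && !arbL; ++i) {
    double d = frac * i;
    try = (int)(d + .5);
    if (fabs(try / d - 1) <= epsilon) /* frac*i integral to within FP error */
      arbL = i;
  }
  printf("M/L = %i/%i\n", (int)arbM * arbL + try, arbL); /* 147/160 */
  return 0;
}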
+
+static bool stage_process(stage_t * stage, bool flushing)
+{
+ fifo_t * fifo = &stage->fifo;
+ bool done = false;
+ int want;
+ while (!done && (want = stage->input_size - fifo_occupancy(fifo)) > 0) {
+ if (stage->is_input) {
+ if (flushing)
+ memset(fifo_reserve(fifo, want), 0, fifo->item_size * (size_t)want);
+ else done = true;
+ }
+ else done = stage_process(stage - 1, flushing);
+ }
+ stage->fn(stage, &stage[1].fifo);
+ return done && fifo_occupancy(fifo) < stage->input_size;
+}
+
+STATIC void _soxr_process(void * P, size_t olen)
+{
+ rate_t *p = P;
+ int const n = p->flushing? min(-(int)p->samples_out, (int)olen) : (int)olen;
+ stage_t * stage = &p->stages[p->num_stages];
+ fifo_t * fifo = &stage->fifo;
+ bool done = false;
+ while (!done && fifo_occupancy(fifo) < (int)n)
+ done = stage->is_input || stage_process(stage - 1, p->flushing);
+}
+
+STATIC void * _soxr_input(void * P, void * samples, size_t n)
+{
+ rate_t *p = P;
+ if (p->flushing)
+ return 0;
+ p->samples_in += (int64_t)n;
+ return fifo_write(&p->stages[0].fifo, (int)n, samples);
+}
+
+STATIC void const * _soxr_output(void * P, void * samples, size_t * n0)
+{
+ rate_t *p = P;
+ fifo_t * fifo = &p->stages[p->num_stages].fifo;
+ int n = p->flushing? min(-(int)p->samples_out, (int)*n0) : (int)*n0;
+ p->samples_out += n = min(n, fifo_occupancy(fifo));
+ return fifo_read(fifo, (int)(*n0 = (size_t)n), samples);
+}
+
+STATIC void _soxr_flush(void * P)
+{
+ rate_t *p = P;
+ if (p->flushing) return;
+ p->samples_out -= (int64_t)((double)p->samples_in / p->io_ratio + .5);
+ p->samples_in = 0;
+ p->flushing = true;
+}
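The flush bookkeeping above is plain counter arithmetic: samples_out goes
negative by exactly the number of output samples the accumulated input still
owes, and _soxr_process/_soxr_output clamp their requests to -samples_out.
A worked sketch of the numbers:

#include <stdio.h>

int main(void)
{
  double io_ratio = 2.0;                /* input rate / output rate (2:1)  */
  long long samples_in = 1000, samples_out = 480; /* 480 already delivered */
  samples_out -= (long long)((double)samples_in / io_ratio + .5);
  printf("output still owed: %lld\n", -samples_out); /* 500 - 480 = 20    */
  return 0;
}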
+
+STATIC void _soxr_close(void * P)
+{
+ rate_t *p = P;
+ if (p->stages) {
+ rdft_cb_table const * const RDFT_CB = p->core->rdft_cb;
+ rate_shared_t * shared = p->stages[0].shared;
+ int i;
+
+ for (i = 0; i <= p->num_stages; ++i) {
+ stage_t * s = &p->stages[i];
+ rdft_free(s->dft_scratch);
+ rdft_free(s->dft_out);
+ fifo_delete(&s->fifo);
+ }
+ if (shared) {
+ for (i = 0; i < 2; ++i) {
+ dft_filter_t * f= &shared->dft_filter[i];
+ rdft_free(f->coefs);
+ rdft_delete_setup(f->dft_forward_setup);
+ rdft_delete_setup(f->dft_backward_setup);
+ }
+ p->core->mem.free(shared->poly_fir_coefs);
+ memset(shared, 0, sizeof(*shared));
+ }
+ free(p->stages);
+    (rdft_cb_table const *)RDFT_CB; /* No-op: keeps RDFT_CB nominally used when the rdft_* macros ignore it. */
+ }
+}
+
+#if defined SOXR_LIB
+STATIC double _soxr_delay(void * P)
+{
+ rate_t *p = P;
+ return (double)p->samples_in / p->io_ratio - (double)p->samples_out;
+}
+
+STATIC void _soxr_sizes(size_t * shared, size_t * channel)
+{
+ *shared = sizeof(rate_shared_t);
+ *channel = sizeof(rate_t);
+}
+#endif
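Taken together, the entry points above follow a simple push/pull pattern.
A hypothetical per-block driver (a sketch only, assuming the declarations
from cr.h are in scope; real callers reach these through the public
soxr_process, and the sample type depends on the core):

static void resample_block(rate_t * p, float * in, size_t n_in,
    float * out, size_t * n_out) /* in: space available; out: samples got */
{
  _soxr_input(p, in, n_in);    /* queue input in stage 0's fifo            */
  _soxr_process(p, *n_out);    /* pull demand back through the stage chain */
  _soxr_output(p, out, n_out); /* drain the last stage's fifo              */
}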
diff --git a/soxr/src/cr.h b/soxr/src/cr.h
new file mode 100644
index 0000000..880eb1d
--- /dev/null
+++ b/soxr/src/cr.h
@@ -0,0 +1,178 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_cr_included
+#define soxr_cr_included
+
+#define FIFO_SIZE_T int
+#include "fifo.h"
+
+typedef void real; /* float or double */
+struct stage;
+typedef void (* stage_fn_t)(struct stage * input, fifo_t * output);
+typedef struct half_fir_info {
+ int num_coefs;
+ real const * coefs;
+ stage_fn_t fn, dfn;
+ float att;
+} half_fir_info_t;
+typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t;
+typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t;
+
+#define U100_l 42
+#define MULT32 (65536. * 65536.)
+
+/* Conceptually: coef_p is &coefs[num_phases][fir_len][interp_order+1]: */
+#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) (coef_p)[\
+ (fir_len) * ((interp_order) + 1) * (phase_num) + \
+ ((interp_order) + 1) * (fir_coef_num) + \
+ ((interp_order) - (coef_interp_num))]
+
+/* Conceptually: coef_p is &coefs[num_phases][fir_len/4][interp_order+1][4]: */
+#define coef4(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) (coef_p)[\
+ (fir_len) * ((interp_order) + 1) * (phase_num) + \
+ ((interp_order) + 1) * ((fir_coef_num) & ~3) + \
+ 4 * ((interp_order) - (coef_interp_num)) + \
+ ((fir_coef_num) & 3)]
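The difference between the two layouts is easiest to see numerically.  A
standalone check (IDX/IDX4 are just local copies of the macros above) with
interp_order = 1, fir_len = 8, phase 0, printing the flat index of each
tap's interp-0 coefficient: coef keeps one tap's coefficients adjacent,
while coef4 regroups taps in blocks of four so one aligned SIMD load fetches
four like coefficients (taps 0-3 land at indices 4-7):

#include <stdio.h>

#define IDX(order,len,phase,j,k) \
  ((len)*((order)+1)*(phase) + ((order)+1)*(k) + ((order)-(j)))
#define IDX4(order,len,phase,j,k) \
  ((len)*((order)+1)*(phase) + ((order)+1)*((k)&~3) + 4*((order)-(j)) + ((k)&3))

int main(void)
{
  int k;
  for (k = 0; k < 8; ++k)
    printf("tap %d: coef=%2d coef4=%2d\n", k, IDX(1,8,0,0,k), IDX4(1,8,0,0,k));
  return 0;
}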
+
+typedef union { /* Int64 in parts */
+ #if HAVE_BIGENDIAN
+ struct {int32_t ms; uint32_t ls;} parts;
+ #else
+ struct {uint32_t ls; int32_t ms;} parts;
+ #endif
+ int64_t all;
+} int64p_t;
+
+typedef union { /* Uint64 in parts */
+ #if HAVE_BIGENDIAN
+ struct {uint32_t ms, ls;} parts;
+ #else
+ struct {uint32_t ls, ms;} parts;
+ #endif
+ uint64_t all;
+} uint64p_t;
+
+typedef struct {
+ int dft_length, num_taps, post_peak;
+ void * dft_forward_setup, * dft_backward_setup;
+ real * coefs;
+} dft_filter_t;
+
+typedef struct { /* So generated filter coefs may be shared between channels */
+ real * poly_fir_coefs;
+ dft_filter_t dft_filter[2];
+} rate_shared_t;
+
+typedef double float_step_t; /* Or long double or __float128. */
+
+typedef union { /* Fixed point arithmetic */
+ struct {uint64p_t ls; int64p_t ms;} fix; /* Hi-prec has ~96 bits. */
+ float_step_t flt;
+} step_t;
+
+#define integer fix.ms.parts.ms
+#define fraction fix.ms.parts.ls
+#define whole fix.ms.all
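The at/step pair accessed through these macros behaves as a 32.32 fixed-
point sample clock: `integer' (the high word of `whole') indexes the current
input sample and the low word is the fractional phase.  A standalone sketch
of four output ticks at a step of 1.5 input samples per output sample:

#include <stdint.h>
#include <stdio.h>

#define MULT32 (65536. * 65536.)

int main(void)
{
  int64_t at = 0, step = (int64_t)(1.5 * MULT32 + .5);
  int i;
  for (i = 0; i < 4; ++i, at += step)  /* input index 0,1,3,4; phase 0,.5 */
    printf("out %d <- input[%d] + phase %.2f\n",
        i, (int)(at >> 32), (double)(uint32_t)at / MULT32);
  return 0;
}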
+
+#define CORE_DBL 1
+#define CORE_SIMD_POLY 2
+#define CORE_SIMD_HALF 4
+#define CORE_SIMD_DFT 8
+#define LOG2_SIZEOF_REAL(core_flags) (2 + ((core_flags) & 1))
+
+typedef int core_flags_t;
+
+#if defined SOXR_LIB
+#include "rdft_t.h"
+#else
+typedef void fn_t;
+#endif
+
+typedef struct stage {
+ int num;
+
+ /* Common to all stage types: */
+ core_flags_t core_flags;
+ stage_fn_t fn;
+ fifo_t fifo;
+ int pre; /* Number of past samples to store */
+ int pre_post; /* pre + number of future samples to store */
+ int preload; /* Number of zero samples to pre-load the fifo */
+ double out_in_ratio; /* For buffer management. */
+ int input_size;
+ bool is_input;
+
+ /* For a stage with variable (run-time generated) filter coefs: */
+ rdft_cb_table const * rdft_cb;
+ rate_shared_t * shared;
+ unsigned dft_filter_num; /* Which, if any, of the 2 DFT filters to use */
+ real * dft_scratch;
+ float * dft_out;
+ real const * coefs;
+
+ /* For a stage with variable L/M: */
+ step_t at, step;
+ bool use_hi_prec_clock;
+ int L, remM;
+ int n, phase_bits, block_len;
+ double mult, phase0;
+} stage_t;
+
+#define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
+#define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
+
+#define lq_bw0 (1385/2048.) /* ~.67625, FP exact. */
+
+typedef enum {rolloff_small, rolloff_medium, rolloff_none} rolloff_t;
+
+typedef struct {
+ void * (* alloc)(size_t);
+ void * (* calloc)(size_t, size_t);
+ void (* free)(void *);
+} alloc_t;
+
+typedef struct {
+ alloc_t mem;
+ half_fir_info_t const * half_firs;
+ size_t half_firs_len;
+ half_fir_info_t const * doub_firs;
+ size_t doub_firs_len;
+ stage_fn_t cubic_stage_fn;
+ poly_fir_t const * poly_firs;
+ rdft_cb_table * rdft_cb;
+} cr_core_t;
+
+typedef struct rate rate_t;
+struct rate {
+ cr_core_t const * core;
+ double io_ratio;
+ int64_t samples_in, samples_out;
+ int num_stages, flushing;
+ stage_t * stages;
+};
+
+#if defined SOXR_LIB
+
+#include "soxr.h"
+
+char const * _soxr_init(
+ rate_t * const p, /* Per audio channel. */
+  rate_shared_t * const shared, /* Between channels (undergoing same rate change). */
+ double const io_ratio, /* Input rate divided by output rate. */
+ soxr_quality_spec_t const * const q_spec,
+ soxr_runtime_spec_t const * const r_spec,
+  double multiplier, /* Linear gain to apply during conversion. */
+ cr_core_t const * const core,
+ core_flags_t const);
+
+void _soxr_process(void * p, size_t olen);
+void * _soxr_input(void * p, void * samples, size_t n);
+void const * _soxr_output(void * p, void * samples, size_t * n0);
+void _soxr_flush(void * p);
+void _soxr_close(void * p);
+double _soxr_delay(void * p);
+void _soxr_sizes(size_t * shared, size_t * channel);
+#endif
+
+#endif
diff --git a/soxr/src/cr32.c b/soxr/src/cr32.c
new file mode 100644
index 0000000..b9eb264
--- /dev/null
+++ b/soxr/src/cr32.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define RATE_CB _soxr_rate32_cb
+#define CORE_STR "cr32"
+
+#define CORE_TYPE 0
+#include "cr-core.c"
diff --git a/soxr/src/cr32s.c b/soxr/src/cr32s.c
new file mode 100644
index 0000000..5de2a43
--- /dev/null
+++ b/soxr/src/cr32s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define RATE_CB _soxr_rate32s_cb
+#define CORE_STR "cr32s"
+
+#define CORE_TYPE (CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT)
+#include "cr-core.c"
diff --git a/soxr/src/cr64.c b/soxr/src/cr64.c
new file mode 100644
index 0000000..518cdd7
--- /dev/null
+++ b/soxr/src/cr64.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define RATE_CB _soxr_rate64_cb
+#define CORE_STR "cr64"
+
+#define CORE_TYPE CORE_DBL
+#include "cr-core.c"
diff --git a/soxr/src/cr64s.c b/soxr/src/cr64s.c
new file mode 100644
index 0000000..5dcd6f1
--- /dev/null
+++ b/soxr/src/cr64s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define RATE_CB _soxr_rate64s_cb
+#define CORE_STR "cr64s"
+
+#define CORE_TYPE (CORE_DBL|CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT)
+#include "cr-core.c"
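These four shims compile cr-core.c once per core variant: cr32 (float),
cr32s (float + SIMD), cr64 (double) and cr64s (double + SIMD).  The CORE_DBL
bit doubles the sample width, which is why cr.h can recover it from the
flags alone via LOG2_SIZEOF_REAL.  A standalone check of that arithmetic:

#include <stdio.h>

#define CORE_DBL 1                    /* as in cr.h */
#define LOG2_SIZEOF_REAL(core_flags) (2 + ((core_flags) & 1))

int main(void)
{
  printf("float cores: %d bytes\n", 1 << LOG2_SIZEOF_REAL(0));         /* 4 */
  printf("double cores: %d bytes\n", 1 << LOG2_SIZEOF_REAL(CORE_DBL)); /* 8 */
  return 0;
}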
diff --git a/soxr/src/data-io.c b/soxr/src/data-io.c
index 1cd8e7f..fb61675 100644
--- a/soxr/src/data-io.c
+++ b/soxr/src/data-io.c
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#include
@@ -14,8 +14,8 @@
unsigned i; \
size_t j; \
T const * src = *src0; \
- if (ch > 1) \
- for (j = 0; j < n; ++j) for (i = 0; i < ch; ++i) dest[i][j] = (DEINTERLEAVE_TO)*src++; \
+ if (ch > 1) for (j = 0; j < n; ++j) \
+ for (i = 0; i < ch; ++i) dest[i][j] = (DEINTERLEAVE_TO)*src++; \
else if (flag) memcpy(dest[0], src, n * sizeof(T)), src = &src[n]; \
else for (j = 0; j < n; dest[0][j++] = (DEINTERLEAVE_TO)*src++); \
*src0 = src; \
@@ -23,7 +23,7 @@
-#if HAVE_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
void _soxr_deinterleave(double * * dest, /* Round/clipping not needed here */
soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch)
{
@@ -40,7 +40,7 @@ void _soxr_deinterleave(double * * dest, /* Round/clipping not needed here */
-#if HAVE_SINGLE_PRECISION
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch)
{
@@ -60,35 +60,6 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#include "rint.h"
-#if HAVE_FENV_H
- #include <fenv.h>
- #define fe_test_invalid() fetestexcept(FE_INVALID)
- #define fe_clear_invalid() feclearexcept(FE_INVALID)
-#elif defined _MSC_VER
- #define FE_INVALID 1
- #if defined _WIN64
- #include <float.h>
- #define fe_test_invalid() (_statusfp() & _SW_INVALID)
- #define fe_clear_invalid _clearfp /* FIXME clears all */
- #else
- static __inline int fe_test_invalid()
- {
- short status_word;
- __asm fnstsw status_word
- return status_word & FE_INVALID;
- }
-
- static __inline int fe_clear_invalid()
- {
- int16_t status[14];
- __asm fnstenv status
- status[2] &= ~FE_INVALID;
- __asm fldenv status
- return 0;
- }
- #endif
-#endif
-
#if defined FE_INVALID && defined FPU_RINT32 && defined __STDC_VERSION__
@@ -97,13 +68,13 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#endif
#endif
-#if HAVE_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
#define FLOATX double
#define LSX_RINT_CLIP_2 lsx_rint32_clip_2
#define LSX_RINT_CLIP lsx_rint32_clip
#define RINT_CLIP rint32_clip
-#define RINT rint32
+#define RINT rint32D
#if defined FPU_RINT32
#define FPU_RINT
#endif
@@ -114,7 +85,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#define LSX_RINT_CLIP_2 lsx_rint16_clip_2
#define LSX_RINT_CLIP lsx_rint16_clip
#define RINT_CLIP rint16_clip
-#define RINT rint16
+#define RINT rint16D
#if defined FPU_RINT16
#define FPU_RINT
#endif
@@ -125,7 +96,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_dither
#define LSX_RINT_CLIP lsx_rint16_clip_dither
#define RINT_CLIP rint16_clip_dither
-#define RINT rint16
+#define RINT rint16D
#if defined FPU_RINT16
#define FPU_RINT
#endif
@@ -139,13 +110,13 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
-#if HAVE_SINGLE_PRECISION
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
#define FLOATX float
#define LSX_RINT_CLIP_2 lsx_rint32_clip_2_f
#define LSX_RINT_CLIP lsx_rint32_clip_f
#define RINT_CLIP rint32_clip_f
-#define RINT rint32
+#define RINT rint32F
#if defined FPU_RINT32
#define FPU_RINT
#endif
@@ -156,7 +127,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_f
#define LSX_RINT_CLIP lsx_rint16_clip_f
#define RINT_CLIP rint16_clip_f
-#define RINT rint16
+#define RINT rint16F
#if defined FPU_RINT16
#define FPU_RINT
#endif
@@ -167,7 +138,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_dither_f
#define LSX_RINT_CLIP lsx_rint16_clip_dither_f
#define RINT_CLIP rint16_clip_dither_f
-#define RINT rint16
+#define RINT rint16D
#if defined FPU_RINT16
#define FPU_RINT
#endif
@@ -199,7 +170,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
return 0; \
} while (0)
-#if HAVE_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
size_t /* clips */ _soxr_interleave(soxr_datatype_t data_type, void * * dest0,
double const * const * src, size_t n, unsigned ch, unsigned long * seed)
{
@@ -225,7 +196,7 @@ size_t /* clips */ _soxr_interleave(soxr_datatype_t data_type, void * * dest0,
}
#endif
-#if HAVE_SINGLE_PRECISION
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
size_t /* clips */ _soxr_interleave_f(soxr_datatype_t data_type, void * * dest0,
float const * const * src, size_t n, unsigned ch, unsigned long * seed)
{
diff --git a/soxr/src/dev32s.h b/soxr/src/dev32s.h
new file mode 100644
index 0000000..7edae86
--- /dev/null
+++ b/soxr/src/dev32s.h
@@ -0,0 +1,54 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_dev32s_included
+#define soxr_dev32s_included
+
+#if defined __GNUC__
+ #define SIMD_INLINE(T) static __inline T __attribute__((always_inline))
+ #define vAlign __attribute__((aligned (16)))
+#elif defined _MSC_VER
+ #define SIMD_INLINE(T) static __forceinline T
+ #define vAlign __declspec(align(16))
+#endif
+
+#if defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86
+
+#include <xmmintrin.h>
+
+#define vZero() _mm_setzero_ps()
+#define vSet1(a) _mm_set_ss(a)
+#define vMul(a,b) _mm_mul_ps(a,b)
+#define vAdd(a,b) _mm_add_ps(a,b)
+#define vMac(a,b,c) vAdd(vMul(a,b),c)
+#define vLds(a) _mm_set1_ps(a)
+#define vLd(a) _mm_load_ps(a)
+#define vLdu(a) _mm_loadu_ps(a)
+
+typedef __m128 v4_t;
+
+SIMD_INLINE(void) vStorSum(float * a, v4_t b) {
+ v4_t t = vAdd(_mm_movehl_ps(b, b), b);
+ _mm_store_ss(a, vAdd(t, _mm_shuffle_ps(t,t,1)));}
+
+#elif defined __arm__
+
+#include <arm_neon.h>
+
+#define vZero() vdupq_n_f32(0)
+#define vMul(a,b) vmulq_f32(a,b)
+#define vAdd(a,b) vaddq_f32(a,b)
+#define vMac(a,b,c) vmlaq_f32(c,a,b)
+#define vLds(a) vld1q_dup_f32(&(a))
+#define vLd(a) vld1q_f32(a)
+#define vLdu(a) vld1q_f32(a)
+
+typedef float32x4_t v4_t;
+
+SIMD_INLINE(void) vStorSum(float * a, v4_t b) {
+ float32x2_t t = vadd_f32(vget_high_f32(b), vget_low_f32(b));
+ *a = vget_lane_f32(vpadd_f32(t, t), 0);}
+
+#endif
+
+#endif
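Both vStorSum variants store the horizontal sum of the four float lanes (the
AVX version in dev64s.h below does the same for four doubles).  A scalar
reference, for checking only, not part of the library:

static void vStorSum_scalar(float * a, float const b[4])
{
  *a = (b[0] + b[2]) + (b[1] + b[3]); /* same pairing as the SSE/NEON code */
}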
diff --git a/soxr/src/dev64s.h b/soxr/src/dev64s.h
new file mode 100644
index 0000000..4672210
--- /dev/null
+++ b/soxr/src/dev64s.h
@@ -0,0 +1,42 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_dev64s_included
+#define soxr_dev64s_included
+
+#if defined __GNUC__
+ #define SIMD_INLINE(T) static __inline T __attribute__((always_inline))
+ #define vAlign __attribute__((aligned (32)))
+#elif defined _MSC_VER
+ #define SIMD_INLINE(T) static __forceinline T
+ #define vAlign __declspec(align(32))
+#else
+ #define SIMD_INLINE(T) static __inline T
+#endif
+
+#if defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86
+
+#include <immintrin.h>
+
+#if defined __AVX__
+
+#define vZero() _mm256_setzero_pd()
+#define vSet1(a) _mm256_set_pd(0,0,0,a)
+#define vMul(a,b) _mm256_mul_pd(a,b)
+#define vAdd(a,b) _mm256_add_pd(a,b)
+#define vMac(a,b,c) vAdd(vMul(a,b),c) /* Note: gcc -mfma will `fuse' these */
+#define vLds(a) _mm256_set1_pd(a)
+#define vLd(a) _mm256_load_pd(a)
+#define vLdu(a) _mm256_loadu_pd(a)
+
+typedef __m256d v4_t;
+
+SIMD_INLINE(void) vStorSum(double * a, v4_t b) {
+ b = _mm256_hadd_pd(b, _mm256_permute2f128_pd(b,b,1));
+ _mm_store_sd(a, _mm256_castpd256_pd128(_mm256_hadd_pd(b,b)));}
+
+#endif
+
+#endif
+
+#endif
diff --git a/soxr/src/fft4g.c b/soxr/src/fft4g.c
index 5fae8a6..cf6293a 100644
--- a/soxr/src/fft4g.c
+++ b/soxr/src/fft4g.c
@@ -282,22 +282,16 @@ Appendix :
*/
-#include <math.h>
+#include "math-wrap.h"
#include "fft4g.h"
#ifdef FFT4G_FLOAT
#define double float
#define one_half 0.5f
-#if defined _MSC_VER
- #define sin (float)sin
- #define cos (float)cos
- #define atan (float)atan
-#else
- #define sin sinf
- #define cos cosf
- #define atan atanf
-#endif
+ #define sin(x) sinf(x)
+ #define cos(x) cosf(x)
+ #define atan(x) atanf(x)
#define cdft lsx_cdft_f
#define rdft lsx_rdft_f
@@ -818,7 +812,7 @@ static void bitrv2(int n, int *ip0, double *a)
static void bitrv2conj(int n, int *ip0, double *a)
{
- int j, j1, k, k1, l, m, m2, ip[256];
+ int j, j1, k, k1, l, m, m2, ip[512];
double xr, xi, yr, yi;
(void)ip0;
diff --git a/soxr/src/fft4g32.c b/soxr/src/fft4g32.c
index 8741394..4e4912e 100644
--- a/soxr/src/fft4g32.c
+++ b/soxr/src/fft4g32.c
@@ -1,27 +1,38 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
+#include <stdlib.h>
#include "filter.h"
#define FFT4G_FLOAT
#include "fft4g.c"
+#include "soxr-config.h"
-static void * null(void) {return 0;}
-static void forward (int length, void * setup, double * H) {lsx_safe_rdft_f(length, 1, H); (void)setup;}
-static void backward(int length, void * setup, double * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;}
+#if WITH_CR32
+#include "rdft_t.h"
+static void * null(int u1) {(void)u1; return 0;}
+static void forward (int length, void * setup, void * H, void * scratch) {lsx_safe_rdft_f(length, 1, H); (void)setup; (void)scratch;}
+static void backward(int length, void * setup, void * H, void * scratch) {lsx_safe_rdft_f(length, -1, H); (void)setup; (void)scratch;}
static int multiplier(void) {return 2;}
-static void nothing(void) {}
+static void nothing(void *u1) {(void)u1;}
+static void nothing2(int u1, void *u2, void *u3, void *u4) {(void)u1; (void)u2; (void)u3; (void)u4;}
+static int flags(void) {return 0;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32_cb[] = {
- (fn_t)null,
- (fn_t)null,
- (fn_t)nothing,
- (fn_t)forward,
- (fn_t)forward,
- (fn_t)backward,
- (fn_t)backward,
- (fn_t)_soxr_ordered_convolve_f,
- (fn_t)_soxr_ordered_partial_convolve_f,
- (fn_t)multiplier,
- (fn_t)nothing,
+rdft_cb_table _soxr_rdft32_cb = {
+ null,
+ null,
+ nothing,
+ forward,
+ forward,
+ backward,
+ backward,
+ _soxr_ordered_convolve_f,
+ _soxr_ordered_partial_convolve_f,
+ multiplier,
+ nothing2,
+ malloc,
+ calloc,
+ free,
+ flags,
};
+
+#endif
diff --git a/soxr/src/fft4g32s.c b/soxr/src/fft4g32s.c
index 4a95a7d..c7f3772 100644
--- a/soxr/src/fft4g32s.c
+++ b/soxr/src/fft4g32s.c
@@ -2,25 +2,30 @@
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#include "filter.h"
-#include "simd.h"
+#include "util32s.h"
+#include "rdft_t.h"
static void * null(void) {return 0;}
static void nothing(void) {}
static void forward (int length, void * setup, float * H) {lsx_safe_rdft_f(length, 1, H); (void)setup;}
static void backward(int length, void * setup, float * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;}
static int multiplier(void) {return 2;}
+static int flags(void) {return RDFT_IS_SIMD;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32s_cb[] = {
- (fn_t)null,
- (fn_t)null,
- (fn_t)nothing,
- (fn_t)forward,
- (fn_t)forward,
- (fn_t)backward,
- (fn_t)backward,
- (fn_t)_soxr_ordered_convolve_simd,
- (fn_t)_soxr_ordered_partial_convolve_simd,
- (fn_t)multiplier,
- (fn_t)nothing,
+rdft_cb_table _soxr_rdft32s_cb = {
+ null,
+ null,
+ nothing,
+ forward,
+ forward,
+ backward,
+ backward,
+ ORDERED_CONVOLVE_SIMD,
+ ORDERED_PARTIAL_CONVOLVE_SIMD,
+ multiplier,
+ nothing,
+ SIMD_ALIGNED_MALLOC,
+ SIMD_ALIGNED_CALLOC,
+ SIMD_ALIGNED_FREE,
+ flags,
};
diff --git a/soxr/src/fft4g64.c b/soxr/src/fft4g64.c
index 48eaddd..fb87281 100644
--- a/soxr/src/fft4g64.c
+++ b/soxr/src/fft4g64.c
@@ -1,29 +1,36 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
+#include <stdlib.h>
#include "filter.h"
#include "fft4g.c"
#include "soxr-config.h"
-#if HAVE_DOUBLE_PRECISION
-static void * null(void) {return 0;}
-static void nothing(void) {}
-static void forward (int length, void * setup, double * H) {lsx_safe_rdft(length, 1, H); (void)setup;}
-static void backward(int length, void * setup, double * H) {lsx_safe_rdft(length, -1, H); (void)setup;}
+#if WITH_CR64
+#include "rdft_t.h"
+static void * null(int u1) {(void)u1; return 0;}
+static void nothing(void *u1) {(void)u1;}
+static void nothing2(int u1, void *u2, void *u3, void *u4) {(void)u1; (void)u2; (void)u3; (void)u4;}
+static void forward (int length, void * setup, void * H, void * scratch) {lsx_safe_rdft(length, 1, H); (void)setup; (void)scratch;}
+static void backward(int length, void * setup, void * H, void * scratch) {lsx_safe_rdft(length, -1, H); (void)setup; (void)scratch;}
static int multiplier(void) {return 2;}
+static int flags(void) {return 0;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft64_cb[] = {
- (fn_t)null,
- (fn_t)null,
- (fn_t)nothing,
- (fn_t)forward,
- (fn_t)forward,
- (fn_t)backward,
- (fn_t)backward,
- (fn_t)_soxr_ordered_convolve,
- (fn_t)_soxr_ordered_partial_convolve,
- (fn_t)multiplier,
- (fn_t)nothing,
+rdft_cb_table _soxr_rdft64_cb = {
+ null,
+ null,
+ nothing,
+ forward,
+ forward,
+ backward,
+ backward,
+ _soxr_ordered_convolve,
+ _soxr_ordered_partial_convolve,
+ multiplier,
+ nothing2,
+ malloc,
+ calloc,
+ free,
+ flags,
};
#endif
diff --git a/soxr/src/fifo.h b/soxr/src/fifo.h
index b2bda43..33af9fe 100644
--- a/soxr/src/fifo.h
+++ b/soxr/src/fifo.h
@@ -1,14 +1,15 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-#ifndef fifo_included
-#define fifo_included
+#ifndef soxr_fifo_included
+#define soxr_fifo_included
#if !defined FIFO_SIZE_T
#define FIFO_SIZE_T size_t
#endif
#if !defined FIFO_REALLOC
+#include <stdlib.h>
#define FIFO_REALLOC(a,b,c) realloc(a,b)
#undef FIFO_FREE
#define FIFO_FREE free
diff --git a/soxr/src/filter.c b/soxr/src/filter.c
index ca146d2..019d24d 100644
--- a/soxr/src/filter.c
+++ b/soxr/src/filter.c
@@ -1,12 +1,9 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#include "filter.h"
-#include <math.h>
-#if !defined M_PI
-#define M_PI 3.14159265358979323846
-#endif
+#include "math-wrap.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
@@ -14,7 +11,7 @@
#include "fft4g.h"
#include "ccrw2.h"
-#if 1 || HAVE_DOUBLE_PRECISION /* Always need this, for lsx_fir_to_phase. */
+#if 1 || WITH_CR64 || WITH_CR64S /* Always need this, for lsx_fir_to_phase. */
#define DFT_FLOAT double
#define DONE_WITH_FFT_CACHE done_with_fft_cache
#define FFT_CACHE_CCRW fft_cache_ccrw
@@ -31,7 +28,7 @@
#include "fft4g_cache.h"
#endif
-#if HAVE_SINGLE_PRECISION && !HAVE_AVFFT
+#if (WITH_CR32 && !AVCODEC_FOUND) || (WITH_CR32S && !AVCODEC_FOUND && !WITH_PFFFT)
#define DFT_FLOAT float
#define DONE_WITH_FFT_CACHE done_with_fft_cache_f
#define FFT_CACHE_CCRW fft_cache_ccrw_f
@@ -48,14 +45,14 @@
#include "fft4g_cache.h"
#endif
-#if HAVE_DOUBLE_PRECISION || !SOXR_LIB
+#if WITH_CR64 || WITH_CR64S || !SOXR_LIB
#define DFT_FLOAT double
#define ORDERED_CONVOLVE lsx_ordered_convolve
#define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve
#include "rdft.h"
#endif
-#if HAVE_SINGLE_PRECISION
+#if WITH_CR32
#define DFT_FLOAT float
#define ORDERED_CONVOLVE lsx_ordered_convolve_f
#define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve_f
@@ -96,12 +93,12 @@ double * lsx_make_lpf(
double * h = malloc((size_t)num_taps * sizeof(*h));
double mult = scale / lsx_bessel_I_0(beta), mult1 = 1 / (.5 * m + rho);
assert(Fc >= 0 && Fc <= 1);
- lsx_debug("make_lpf(n=%i Fc=%.7g β=%g ρ=%g scale=%g)",
+ lsx_debug("make_lpf(n=%i Fc=%.7g beta=%g rho=%g scale=%g)",
num_taps, Fc, beta, rho, scale);
if (h) for (i = 0; i <= m / 2; ++i) {
double z = i - .5 * m, x = z * M_PI, y = z * mult1;
- h[i] = x? sin(Fc * x) / x : Fc;
+ h[i] = x!=0? sin(Fc * x) / x : Fc;
h[i] *= lsx_bessel_I_0(beta * sqrt(1 - y * y)) * mult;
if (m - i != i)
h[m - i] = h[i];
@@ -123,12 +120,15 @@ double * lsx_design_lpf(
double Fn, /* Nyquist freq; e.g. 0.5, 1, PI */
double att, /* Stop-band attenuation in dB */
int * num_taps, /* 0: value will be estimated */
- int k, /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */
+ int k, /* >0: number of phases; <0: num_taps = 1 (mod -k) */
double beta) /* <0: value will be estimated */
{
int n = *num_taps, phases = max(k, 1), modulo = max(-k, 1);
double tr_bw, Fc, rho = phases == 1? .5 : att < 120? .63 : .75;
+ lsx_debug_more("./sinctest %-12.7g %-12.7g %g 0 %-5g %i %i 50 %g %g -4 >1",
+ Fp, Fs, Fn, att, *num_taps, k, beta, rho);
+
Fp /= fabs(Fn), Fs /= fabs(Fn); /* Normalise to Fn = 1 */
tr_bw = .5 * (Fs - Fp); /* Transition band-width: 6dB to stop points */
tr_bw /= phases, Fs /= phases;
@@ -145,7 +145,7 @@ double * lsx_design_lpf(
static double safe_log(double x)
{
assert(x >= 0);
- if (x)
+ if (x!=0)
return log(x);
lsx_debug("log(0)");
return -26;
@@ -222,7 +222,7 @@ void lsx_fir_to_phase(double * * h, int * len, int * post_len, double phase)
while (peak && fabs(work[peak-1]) > fabs(work[peak]) && work[peak-1] * work[peak] > 0)
--peak;
- if (!phase1)
+ if (phase1==0)
begin = 0;
else if (phase1 == 1)
begin = peak - *len / 2;
@@ -243,3 +243,35 @@ void lsx_fir_to_phase(double * * h, int * len, int * post_len, double phase)
work[imp_peak], *len, *post_len, 100 - 100. * *post_len / (*len - 1));
free(pi_wraps), free(work);
}
+
+#define F_x(F,expr) static double F(double x) {return expr;}
+F_x(sinePhi, ((2.0517e-07*x-1.1303e-04)*x+.023154)*x+.55924 )
+F_x(sinePsi, ((9.0667e-08*x-5.6114e-05)*x+.013658)*x+1.0977 )
+F_x(sinePow, log(.5)/log(sin(x*.5)) )
+#define dB_to_linear(x) exp((x) * (M_LN10 * 0.05))
+
+double lsx_f_resp(double t, double a)
+{
+ double x;
+ if (t > (a <= 160? .8 : .82)) {
+ double a1 = a+15;
+ double p = .00035*a+.375;
+ double w = 1/(1-.597)*asin(pow((a1-10.6)/a1,1/p));
+ double c = 1+asin(pow(1-a/a1,1/p))/w;
+ return a1*(pow(sin((c-t)*w),p)-1);
+ }
+ if (t > .5)
+ x = sinePsi(a), x = pow(sin((1-t) * x), sinePow(x));
+ else
+ x = sinePhi(a), x = 1 - pow(sin(t * x), sinePow(x));
+ return linear_to_dB(x);
+}
+
+double lsx_inv_f_resp(double drop, double a)
+{
+ double x = sinePhi(a), s;
+ drop = dB_to_linear(drop);
+ s = drop > .5 ? 1 - drop : drop;
+ x = asin(pow(s, 1/sinePow(x))) / x;
+ return drop > .5? x : 1 -x;
+}
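lsx_f_resp(t, att) models the designed filter's response in dB at normalised
position t across the transition band, and lsx_inv_f_resp inverts it with t
measured from the opposite end (hence the lsx_to_3dB helper added to
filter.h below).  A round-trip sketch, assuming it is linked against this
file:

#include <stdio.h>
#include "filter.h"

int main(void)
{
  double att = 120;                         /* stop-band attenuation, dB */
  double t = 1 - lsx_inv_f_resp(-3., att);  /* i.e. lsx_to_3dB(att)      */
  printf("t=%.4f resp=%.2f dB\n", t, lsx_f_resp(t, att)); /* resp ~ -3  */
  return 0;
}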
diff --git a/soxr/src/filter.h b/soxr/src/filter.h
index 435303b..203e73d 100644
--- a/soxr/src/filter.h
+++ b/soxr/src/filter.h
@@ -16,10 +16,10 @@ void lsx_safe_rdft(int len, int type, double * d);
void lsx_safe_cdft(int len, int type, double * d);
void lsx_safe_rdft_f(int len, int type, float * d);
void lsx_safe_cdft_f(int len, int type, float * d);
-void lsx_ordered_convolve(int n, void * not_used, double * a, const double * b);
-void lsx_ordered_convolve_f(int n, void * not_used, float * a, const float * b);
-void lsx_ordered_partial_convolve(int n, double * a, const double * b);
-void lsx_ordered_partial_convolve_f(int n, float * a, const float * b);
+void lsx_ordered_convolve(int n, void * not_used, void * a, const void * b);
+void lsx_ordered_convolve_f(int n, void * not_used, void * a, const void * b);
+void lsx_ordered_partial_convolve(int n, void * a, const void * b);
+void lsx_ordered_partial_convolve_f(int n, void * a, const void * b);
double lsx_kaiser_beta(double att, double tr_bw);
double * lsx_make_lpf(int num_taps, double Fc, double beta, double rho,
@@ -31,9 +31,14 @@ double * lsx_design_lpf(
double Fn, /* Nyquist freq; e.g. 0.5, 1, PI; < 0: dummy run */
double att, /* Stop-band attenuation in dB */
int * num_taps, /* 0: value will be estimated */
- int k, /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */
+ int k, /* >0: number of phases; <0: num_taps = 1 (mod -k) */
double beta); /* <0: value will be estimated */
+
void lsx_fir_to_phase(double * * h, int * len,
int * post_len, double phase0);
+double lsx_f_resp(double t, double a);
+double lsx_inv_f_resp(double drop, double a);
+#define lsx_to_3dB(a) (1 - lsx_inv_f_resp(-3., a))
+
#endif
diff --git a/soxr/src/filters.h b/soxr/src/filters.h
deleted file mode 100644
index e9a8011..0000000
--- a/soxr/src/filters.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#include "half_coefs.h"
-
-#define FUNCTION h8
-#define CONVOLVE _ _ _ _ _ _ _ _
-#define h8_l 8
-#define COEFS half_fir_coefs_8
-#include "half-fir.h"
-
-#define FUNCTION h9
-#define CONVOLVE _ _ _ _ _ _ _ _ _
-#define h9_l 9
-#define COEFS half_fir_coefs_9
-#include "half-fir.h"
-
-#define FUNCTION h10
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _
-#define h10_l 10
-#define COEFS half_fir_coefs_10
-#include "half-fir.h"
-
-#define FUNCTION h11
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _
-#define h11_l 11
-#define COEFS half_fir_coefs_11
-#include "half-fir.h"
-
-#define FUNCTION h12
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ _
-#define h12_l 12
-#define COEFS half_fir_coefs_12
-#include "half-fir.h"
-
-#define FUNCTION h13
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ _ _
-#define h13_l 13
-#define COEFS half_fir_coefs_13
-#include "half-fir.h"
-
-static struct {int num_coefs; stage_fn_t fn; float att;} const half_firs[] = {
- { 8, h8 , 136.51f},
- { 9, h9 , 152.32f},
- {10, h10, 168.07f},
- {11, h11, 183.78f},
- {12, h12, 199.44f},
- {13, h13, 212.75f},
-};
-
-#define HI_PREC_CLOCK
-
-#define VAR_LENGTH p->n
-#define VAR_CONVOLVE while (j < FIR_LENGTH) _
-#define VAR_POLY_PHASE_BITS p->phase_bits
-
-#define FUNCTION vpoly0
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir0.h"
-
-#define FUNCTION vpoly1
-#define COEF_INTERP 1
-#define PHASE_BITS VAR_POLY_PHASE_BITS
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir.h"
-
-#define FUNCTION vpoly2
-#define COEF_INTERP 2
-#define PHASE_BITS VAR_POLY_PHASE_BITS
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir.h"
-
-#define FUNCTION vpoly3
-#define COEF_INTERP 3
-#define PHASE_BITS VAR_POLY_PHASE_BITS
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir.h"
-
-#undef HI_PREC_CLOCK
-
-#define U100_l 42
-#if RATE_SIMD_POLY
- #define U100_l_EXTRA _ _
- #define u100_l_EXTRA _
- #define U100_l_EXTRA_LENGTH 2
- #define u100_l_EXTRA_LENGTH 1
-#else
- #define U100_l_EXTRA
- #define u100_l_EXTRA
- #define U100_l_EXTRA_LENGTH 0
- #define u100_l_EXTRA_LENGTH 0
-#endif
-#define poly_fir_convolve_U100 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ U100_l_EXTRA
-#define FUNCTION U100_0
-#define FIR_LENGTH (U100_l + U100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_U100
-#include "poly-fir0.h"
-
-#define u100_l 11
-#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _ u100_l_EXTRA
-#define FUNCTION u100_0
-#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_u100
-#include "poly-fir0.h"
-
-#define FUNCTION u100_1
-#define COEF_INTERP 1
-#define PHASE_BITS 8
-#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_u100
-#include "poly-fir.h"
-#define u100_1_b 8
-
-#define FUNCTION u100_2
-#define COEF_INTERP 2
-#define PHASE_BITS 6
-#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_u100
-#include "poly-fir.h"
-#define u100_2_b 6
-
-typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t;
-typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t;
-
-static poly_fir_t const poly_firs[] = {
- {-1, {{0, vpoly0}, { 7.2f, vpoly1}, {5.0f, vpoly2}}},
- {-1, {{0, vpoly0}, { 9.4f, vpoly1}, {6.7f, vpoly2}}},
- {-1, {{0, vpoly0}, {12.4f, vpoly1}, {7.8f, vpoly2}}},
- {-1, {{0, vpoly0}, {13.6f, vpoly1}, {9.3f, vpoly2}}},
- {-1, {{0, vpoly0}, {10.5f, vpoly2}, {8.4f, vpoly3}}},
- {-1, {{0, vpoly0}, {11.85f,vpoly2}, {9.0f, vpoly3}}},
-
- {-1, {{0, vpoly0}, { 8.0f, vpoly1}, {5.3f, vpoly2}}},
- {-1, {{0, vpoly0}, { 8.6f, vpoly1}, {5.7f, vpoly2}}},
- {-1, {{0, vpoly0}, {10.6f, vpoly1}, {6.75f,vpoly2}}},
- {-1, {{0, vpoly0}, {12.6f, vpoly1}, {8.6f, vpoly2}}},
- {-1, {{0, vpoly0}, { 9.6f, vpoly2}, {7.6f, vpoly3}}},
- {-1, {{0, vpoly0}, {11.4f, vpoly2}, {8.65f,vpoly3}}},
-
- {10.62f, {{U100_l, U100_0}, {0, 0}, {0, 0}}},
- {11.28f, {{u100_l, u100_0}, {u100_1_b, u100_1}, {u100_2_b, u100_2}}},
- {-1, {{0, vpoly0}, { 9, vpoly1}, { 6, vpoly2}}},
- {-1, {{0, vpoly0}, { 11, vpoly1}, { 7, vpoly2}}},
- {-1, {{0, vpoly0}, { 13, vpoly1}, { 8, vpoly2}}},
- {-1, {{0, vpoly0}, { 10, vpoly2}, { 8, vpoly3}}},
- {-1, {{0, vpoly0}, { 12, vpoly2}, { 9, vpoly3}}},
-};
diff --git a/soxr/src/half-coefs.h b/soxr/src/half-coefs.h
new file mode 100644
index 0000000..a5a0882
--- /dev/null
+++ b/soxr/src/half-coefs.h
@@ -0,0 +1,75 @@
+/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if defined __GNUC__
+ #pragma GCC system_header
+#elif defined __SUNPRO_C
+ #pragma disable_warn
+#elif defined _MSC_VER
+ #pragma warning(push, 1)
+#endif
+
+#if CORE_TYPE & CORE_SIMD_HALF
+ #define VALIGN vAlign
+#else
+ #define VALIGN
+#endif
+
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+static VALIGN const sample_t half_fir_coefs_7[] = {
+ 3.1062656496657370e-01, -8.4998810699955796e-02, 3.4007044621123500e-02,
+-1.2839903789829387e-02, 3.9899380181723145e-03, -8.9355202017945374e-04,
+ 1.0918292424806546e-04,
+};
+#endif
+
+static VALIGN const sample_t half_fir_coefs_8[] = {
+ 3.1154652365332069e-01, -8.7344917685739543e-02, 3.6814458353637280e-02,
+-1.5189204581464479e-02, 5.4540855610738801e-03, -1.5643862626630416e-03,
+ 3.1816575906323303e-04, -3.4799449225005688e-05,
+};
+
+static VALIGN const sample_t half_fir_coefs_9[] = {
+ 3.1227034755311189e-01, -8.9221517147969526e-02, 3.9139704015071934e-02,
+-1.7250558515852023e-02, 6.8589440230476112e-03, -2.3045049636430419e-03,
+ 6.0963740543348963e-04, -1.1323803957431231e-04, 1.1197769991000046e-05,
+};
+
+#if CORE_TYPE & CORE_DBL
+static VALIGN const sample_t half_fir_coefs_10[] = {
+ 3.1285456012000523e-01, -9.0756740799292787e-02, 4.1096398104193160e-02,
+-1.9066319572525220e-02, 8.1840569787684902e-03, -3.0766876176359834e-03,
+ 9.6396524429277980e-04, -2.3585679989922018e-04, 4.0252189026627833e-05,
+-3.6298196342497932e-06,
+};
+
+static VALIGN const sample_t half_fir_coefs_11[] = {
+ 3.1333588822574199e-01, -9.2035898673019811e-02, 4.2765169698406408e-02,
+-2.0673580894964429e-02, 9.4225426824512421e-03, -3.8563379950013192e-03,
+ 1.3634742159642453e-03, -3.9874150714431009e-04, 9.0586723632664806e-05,
+-1.4285617244076783e-05, 1.1834642946400529e-06,
+};
+
+static VALIGN const sample_t half_fir_coefs_12[] = {
+ 3.1373928463345568e-01, -9.3118180335301962e-02, 4.4205005881659098e-02,
+-2.2103860986973051e-02, 1.0574689371162864e-02, -4.6276428065385065e-03,
+ 1.7936153397572132e-03, -5.9617527051353237e-04, 1.6314517495669067e-04,
+-3.4555126770115446e-05, 5.0617615610782593e-06, -3.8768958592971409e-07,
+};
+
+static VALIGN const sample_t half_fir_coefs_13[] = {
+ 3.1408224847888910e-01, -9.4045836332667387e-02, 4.5459878763259978e-02,
+-2.3383369012219993e-02, 1.1644273044890753e-02, -5.3806714579057013e-03,
+ 2.2429072878264022e-03, -8.2204347506606424e-04, 2.5724946477840893e-04,
+-6.6072709864248668e-05, 1.3099163296288644e-05, -1.7907147069136000e-06,
+ 1.2750825595240592e-07,
+};
+#endif
+
+#undef VALIGN
+
+#if defined __SUNPRO_C
+ #pragma enable_warn
+#elif defined _MSC_VER
+ #pragma warning(pop)
+#endif
diff --git a/soxr/src/half-fir.h b/soxr/src/half-fir.h
index 0a8ee97..782be1b 100644
--- a/soxr/src/half-fir.h
+++ b/soxr/src/half-fir.h
@@ -1,25 +1,61 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-/* Down-sample by a factor of 2 using a FIR with odd length (LEN).*/
+/* Decimate by 2 using a FIR with odd length (LEN). */
/* Input must be preceded and followed by LEN >> 1 samples. */
-#define _ sum += (input[-(2*j +1)] + input[(2*j +1)]) * COEFS[j], ++j;
-static void FUNCTION(stage_t * p, fifo_t * output_fifo)
+#define COEFS ((sample_t const *)p->coefs)
+
+#if SIMD_SSE
+ #define BEGINNING v4_t sum, q1, q2, t
+ #define ____ \
+ q1 = _mm_shuffle_ps(t=vLdu(input+2*j),vLdu(input+2*j+4),_MM_SHUFFLE(3,1,3,1)); \
+ q2 = _mm_shuffle_ps(vLdu(input-2*j-4),vLdu(input-2*j-8),_MM_SHUFFLE(1,3,1,3)); \
+ sum = vAdd(j? sum : vMul(vSet1(.5), t), vMul(vAdd(q1, q2), vLd(COEFS+j))); \
+ j += 4;
+ #define __ \
+ q1 = _mm_shuffle_ps(vLdu(input+2*j), vLdu(input-2*j-4), _MM_SHUFFLE(1,3,3,1)); \
+ q2 = _mm_loadl_pi(q2, (__m64*)(COEFS+j)), q2 = _mm_movelh_ps(q2, q2); \
+ sum = vAdd(sum, vMul(q1, q2)); \
+ j += 2;
+ #define _ \
+ q1 = _mm_add_ss(_mm_load_ss(input+2*j+1), _mm_load_ss(input-2*j-1)); \
+ sum = _mm_add_ss(sum, _mm_mul_ss(q1, _mm_load_ss(COEFS+j))); \
+ ++j;
+ #define END vStorSum(output+i, sum)
+/* #elif SIMD_AVX; No good solution found. */
+/* #elif SIMD_NEON; No need: gcc -O3 does a good job by itself. */
+#else
+ #define BEGINNING sample_t sum = input[0] * .5f
+ #define ____ __ __
+ #define __ _ _
+ #define _ sum += (input[-(2*j +1)] + input[(2*j +1)]) * COEFS[j], ++j;
+ #define END output[i] = sum
+#endif
+
+
+
+static void FUNCTION_H(stage_t * p, fifo_t * output_fifo)
{
- sample_t const * input = stage_read_p(p);
- int i, num_out = (stage_occupancy(p) + 1) / 2;
- sample_t * output = fifo_reserve(output_fifo, num_out);
+ sample_t const * __restrict input = stage_read_p(p);
+ int num_in = min(stage_occupancy(p), p->input_size);
+ int i, num_out = (num_in + 1) >> 1;
+ sample_t * __restrict output = fifo_reserve(output_fifo, num_out);
for (i = 0; i < num_out; ++i, input += 2) {
int j = 0;
- sample_t sum = input[0] * .5f;
- CONVOLVE
- output[i] = sum;
+ BEGINNING; CONVOLVE; END;
}
fifo_read(&p->fifo, 2 * num_out, NULL);
}
+
+
+
#undef _
+#undef __
+#undef ____
+#undef BEGINNING
+#undef END
#undef COEFS
#undef CONVOLVE
-#undef FUNCTION
+#undef FUNCTION_H
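For reference, the scalar decimator that these macros generate, written out
as a plain function (a sketch, not library code).  A half-band FIR has a
centre tap of 1/2 and zero-valued even taps, so only the odd-tap
coefficients are stored, and the input must be padded on each side per the
comment at the top of this file:

static void halve(float * out, float const * in, int n_out,
    float const * coefs, int len)
{
  int i, j;
  for (i = 0; i < n_out; ++i, in += 2) {
    float sum = in[0] * .5f;            /* centre tap */
    for (j = 0; j < len; ++j)           /* the scalar `_' macro, as a loop */
      sum += (in[-(2*j + 1)] + in[2*j + 1]) * coefs[j];
    out[i] = sum;
  }
}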
diff --git a/soxr/src/half_coefs.h b/soxr/src/half_coefs.h
deleted file mode 100644
index aac7769..0000000
--- a/soxr/src/half_coefs.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#if defined __GNUC__
- #pragma GCC system_header
-#elif defined __SUNPRO_C
- #pragma disable_warn
-#elif defined _MSC_VER
- #pragma warning(push, 1)
-#endif
-
-static const sample_t half_fir_coefs_8[] = {
- 0.3115465451887802, -0.08734497241282892, 0.03681452335604365,
- -0.01518925831569441, 0.005454118437408876, -0.001564400922162005,
- 0.0003181701445034203, -3.48001341225749e-5,
-};
-
-static const sample_t half_fir_coefs_9[] = {
- 0.3122703613711853, -0.08922155288172305, 0.03913974805854332,
- -0.01725059723447163, 0.006858970092378141, -0.002304518467568703,
- 0.0006096426006051062, -0.0001132393923815236, 1.119795386287666e-5,
-};
-
-static const sample_t half_fir_coefs_10[] = {
- 0.3128545521327376, -0.09075671986104322, 0.04109637155154835,
- -0.01906629512749895, 0.008184039342054333, -0.0030766775017262,
- 0.0009639607022414314, -0.0002358552746579827, 4.025184282444155e-5,
- -3.629779111541012e-6,
-};
-
-static const sample_t half_fir_coefs_11[] = {
- 0.3133358837508807, -0.09203588680609488, 0.04276515428384758,
- -0.02067356614745591, 0.00942253142371517, -0.003856330993895144,
- 0.001363470684892284, -0.0003987400965541919, 9.058629923971627e-5,
- -1.428553070915318e-5, 1.183455238783835e-6,
-};
-
-static const sample_t half_fir_coefs_12[] = {
- 0.3137392991811407, -0.0931182192961332, 0.0442050575271454,
- -0.02210391200618091, 0.01057473015666001, -0.00462766983973885,
- 0.001793630226239453, -0.0005961819959665878, 0.0001631475979359577,
- -3.45557865639653e-5, 5.06188341942088e-6, -3.877010943315563e-7,
-};
-
-static const sample_t half_fir_coefs_13[] = {
- 0.3140822554324578, -0.0940458550886253, 0.04545990399121566,
- -0.02338339450796002, 0.01164429409071052, -0.005380686021429845,
- 0.002242915773871009, -0.000822047600000082, 0.0002572510962395222,
- -6.607320708956279e-5, 1.309926399120154e-5, -1.790719575255006e-6,
- 1.27504961098836e-7,
-};
-
-#if defined __SUNPRO_C
- #pragma enable_warn
-#elif defined _MSC_VER
- #pragma warning(pop)
-#endif
diff --git a/soxr/src/internal.h b/soxr/src/internal.h
index 5d8d44e..08924d5 100644
--- a/soxr/src/internal.h
+++ b/soxr/src/internal.h
@@ -1,46 +1,84 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#if !defined soxr_internal_included
#define soxr_internal_included
-#include "soxr-config.h"
+#include "std-types.h"
+
+
#undef min
#undef max
#define min(a, b) ((a) <= (b) ? (a) : (b))
#define max(a, b) ((a) >= (b) ? (a) : (b))
+
+
#define range_limit(x, lower, upper) (min(max(x, lower), upper))
#define linear_to_dB(x) (log10(x) * 20)
#define array_length(a) (sizeof(a)/sizeof(a[0]))
+#if !defined AL
#define AL(a) array_length(a)
+#endif
#define iAL(a) (int)AL(a)
#define sqr(a) ((a) * (a))
-#ifdef __GNUC__
+
+
+#if defined __GNUC__
#define UNUSED __attribute__ ((unused))
#else
#define UNUSED
#endif
-#if defined NDEBUG || SOXR_SILENT
+
+
+#if !WITH_DEV_TRACE
#ifdef __GNUC__
void lsx_dummy(char const *, ...);
#else
static __inline void lsx_dummy(char const * x, ...) {}
#endif
#define lsx_debug if(0) lsx_dummy
+ #define lsx_debug_more lsx_debug
#else
- #include <stdarg.h>
- #include <stdio.h>
- UNUSED static void lsx_debug(char const * fmt, ...)
- {
- va_list args;
- va_start(args, fmt);
- vfprintf(stderr, fmt, args);
- fputc('\n', stderr);
- va_end(args);
- }
+ extern int _soxr_trace_level;
+ void _soxr_trace(char const * fmt, ...);
+ #define lsx_debug if (_soxr_trace_level > 0) _soxr_trace
+ #define lsx_debug_more if (_soxr_trace_level > 1) _soxr_trace
#endif
+
+
+
+/* soxr_quality_spec_t.flags: */
+
+#define SOXR_ROLLOFF_LSR2Q 3u /* Reserved for internal use. */
+#define SOXR_ROLLOFF_MASK 3u /* For masking these bits. */
+#define SOXR_MAINTAIN_3DB_PT 4u /* Reserved for internal use. */
+#define SOXR_PROMOTE_TO_LQ 64u /* Reserved for internal use. */
+
+
+
+/* soxr_runtime_spec_t.flags: */
+
+#define SOXR_STRICT_BUFFERING 4u /* Reserved for future use. */
+#define SOXR_NOSMALLINTOPT 8u /* For test purposes only. */
+
+
+
+/* soxr_quality_spec recipe: */
+
+#define SOXR_PRECISIONQ 11 /* Quality specified by the precision parameter. */
+
+#define SOXR_PHASE_MASK 0x30 /* For masking these bits. */
+
+
+
+/* soxr_quality_spec flags: */
+
+#define RESET_ON_CLEAR (1u<<31)
+
+
+
#endif
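linear_to_dB(2.) is ~6.0206 dB, i.e. one bit of dynamic range; _soxr_init in
cr-core.c turns the requested precision into a stop-band attenuation as
att = (bits + 1) * linear_to_dB(2.), the +1 splitting the error budget
between pass-band and stop-band.  Worked numbers for a 20-bit request:

#include <math.h>
#include <stdio.h>

#define linear_to_dB(x) (log10(x) * 20)

int main(void)
{
  double bits = 20;
  printf("att = %.2f dB\n", (bits + 1) * linear_to_dB(2.)); /* ~126.43 */
  return 0;
}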
diff --git a/soxr/src/libsoxr-dev.src.in b/soxr/src/libsoxr-dev.src.in
deleted file mode 100644
index ce879f9..0000000
--- a/soxr/src/libsoxr-dev.src.in
+++ /dev/null
@@ -1,2 +0,0 @@
-set(TARGET_HEADERS "@TARGET_HEADERS@")
-set(TARGET_PCS "@TARGET_PCS@")
diff --git a/soxr/src/libsoxr.src.in b/soxr/src/libsoxr.src.in
deleted file mode 100644
index 1c926ff..0000000
--- a/soxr/src/libsoxr.src.in
+++ /dev/null
@@ -1 +0,0 @@
-set(TARGET_LIBS "@TARGET_LIBS@")
diff --git a/soxr/src/lsr.c b/soxr/src/lsr.c
deleted file mode 100644
index 64b5798..0000000
--- a/soxr/src/lsr.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-/* Wrapper mostly compatible with `libsamplerate'. */
-
-#include
-#include
-#include "soxr.h"
-
-/* Runtime casts: */
-typedef struct io_t {
- float *in,*out; long ilen,olen,idone,odone; int eoi; double oi_ratio;} io_t;
-#define SRC_DATA io_t
-typedef struct soxr SRC_STATE;
-#define src_callback_t soxr_input_fn_t
-#define SRC_ERROR soxr_error_t
-#define SRC_SRCTYPE unsigned
-
-#include "soxr-lsr.h"
-#include "rint.h"
-
-soxr_error_t src_simple(io_t * p, unsigned id, int channels)
-{
- size_t idone, odone;
- soxr_error_t error;
- soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + id, 0);
- char const * e = getenv("SOXR_LSR_NUM_THREADS");
- soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1));
- assert (channels > 0);
- assert (p->ilen >= 0);
- assert (p->olen >= 0);
- error = soxr_oneshot(1, p->oi_ratio, (unsigned)channels,
- p->in, (size_t)p->ilen, &idone, p->out, (size_t)p->olen, &odone,
- 0, &q_spec, &r_spec);
- p->idone = (long)idone, p->odone = (long)odone;
- return error;
-}
-
-soxr_t src_callback_new(soxr_input_fn_t fn, unsigned id, int channels, SRC_ERROR * error0, void * p)
-{
- soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + id, 0);
- char const * e = getenv("SOXR_LSR_NUM_THREADS");
- soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1));
- soxr_error_t error;
- soxr_t soxr = 0;
- assert (channels > 0);
- /* To minimise latency e.g. for real-time playback:
- if (id == 2)
- r_spec.log2_large_dft_size = r_spec.log2_min_dft_size = 8;
- */
- soxr = soxr_create(0, 0, (unsigned)channels, &error, 0, &q_spec, &r_spec);
- if (soxr)
- error = soxr_set_input_fn(soxr, fn, p, 0);
- if (error0)
- *(int *)error0 = (int)(ptrdiff_t)error;
- return soxr;
-}
-
-soxr_error_t src_process(soxr_t p, io_t * io)
-{
- if (!p || !io) return "null pointer";
- soxr_set_error(p, soxr_set_io_ratio(p, 1/io->oi_ratio, (size_t)io->olen));
-
- { size_t idone , odone;
- soxr_process(p, io->in, (size_t)(io->eoi? ~io->ilen : io->ilen), /* hack */
- &idone, io->out, (size_t)io->olen, &odone);
- io->idone = (long)idone, io->odone = (long)odone;
- return soxr_error(p); }
-}
-
-long src_callback_read(soxr_t p, double oi_ratio, long olen, float * obuf)
-{
- if (!p || olen < 0) return -1;
- soxr_set_error(p, soxr_set_io_ratio(p, 1/oi_ratio, (size_t)olen));
- return (long)soxr_output(p, obuf, (size_t)olen);
-}
-
-void src_float_to_short_array(float const * src, short * dest, int len)
-{
- double d, N = 1. + SHRT_MAX;
- assert (src && dest);
- while (len--) d = src[len] * N, dest[len] = (short)(d > N - 1? (short)(N - 1) : d < -N? (short)-N : rint16(d));
-}
-
-void src_short_to_float_array(short const * src, float * dest, int len)
-{
- assert (src && dest);
- while (len--) dest[len] = (float)(src[len] * (1 / (1. + SHRT_MAX)));
-}
-
-void src_float_to_int_array(float const * src, int * dest, int len)
-{
- double d, N = 32768. * 65536.; /* N.B. int32, not int! (Also next fn.) */
- assert (src && dest);
- while (len--) d = src[len] * N, dest[len] = d >= N - 1? (int)(N - 1) : d < -N? (int)(-N) : rint32(d);
-}
-
-void src_int_to_float_array(int const * src, float * dest, int len)
-{
- assert (src && dest);
- while (len--) dest[len] = (float)(src[len] * (1 / (32768. * 65536.)));
-}
-
-static char const * const names[] = {"LSR best sinc", "LSR medium sinc", "LSR fastest sinc", "LSR ZOH", "LSR linear", "SoX VHQ"};
-char const * src_get_name(unsigned n) {return n < 5u + !getenv("SOXR_LSR_STRICT")? names[n] : 0;}
-char const * src_get_description(unsigned id) {return src_get_name(id);}
-char const * src_get_version(void) {return soxr_version();}
-char const * src_strerror(soxr_error_t error) {return error == (soxr_error_t)1? "Placeholder." : sizeof(int) >= sizeof(char *) || !error ? soxr_strerror(error) : "soxr error";}
-int src_is_valid_ratio(double oi_ratio) {return getenv("SOXR_LSR_STRICT")? oi_ratio >= 1./256 && oi_ratio <= 256 : oi_ratio > 0;}
-soxr_error_t src_error(soxr_t p) {return soxr_error(p);}
-soxr_error_t src_reset(soxr_t p) {return soxr_clear(p);}
-soxr_t src_delete(soxr_t p) {soxr_delete(p); return 0;}
-soxr_error_t src_set_ratio(soxr_t p, double oi_ratio) {return soxr_set_io_ratio(p, 1/oi_ratio, 0);}
-soxr_t src_new(unsigned id, int channels, SRC_ERROR * error) {return src_callback_new(0, id, channels, error, 0);}
diff --git a/soxr/src/math-wrap.h b/soxr/src/math-wrap.h
new file mode 100644
index 0000000..8a526f1
--- /dev/null
+++ b/soxr/src/math-wrap.h
@@ -0,0 +1,31 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_math_wrap_included
+#define soxr_math_wrap_included
+
+#include <math.h>
+
+#if defined __STRICT_ANSI__
+ #define sinf(x) (float)sin ((double)(x))
+ #define cosf(x) (float)cos ((double)(x))
+ #define atanf(x) (float)atan((double)(x))
+#endif
+
+#if !defined M_PI
+ #define M_PI 3.141592653589793238462643383279502884
+#endif
+
+#if !defined M_LN10
+ #define M_LN10 2.302585092994045684017991454684364208
+#endif
+
+#if !defined M_SQRT2
+ #define M_SQRT2 1.414213562373095048801688724209698079
+#endif
+
+#if !defined M_LN2
+ #define M_LN2 0.693147180559945309417232121458176568
+#endif
+
+#endif
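This header exists because a strict C89 compiler (__STRICT_ANSI__) does not
declare the float variants sinf/cosf/atanf, and some libms omit the M_*
constants; the macros above substitute double-precision calls and literal
values, so a sketch like the following compiles either way:

    #include "math-wrap.h"

    /* Under __STRICT_ANSI__, sinf() here expands to
       (float)sin((double)(x)); otherwise the C99 sinf() is called. */
    static float quarter_wave(float x) { return sinf(x * (float)(M_PI / 2)); }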
diff --git a/soxr/src/pffft-avx.h b/soxr/src/pffft-avx.h
new file mode 100644
index 0000000..ace19b5
--- /dev/null
+++ b/soxr/src/pffft-avx.h
@@ -0,0 +1,40 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+/* AVX support macros */
+
+#if !defined soxr_avx_included
+#define soxr_avx_included
+
+#include <immintrin.h>
+
+typedef __m256d v4sf;
+#define VZERO() _mm256_setzero_pd()
+#define VMUL(a,b) _mm256_mul_pd(a,b)
+#define VADD(a,b) _mm256_add_pd(a,b)
+#define VMADD(a,b,c) VADD(VMUL(a,b),c) /* Note: gcc -mfma will `fuse' these */
+#define VSUB(a,b) _mm256_sub_pd(a,b)
+#define LD_PS1(p) _mm256_set1_pd(p)
+#define INTERLEAVE2(in1, in2, out1, out2) {v4sf \
+ t1 = _mm256_unpacklo_pd(in1, in2), \
+ t2 = _mm256_unpackhi_pd(in1, in2); \
+ out1 = _mm256_permute2f128_pd(t1,t2,0x20); \
+ out2 = _mm256_permute2f128_pd(t1,t2,0x31); }
+#define UNINTERLEAVE2(in1, in2, out1, out2) {v4sf \
+ t1 = _mm256_permute2f128_pd(in1,in2,0x20), \
+ t2 = _mm256_permute2f128_pd(in1,in2,0x31); \
+ out1 = _mm256_unpacklo_pd(t1, t2); \
+ out2 = _mm256_unpackhi_pd(t1, t2);}
+#define VTRANSPOSE4(x0,x1,x2,x3) {v4sf \
+ t0 = _mm256_shuffle_pd(x0,x1, 0x0), \
+ t2 = _mm256_shuffle_pd(x0,x1, 0xf), \
+ t1 = _mm256_shuffle_pd(x2,x3, 0x0), \
+ t3 = _mm256_shuffle_pd(x2,x3, 0xf); \
+ x0 = _mm256_permute2f128_pd(t0,t1, 0x20); \
+ x1 = _mm256_permute2f128_pd(t2,t3, 0x20); \
+ x2 = _mm256_permute2f128_pd(t0,t1, 0x31); \
+ x3 = _mm256_permute2f128_pd(t2,t3, 0x31);}
+#define VSWAPHL(a,b) _mm256_permute2f128_pd(b, a, 0x30)
+#define VALIGNED(ptr) ((((long)(ptr)) & 0x1F) == 0)
+
+#endif
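For reference, the lane semantics these AVX macros implement (the same
contract the SSE and NEON variants provide, and which validate_pffft_simd()
in pffft.c asserts) can be written out in scalar form. A sketch for
illustration only; the real macros shuffle __m256d lanes:

    typedef struct { double f[4]; } v4ref;

    /* INTERLEAVE2: out1 = {in1[0],in2[0],in1[1],in2[1]},
                    out2 = {in1[2],in2[2],in1[3],in2[3]} */
    static void interleave2_ref(v4ref in1, v4ref in2, v4ref *out1, v4ref *out2)
    {
      int i;
      for (i = 0; i < 2; ++i) {
        out1->f[2*i] = in1.f[i];   out1->f[2*i+1] = in2.f[i];
        out2->f[2*i] = in1.f[i+2]; out2->f[2*i+1] = in2.f[i+2];
      }
    }

    /* VTRANSPOSE4: treat x0..x3 as rows of a 4x4 matrix and transpose. */
    static void vtranspose4_ref(v4ref x[4])
    {
      v4ref t[4]; int r, c;
      for (r = 0; r < 4; ++r) for (c = 0; c < 4; ++c) t[c].f[r] = x[r].f[c];
      for (r = 0; r < 4; ++r) x[r] = t[r];
    }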
diff --git a/soxr/src/pffft-wrap.c b/soxr/src/pffft-wrap.c
new file mode 100644
index 0000000..c920f06
--- /dev/null
+++ b/soxr/src/pffft-wrap.c
@@ -0,0 +1,110 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined PFFT_MACROS_ONLY
+
+#include "math-wrap.h"
+
+#if PFFFT_DOUBLE
+ #include "util64s.h"
+#else
+ #include "util32s.h"
+ #define sin(x) sinf(x)
+ #define cos(x) cosf(x)
+#endif
+
+#define pffft_aligned_free SIMD_ALIGNED_FREE
+#define pffft_aligned_malloc SIMD_ALIGNED_MALLOC
+#define pffft_aligned_calloc SIMD_ALIGNED_CALLOC
+
+#undef inline
+#define inline __inline
+
+#endif
+
+
+
+#include "pffft.c"
+
+
+
+#if !defined PFFT_MACROS_ONLY
+
+#if !defined PFFFT_SIMD_DISABLE
+
+static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
+ int i, Ncvec = s->Ncvec;
+ const v4sf * /*RESTRICT*/ va = (const v4sf*)a;
+ const v4sf * RESTRICT vb = (const v4sf*)b;
+ v4sf * /*RESTRICT*/ vab = (v4sf*)ab;
+
+ float ar, ai, br, bi;
+
+#ifdef __arm__
+ __builtin_prefetch(va);
+ __builtin_prefetch(vb);
+ __builtin_prefetch(va+2);
+ __builtin_prefetch(vb+2);
+ __builtin_prefetch(va+4);
+ __builtin_prefetch(vb+4);
+ __builtin_prefetch(va+6);
+ __builtin_prefetch(vb+6);
+#endif
+
+ assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
+ ar = ((v4sf_union*)va)[0].f[0];
+ ai = ((v4sf_union*)va)[1].f[0];
+ br = ((v4sf_union*)vb)[0].f[0];
+ bi = ((v4sf_union*)vb)[1].f[0];
+
+ for (i=0; i < Ncvec; i += 2) {
+ v4sf ar, ai, br, bi;
+ ar = va[2*i+0]; ai = va[2*i+1];
+ br = vb[2*i+0]; bi = vb[2*i+1];
+ VCPLXMUL(ar, ai, br, bi);
+ vab[2*i+0] = ar;
+ vab[2*i+1] = ai;
+ ar = va[2*i+2]; ai = va[2*i+3];
+ br = vb[2*i+2]; bi = vb[2*i+3];
+ VCPLXMUL(ar, ai, br, bi);
+ vab[2*i+2] = ar;
+ vab[2*i+3] = ai;
+ }
+ if (s->transform == PFFFT_REAL) {
+ ((v4sf_union*)vab)[0].f[0] = ar*br;
+ ((v4sf_union*)vab)[1].f[0] = ai*bi;
+ }
+}
+
+#else
+
+static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
+ int i, Ncvec = s->Ncvec;
+
+ if (s->transform == PFFFT_REAL) {
+ /* take care of the fftpack ordering */
+ ab[0] = a[0]*b[0];
+ ab[2*Ncvec-1] = a[2*Ncvec-1]*b[2*Ncvec-1];
+ ++ab; ++a; ++b; --Ncvec;
+ }
+ for (i=0; i < Ncvec; ++i) {
+ float ar, ai, br, bi;
+ ar = a[2*i+0]; ai = a[2*i+1];
+ br = b[2*i+0]; bi = b[2*i+1];
+ VCPLXMUL(ar, ai, br, bi);
+ ab[2*i+0] = ar;
+ ab[2*i+1] = ai;
+ }
+}
+
+#endif
+
+#include <string.h>
+
+static void pffft_reorder_back(int length, void * setup, float * data, float * work)
+{
+ memcpy(work, data, (unsigned)length * sizeof(*work));
+ pffft_zreorder(setup, work, data, PFFFT_BACKWARD);
+}
+
+#endif
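Taken together, the pieces in this wrapper give fast convolution: forward
transform, pointwise complex multiply via pffft_zconvolve(), inverse
transform. A sketch of how the static entry points combine within one
translation unit (alignment, buffer sizing and the 1/N scaling noted in
pffft.h are the caller's responsibility):

    /* Sketch only: x := x (*) h for one block, where h_freq already holds
       the forward transform of the filter; all buffers SIMD-aligned. */
    static void fft_convolve_block(PFFFT_Setup *s, float *x,
                                   float const *h_freq, float *scratch)
    {
      /* forward DFT in pffft's internal (unordered) order */
      pffft_transform(s, x, x, scratch, PFFFT_FORWARD);
      /* frequency-domain multiply; the SIMD path above special-cases the
         packed DC/Nyquist terms of PFFFT_REAL transforms */
      pffft_zconvolve(s, x, h_freq, x);
      /* back to the time domain; result is scaled by the FFT length */
      pffft_transform(s, x, x, scratch, PFFFT_BACKWARD);
    }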
diff --git a/soxr/src/pffft.c b/soxr/src/pffft.c
index 957e604..46c841e 100644
--- a/soxr/src/pffft.c
+++ b/soxr/src/pffft.c
@@ -1,4 +1,7 @@
-/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com )
+/* https://bitbucket.org/jpommier/pffft/raw/483453d8f7661058e74aa4e7cf5c27bcd7887e7a/pffft.c
+ * with minor changes for libsoxr. */
+
+/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
@@ -57,29 +60,12 @@
- 2011/10/02, version 1: This is the very first release of this file.
*/
-#if !defined PFFT_MACROS_ONLY
#include "pffft.h"
-#include "simd.h"
-#include <stdio.h>
#include <stdlib.h>
+#include <stdio.h>
#include <math.h>
#include <assert.h>
-#define pffft_aligned_free _soxr_simd_aligned_free
-#define pffft_aligned_malloc _soxr_simd_aligned_malloc
-#define pffft_aligned_calloc _soxr_simd_aligned_calloc
-#endif
-
-/*
- vector support macros: the rest of the code is independant of
- SSE/Altivec/NEON -- adding support for other platforms with 4-element
- vectors should be limited to these macros
-*/
-
-
-/* define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code */
-/*#define PFFFT_SIMD_DISABLE */
-
/* detect compiler flavour */
#if defined(_MSC_VER)
# define COMPILER_MSVC
@@ -91,14 +77,25 @@
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
# define RESTRICT __restrict
-/*# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; */
+# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
#elif defined(COMPILER_MSVC)
# define ALWAYS_INLINE(return_type) __forceinline return_type
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
# define RESTRICT __restrict
-/*# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (v4sf*)_alloca(size__ * sizeof(type__)) */
+# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
#endif
+
+/*
+ vector support macros: the rest of the code is independant of
+ SSE/Altivec/NEON -- adding support for other platforms with 4-element
+ vectors should be limited to these macros
+*/
+
+
+/* define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code */
+/*#define PFFFT_SIMD_DISABLE */
+
/*
Altivec support macros
*/
@@ -136,9 +133,11 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+# define SIMD_SZ 4 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
+
+#if !PFFFT_DOUBLE
#include <xmmintrin.h>
typedef __m128 v4sf;
-# define SIMD_SZ 4 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
@@ -151,10 +150,14 @@ typedef __m128 v4sf;
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
+#else
+#include "pffft-avx.h"
+#endif
+
/*
ARM NEON support macros
*/
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__arm64__) || defined(__aarch64__))
+#elif !defined(PFFFT_SIMD_DISABLE) && defined(__arm__)
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
@@ -166,7 +169,7 @@ typedef float32x4_t v4sf;
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
-# define VTRANSPOSE4_(x0,x1,x2,x3) { \
+# define VTRANSPOSE4(x0,x1,x2,x3) { \
float32x4x2_t t0_ = vzipq_f32(x0, x2); \
float32x4x2_t t1_ = vzipq_f32(x1, x3); \
float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
@@ -174,7 +177,7 @@ typedef float32x4_t v4sf;
x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
}
/* marginally faster version */
-# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+/*# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } */
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
#else
@@ -184,6 +187,10 @@ typedef float32x4_t v4sf;
# endif
#endif
+#if PFFFT_DOUBLE
+#define float double
+#endif
+
/* fallback mode for situations where SSE/Altivec are not available, use scalar mode instead */
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
@@ -200,6 +207,12 @@ typedef float v4sf;
/* shortcuts for complex multiplcations */
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
+#ifndef SVMUL
+/* multiply a scalar with a vector */
+#define SVMUL(f,v) VMUL(LD_PS1(f),v)
+#endif
+
+#if !defined PFFT_MACROS_ONLY
#if !defined(PFFFT_SIMD_DISABLE)
typedef union v4sf_union {
@@ -213,7 +226,8 @@ typedef union v4sf_union {
#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
/* detect bugs with the vector support macros */
-void validate_pffft_simd() {
+void validate_pffft_simd(void);
+void validate_pffft_simd(void) {
float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
v4sf_union a0, a1, a2, a3, t, u;
memcpy(a0.f, f, 4*sizeof(float));
@@ -229,7 +243,6 @@ void validate_pffft_simd() {
printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
t.v = VMADD(a1.v, a2.v,a0.v);
printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
-
INTERLEAVE2(a1.v,a2.v,t.v,u.v);
printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
@@ -252,20 +265,23 @@ void validate_pffft_simd() {
#endif
#endif /*!PFFFT_SIMD_DISABLE */
-#if !defined PFFT_MACROS_ONLY
+#if 0
+/* SSE and co like 16-bytes aligned pointers */
+#define MALLOC_V4SF_ALIGNMENT 64 /* with a 64-byte alignment, we are even aligned on L2 cache lines... */
+void *pffft_aligned_malloc(size_t nb_bytes) {
+ void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
+ if (!p0) return (void *) 0;
+ p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
+ *((void **) p - 1) = p0;
+ return p;
+}
+void pffft_aligned_free(void *p) {
+ if (p) free(*((void **) p - 1));
+}
-#if defined (COMPILER_MSVC)
- #define sin (float)sin
- #define cos (float)cos
-#else
- #define sin sinf
- #define cos cosf
-#endif
-
-/*
int pffft_simd_size() { return SIMD_SZ; }
-*/
+#endif
/*
passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2
@@ -299,6 +315,7 @@ static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c
/*
passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3
*/
+#if 0
static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
const float *wa1, const float *wa2, float fsign) {
static const float taur = -0.5f;
@@ -311,13 +328,13 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
    for (i=0; i<ido-1; i+=2) {
+  assert(ido > 2);
+ for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
+ for (i = 0; i < ido-1; i += 2) {
+ ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5));
+ ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5));
+ ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4));
+ ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4));
+ tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
+ tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
+ tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
+ tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
+ ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
+ ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3));
+ cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
+ ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
+ cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
+ ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
+ cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
+ ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
+ cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
+ ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
+ dr3 = VSUB(cr3, ci4);
+ dr4 = VADD(cr3, ci4);
+ di3 = VADD(ci3, cr4);
+ di4 = VSUB(ci3, cr4);
+ dr5 = VADD(cr2, ci5);
+ dr2 = VSUB(cr2, ci5);
+ di5 = VSUB(ci2, cr5);
+ di2 = VADD(ci2, cr5);
+ wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
+ wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
+ VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+ ch_ref(i - 1, 2) = dr2;
+ ch_ref(i, 2) = di2;
+ VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+ ch_ref(i - 1, 3) = dr3;
+ ch_ref(i, 3) = di3;
+ VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
+ ch_ref(i - 1, 4) = dr4;
+ ch_ref(i, 4) = di4;
+ VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
+ ch_ref(i - 1, 5) = dr5;
+ ch_ref(i, 5) = di5;
+ }
+ }
+#undef ch_ref
+#undef cc_ref
+}
+#endif
+
static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
static const float minus_one = -1.f;
int i, k, l1ido = l1*ido;
@@ -425,7 +515,7 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
if (ido % 2 == 1) return;
}
for (k=0; k < l1ido; k += ido) {
- ch[2*k + ido] = VMUL(LD_PS1(minus_one), cc[ido-1 + k + l1ido]);
+ ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
ch[2*k + ido-1] = cc[k + ido-1];
}
} /* radf2 */
@@ -460,10 +550,11 @@ static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, co
for (k = 0; k < l1ido; k += ido) {
a = cc[2*k + ido-1]; b = cc[2*k + ido];
ch[k + ido-1] = VADD(a,a);
- ch[k + ido-1 + l1ido] = VMUL(LD_PS1(minus_two), b);
+ ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
}
} /* radb2 */
+#if 0
static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
const float *wa1, const float *wa2) {
static const float taur = -0.5f;
@@ -473,8 +564,8 @@ static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT
  for (k=0; k<l1; k++) {
-  if (transform == PFFFT_REAL) { assert(N >= 32); }
- if (transform == PFFFT_COMPLEX) { assert(N >= 16); }
+ int k, m;
+ if (!s) return s;
+ /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
+ and 32 for real FFTs -- a lot of stuff would need to be rewritten to
+ handle other cases (or maybe just switch to a scalar fft, I don't know..) */
+ if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
+ if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
/*assert((N % 32) == 0); */
s->N = N;
s->transform = transform;
/* nb of complex simd vectors */
s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
s->data = (v4sf*)pffft_aligned_malloc(2*(size_t)s->Ncvec * sizeof(v4sf));
- if (!s->data) {
- free(s);
- return 0;
- }
+ if (!s->data) {free(s); return 0;}
s->e = (float*)s->data;
s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
@@ -988,15 +1288,22 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
}
cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
}
+
+ /* check that N is decomposable with allowed prime factors */
+ for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
+ if (m != N/SIMD_SZ) {
+ pffft_destroy_setup(s); s = 0;
+ }
+
return s;
}
-static void pffft_destroy_setup(PFFFT_Setup *s) {
- if(s){
- pffft_aligned_free(s->data);
- free(s);
- }
+static
+void pffft_destroy_setup(PFFFT_Setup *s) {
+ if (!s) return;
+ pffft_aligned_free(s->data);
+ free(s);
}
#if !defined(PFFFT_SIMD_DISABLE)
@@ -1035,7 +1342,8 @@ static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
UNINTERLEAVE2(h0, g1, out[0], out[1]);
}
-static void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
+static
+void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
int k, N = setup->N, Ncvec = setup->Ncvec;
const v4sf *vin = (const v4sf*)in;
v4sf *vout = (v4sf*)out;
@@ -1072,7 +1380,8 @@ static void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pfff
}
}
-static void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+static
+void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
@@ -1116,7 +1425,8 @@ static void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf
}
}
-static void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+static
+void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
@@ -1342,22 +1652,23 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf
}
-static void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
+static
+void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
pffft_direction_t direction, int ordered) {
int k, Ncvec = setup->Ncvec;
int nf_odd = (setup->ifac[1] & 1);
+#if 0
/* temporary buffer is allocated on the stack if the scratch pointer is NULL */
- /*int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); */
- /*VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); */
+ int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
+ VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
+#endif
- int ib = (nf_odd ^ ordered ? 1 : 0);
const v4sf *vinput = (const v4sf*)finput;
v4sf *voutput = (v4sf*)foutput;
v4sf *buff[2];
- buff[0] = voutput, buff[1] = scratch /*? scratch : scratch_on_stack*/;
-
- /*if (scratch == 0) scratch = scratch_on_stack; */
+ int ib = (nf_odd ^ ordered ? 1 : 0);
+ buff[0] = voutput; buff[1] = scratch;
assert(VALIGNED(finput) && VALIGNED(foutput));
@@ -1415,8 +1726,8 @@ static void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, fl
}
#if 0
-static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
- int i, Ncvec = s->Ncvec;
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
+ int Ncvec = s->Ncvec;
const v4sf * RESTRICT va = (const v4sf*)a;
const v4sf * RESTRICT vb = (const v4sf*)b;
v4sf * RESTRICT vab = (v4sf*)ab;
@@ -1434,10 +1745,16 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo
__builtin_prefetch(va+6);
__builtin_prefetch(vb+6);
__builtin_prefetch(vab+6);
+# ifndef __clang__
+# define ZCONVOLVE_USING_INLINE_NEON_ASM
+# endif
#endif
float ar, ai, br, bi, abr, abi;
+#ifndef ZCONVOLVE_USING_INLINE_ASM
v4sf vscal = LD_PS1(scaling);
+ int i;
+#endif
assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
ar = ((v4sf_union*)va)[0].f[0];
@@ -1447,8 +1764,7 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo
abr = ((v4sf_union*)vab)[0].f[0];
abi = ((v4sf_union*)vab)[1].f[0];
-#ifdef __arm__
-# if 1 /* inline asm version */
+#ifdef ZCONVOLVE_USING_INLINE_ASM /* inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc */
const float *a_ = a, *b_ = b; float *ab_ = ab;
int N = Ncvec;
asm volatile("mov r8, %2 \n"
@@ -1484,49 +1800,7 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo
"subs %3, #2 \n"
"bne 1b \n"
: "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
-
-# else /* neon instrinsics version, 30% slower that the asm one with gcc 4.6 */
- v4sf a1r, a1i, b1r, b1i;
- v4sf a2r, a2i, b2r, b2i;
- v4sf ab1r, ab1i, ab2r, ab2i;
- for (i=0; i < Ncvec; i += 2) {
- __builtin_prefetch(va+8);
- __builtin_prefetch(va+10);
-
- a1r = *va++; a1i = *va++;
- a2r = *va++; a2i = *va++;
- b1r = *vb++; b1i = *vb++;
- b2r = *vb++; b2i = *vb++;
- ab1r = vab[0]; ab1i = vab[1];
- ab2r = vab[2]; ab2i = vab[3];
-
- v4sf z1r = VMUL(a1r, b1r);
- v4sf z2r = VMUL(a2r, b2r);
- v4sf z1i = VMUL(a1r, b1i);
- v4sf z2i = VMUL(a2r, b2i);
-
- __builtin_prefetch(vb+4);
- __builtin_prefetch(vb+6);
-
- z1r = vmlsq_f32(z1r, a1i, b1i);
- z2r = vmlsq_f32(z2r, a2i, b2i);
- z1i = vmlaq_f32(z1i, a1i, b1r);
- z2i = vmlaq_f32(z2i, a2i, b2r);
-
- __builtin_prefetch(vab+4);
- __builtin_prefetch(vab+6);
-
- ab1r = vmlaq_f32(ab1r, z1r, vscal);
- ab2r = vmlaq_f32(ab2r, z2r, vscal);
- ab1i = vmlaq_f32(ab1i, z1i, vscal);
- ab2i = vmlaq_f32(ab2i, z2i, vscal);
-
- *vab++ = ab1r; *vab++ = ab1i;
- *vab++ = ab2r; *vab++ = ab2i;
- }
-# endif
-
-#else /* not ARM, no need to use a special routine */
+#else /* default routine, works fine for non-arm cpus with current compilers */
for (i=0; i < Ncvec; i += 2) {
v4sf ar, ai, br, bi;
ar = va[2*i+0]; ai = va[2*i+1];
@@ -1548,50 +1822,14 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo
}
#endif
-static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
- int i, Ncvec = s->Ncvec;
- const v4sf * /*RESTRICT*/ va = (const v4sf*)a;
- const v4sf * RESTRICT vb = (const v4sf*)b;
- v4sf * /*RESTRICT*/ vab = (v4sf*)ab;
-
- float ar, ai, br, bi;
-
-#ifdef __arm__
-#error
-#endif
- assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
- ar = ((v4sf_union*)va)[0].f[0];
- ai = ((v4sf_union*)va)[1].f[0];
- br = ((v4sf_union*)vb)[0].f[0];
- bi = ((v4sf_union*)vb)[1].f[0];
-
- for (i=0; i < Ncvec; i += 2) {
- v4sf ar, ai, br, bi;
- ar = va[2*i+0]; ai = va[2*i+1];
- br = vb[2*i+0]; bi = vb[2*i+1];
- VCPLXMUL(ar, ai, br, bi);
- vab[2*i+0] = ar;
- vab[2*i+1] = ai;
- ar = va[2*i+2]; ai = va[2*i+3];
- br = vb[2*i+2]; bi = vb[2*i+3];
- VCPLXMUL(ar, ai, br, bi);
- vab[2*i+2] = ar;
- vab[2*i+3] = ai;
- }
- if (s->transform == PFFFT_REAL) {
- ((v4sf_union*)vab)[0].f[0] = ar*br;
- ((v4sf_union*)vab)[1].f[0] = ai*bi;
- }
-}
-
-
#else /* defined(PFFFT_SIMD_DISABLE) */
/* standard routine using scalar floats, without SIMD stuff. */
#define pffft_zreorder_nosimd pffft_zreorder
-static void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
+static
+void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
int k, N = setup->N;
if (setup->transform == PFFFT_COMPLEX) {
for (k=0; k < 2*N; ++k) out[k] = in[k];
@@ -1611,19 +1849,22 @@ static void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *ou
}
#define pffft_transform_internal_nosimd pffft_transform_internal
-static void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
+static
+void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
pffft_direction_t direction, int ordered) {
int Ncvec = setup->Ncvec;
int nf_odd = (setup->ifac[1] & 1);
+#if 0
/* temporary buffer is allocated on the stack if the scratch pointer is NULL */
- /*int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); */
- /*VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); */
- /*if (scratch == 0) scratch = scratch_on_stack; */
-
- int ib;
+ int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
+ VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
+#endif
float *buff[2];
- buff[0] = output, buff[1] = scratch;
+ int ib;
+ /* if (scratch == 0) scratch = scratch_on_stack; */
+ buff[0] = output; buff[1] = scratch;
+
if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */
ib = (nf_odd ^ ordered ? 1 : 0);
@@ -1669,7 +1910,7 @@ static void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *inp
#if 0
#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
-static void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
+void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
float *ab, float scaling) {
int i, Ncvec = s->Ncvec;
@@ -1690,40 +1931,16 @@ static void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, co
}
#endif
-#define pffft_zconvolve_nosimd pffft_zconvolve
-static void pffft_zconvolve_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
- int i, Ncvec = s->Ncvec;
-
- if (s->transform == PFFFT_REAL) {
- /* take care of the fftpack ordering */
- ab[0] = a[0]*b[0];
- ab[2*Ncvec-1] = a[2*Ncvec-1]*b[2*Ncvec-1];
- ++ab; ++a; ++b; --Ncvec;
- }
- for (i=0; i < Ncvec; ++i) {
- float ar, ai, br, bi;
- ar = a[2*i+0]; ai = a[2*i+1];
- br = b[2*i+0]; bi = b[2*i+1];
- VCPLXMUL(ar, ai, br, bi);
- ab[2*i+0] = ar;
- ab[2*i+1] = ai;
- }
-}
-
#endif /* defined(PFFFT_SIMD_DISABLE) */
-static void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
+static
+void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}
-static void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
+static
+void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
-
-static void pffft_reorder_back(int length, void * setup, float * data, float * work)
-{
- memcpy(work, data, (unsigned)length * sizeof(*work));
- pffft_zreorder(setup, work, data, PFFFT_BACKWARD);
-}
#endif
diff --git a/soxr/src/pffft.h b/soxr/src/pffft.h
index 78d936b..63522ca 100644
--- a/soxr/src/pffft.h
+++ b/soxr/src/pffft.h
@@ -1,4 +1,9 @@
-/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com )
+/* https://bitbucket.org/jpommier/pffft/raw/483453d8f7661058e74aa4e7cf5c27bcd7887e7a/pffft.h
+ * with minor changes for libsoxr. */
+
+#if !defined PFFT_MACROS_ONLY
+
+/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
authored by Dr Paul Swarztrauber of NCAR, in 1985.
@@ -60,8 +65,9 @@
- 1D transforms only, with 32-bit single precision.
- supports only transforms for inputs of length N of the form
- N=(2^a)*(3^b), a >= 5 and b >=0 (32, 48, 64, 96, 128, 144 etc
- are all acceptable lengths). Performance is best for 128<=N<=8192.
+ N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
+ 144, 160, etc are all acceptable lengths). Performance is best for
+ 128<=N<=8192.
- all (float*) pointers in the functions below are expected to
have an "simd-compatible" alignment, that is 16 bytes on x86 and
@@ -80,6 +86,10 @@
#ifdef __cplusplus
extern "C" {
+#endif
+
+#if PFFFT_DOUBLE
+#define float double
#endif
/* opaque struct holding internal stuff (precomputed twiddle factors)
@@ -99,8 +109,10 @@ extern "C" {
PFFFT_Setup structure is read-only so it can safely be shared by
multiple concurrent threads.
*/
- static PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
- static void pffft_destroy_setup(PFFFT_Setup *);
+ static
+ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
+ static
+ void pffft_destroy_setup(PFFFT_Setup *);
/*
Perform a Fourier transform , The z-domain data is stored in the
most efficient order for transforming it back, or using it for
@@ -113,13 +125,14 @@ extern "C" {
Typically you will want to scale the backward transform by 1/N.
The 'work' pointer should point to an area of N (2*N for complex
- fft) floats, properly aligned. [del]If 'work' is NULL, then stack will
- be used instead (this is probably the beest strategy for small
- FFTs, say for N < 16384).[/del]
+ fft) floats, properly aligned. If 'work' is NULL, then stack will
+ be used instead (this is probably the best strategy for small
+ FFTs, say for N < 16384).
input and output may alias.
*/
- static void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+ static
+ void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/*
Similar to pffft_transform, but makes sure that the output is
@@ -128,7 +141,8 @@ extern "C" {
input and output may alias.
*/
- static void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+ static
+ void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/*
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
@@ -142,7 +156,8 @@ extern "C" {
input and output should not alias.
*/
- static void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
+ static
+ void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
/*
Perform a multiplication of the frequency components of dft_a and
@@ -155,23 +170,28 @@ extern "C" {
the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
- void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
*/
+ void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
/*
- the operation performed is: dft_ab = (dft_a * fdt_b)
-
- The dft_a, dft_b and dft_ab pointers may alias.
+ the float buffers must have the correct alignment (16-byte boundary
+ on intel and powerpc). This function may be used to obtain such
+ correctly aligned buffers.
*/
- static void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab);
+#if 0
+ void *pffft_aligned_malloc(size_t nb_bytes);
+ void pffft_aligned_free(void *);
/* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
- int pffft_simd_size(void);
+ int pffft_simd_size();
+#endif
- static void pffft_reorder_back(int length, void * setup, float * data, float * work);
+#undef float
#ifdef __cplusplus
}
#endif
#endif
+
+#endif
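A quick way to see which lengths satisfy the constraints documented above:
with SIMD enabled (SIMD_SZ == 4), pffft_new_setup() asserts that N is a
multiple of 32 for real transforms (16 for complex) and then verifies that
N/SIMD_SZ factors completely into 2s, 3s and 5s. A hypothetical checker, not
part of the library:

    static int pffft_length_is_usable(int n, int is_real)
    {
      int m;
      if (n <= 0 || n % (is_real ? 32 : 16)) return 0;
      for (m = n / 4; m % 2 == 0; m /= 2) ;  /* strip factors of 2 */
      while (m % 3 == 0) m /= 3;             /* ...of 3 */
      while (m % 5 == 0) m /= 5;             /* ...of 5 */
      return m == 1;
    }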
diff --git a/soxr/src/pffft32.c b/soxr/src/pffft32.c
index 21bd845..c4c8e0a 100644
--- a/soxr/src/pffft32.c
+++ b/soxr/src/pffft32.c
@@ -1,11 +1,14 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-#define _soxr_simd_aligned_free free
-#define _soxr_simd_aligned_malloc malloc
+#define SIMD_ALIGNED_FREE free
+#define SIMD_ALIGNED_MALLOC malloc
#define PFFFT_SIMD_DISABLE
-#include "pffft.c"
+#define PFFFT_DOUBLE 0
+#include "pffft-wrap.c"
+
#include "filter.h"
+#include "rdft_t.h"
static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);}
static void delete_setup(void * setup) {pffft_destroy_setup(setup);}
@@ -15,18 +18,22 @@ static void backward (int length, void * setup, float * H, float * scratch) {pff
static void obackward(int length, void * setup, float * H, float * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
static void convolve(int length, void * setup, float * H, float const * with) { pffft_zconvolve(setup, H, with, H); (void)length;}
static int multiplier(void) {return 1;}
+static int flags(void) {return RDFT_NEEDS_SCRATCH;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32_cb[] = {
- (fn_t)setup,
- (fn_t)setup,
- (fn_t)delete_setup,
- (fn_t)forward,
- (fn_t)oforward,
- (fn_t)backward,
- (fn_t)obackward,
- (fn_t)convolve,
- (fn_t)_soxr_ordered_partial_convolve_f,
- (fn_t)multiplier,
- (fn_t)pffft_reorder_back,
+rdft_cb_table _soxr_rdft32_cb = {
+ setup,
+ setup,
+ delete_setup,
+ forward,
+ oforward,
+ backward,
+ obackward,
+ convolve,
+ _soxr_ordered_partial_convolve_f,
+ multiplier,
+ pffft_reorder_back,
+ malloc,
+ calloc,
+ free,
+ flags,
};
diff --git a/soxr/src/pffft32s.c b/soxr/src/pffft32s.c
index d049990..06f8fd5 100644
--- a/soxr/src/pffft32s.c
+++ b/soxr/src/pffft32s.c
@@ -1,27 +1,34 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-#include "pffft.c"
+#define PFFFT_DOUBLE 0
+#include "pffft-wrap.c"
+
+#include "rdft_t.h"
static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);}
static void forward (int length, void * setup, float * h, float * scratch) {pffft_transform (setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
static void oforward (int length, void * setup, float * h, float * scratch) {pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
static void backward (int length, void * setup, float * H, float * scratch) {pffft_transform (setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
static void obackward(int length, void * setup, float * H, float * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
-static void convolve(int length, void * setup, float * H, float const * with) { pffft_zconvolve(setup, H, with, H); (void)length;}
+static void convolve(int length, void * setup, float * H, float const * with) {pffft_zconvolve(setup, H, with, H); (void)length;}
static int multiplier(void) {return 1;}
+static int flags(void) {return RDFT_IS_SIMD | RDFT_NEEDS_SCRATCH;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32s_cb[] = {
- (fn_t)setup,
- (fn_t)setup,
- (fn_t)pffft_destroy_setup,
- (fn_t)forward,
- (fn_t)oforward,
- (fn_t)backward,
- (fn_t)obackward,
- (fn_t)convolve,
- (fn_t)_soxr_ordered_partial_convolve_simd,
- (fn_t)multiplier,
- (fn_t)pffft_reorder_back,
+rdft_cb_table _soxr_rdft32s_cb = {
+ setup,
+ setup,
+ pffft_destroy_setup,
+ forward,
+ oforward,
+ backward,
+ obackward,
+ convolve,
+ ORDERED_PARTIAL_CONVOLVE_SIMD,
+ multiplier,
+ pffft_reorder_back,
+ SIMD_ALIGNED_MALLOC,
+ SIMD_ALIGNED_CALLOC,
+ SIMD_ALIGNED_FREE,
+ flags,
};
diff --git a/soxr/src/pffft64s.c b/soxr/src/pffft64s.c
new file mode 100644
index 0000000..82f6504
--- /dev/null
+++ b/soxr/src/pffft64s.c
@@ -0,0 +1,34 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define PFFFT_DOUBLE 1
+#include "pffft-wrap.c"
+
+#include "rdft_t.h"
+
+static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);}
+static void forward (int length, void * setup, double * h, double * scratch) {pffft_transform (setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
+static void oforward (int length, void * setup, double * h, double * scratch) {pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
+static void backward (int length, void * setup, double * H, double * scratch) {pffft_transform (setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
+static void obackward(int length, void * setup, double * H, double * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
+static void convolve(int length, void * setup, double * H, double const * with) {pffft_zconvolve(setup, H, with, H); (void)length;}
+static int multiplier(void) {return 1;}
+static int flags(void) {return RDFT_IS_SIMD | RDFT_NEEDS_SCRATCH;}
+
+rdft_cb_table _soxr_rdft64s_cb = {
+ setup,
+ setup,
+ pffft_destroy_setup,
+ forward,
+ oforward,
+ backward,
+ obackward,
+ convolve,
+ ORDERED_PARTIAL_CONVOLVE_SIMD,
+ multiplier,
+ pffft_reorder_back,
+ SIMD_ALIGNED_MALLOC,
+ SIMD_ALIGNED_CALLOC,
+ SIMD_ALIGNED_FREE,
+ flags,
+};
diff --git a/soxr/src/poly-fir.h b/soxr/src/poly-fir.h
index f7b4261..d138e03 100644
--- a/soxr/src/poly-fir.h
+++ b/soxr/src/poly-fir.h
@@ -1,97 +1,149 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-/* Resample using an interpolated poly-phase FIR with length LEN.*/
-/* Input must be followed by LEN-1 samples. */
+/* Resample using an interpolated poly-phase FIR with length LEN. */
+/* Input must be followed by FIR_LENGTH-1 samples. */
-#define a (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 0,j))
-#define b (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 1,j))
-#define c (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 2,j))
-#define d (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 3,j))
-#if COEF_INTERP == 0
- #define _ sum += a *in[j], ++j;
-#elif COEF_INTERP == 1
- #define _ sum += (b *x + a)*in[j], ++j;
-#elif COEF_INTERP == 2
- #define _ sum += ((c *x + b)*x + a)*in[j], ++j;
-#elif COEF_INTERP == 3
- #define _ sum += (((d*x + c)*x + b)*x + a)*in[j], ++j;
-#else
+#if COEF_INTERP != 1 && COEF_INTERP != 2 && COEF_INTERP != 3
#error COEF_INTERP
#endif
+#if SIMD_AVX || SIMD_SSE || SIMD_NEON
+ #define N (FIR_LENGTH>>2)
+
+ #if COEF_INTERP == 1
+ #define _ sum=vMac(vMac(b,X,a),vLdu(in+j*4),sum), ++j;
+ #elif COEF_INTERP == 2
+ #define _ sum=vMac(vMac(vMac(c,X,b),X,a),vLdu(in+j*4),sum), ++j;
+ #else
+ #define _ sum=vMac(vMac(vMac(vMac(d,X,c),X,b),X,a),vLdu(in+j*4),sum), ++j;
+ #endif
+
+ #define a coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-0)]
+ #define b coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-1)]
+ #define c coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-2)]
+ #define d coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-3)]
+
+ #define BEGINNING v4_t X = vLds(x), sum = vZero(); \
+ v4_t const * const __restrict coefs = (v4_t *)COEFS
+ #define END vStorSum(output+i, sum)
+ #define cc(n) case n: core(n); break
+ #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);}
+#else
+ #define N FIR_LENGTH
+
+ #if COEF_INTERP == 1
+ #define _ sum += (b*x + a)*in[j], ++j;
+ #elif COEF_INTERP == 2
+ #define _ sum += ((c*x + b)*x + a)*in[j], ++j;
+ #else
+ #define _ sum += (((d*x + c)*x + b)*x + a)*in[j], ++j;
+ #endif
+
+ #define a (coef(COEFS, COEF_INTERP, N, phase, 0,j))
+ #define b (coef(COEFS, COEF_INTERP, N, phase, 1,j))
+ #define c (coef(COEFS, COEF_INTERP, N, phase, 2,j))
+ #define d (coef(COEFS, COEF_INTERP, N, phase, 3,j))
+
+ #define BEGINNING sample_t sum = 0
+ #define END output[i] = sum
+ #define CORE(n) core(n)
+#endif
+
+
+
+#define floatPrecCore(n) { \
+ float_step_t at = p->at.flt; \
+ for (i = 0; (int)at < num_in; ++i, at += p->step.flt) { \
+ sample_t const * const __restrict in = input + (int)at; \
+ float_step_t frac = at - (int)at; \
+ int phase = (int)(frac * (1 << PHASE_BITS)); \
+ sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase); \
+ int j = 0; \
+ BEGINNING; CONVOLVE(n); END; \
+ } \
+ fifo_read(&p->fifo, (int)at, NULL); \
+ p->at.flt = at - (int)at; } /* Could round to 1 in some cirmcumstances. */
+
+
+
+#define highPrecCore(n) { \
+ step_t at; at.fix = p->at.fix; \
+ for (i = 0; at.integer < num_in; ++i, \
+ at.fix.ls.all += p->step.fix.ls.all, \
+ at.whole += p->step.whole + (at.fix.ls.all < p->step.fix.ls.all)) { \
+ sample_t const * const __restrict in = input + at.integer; \
+ uint32_t frac = at.fraction; \
+ int phase = (int)(frac >> (32 - PHASE_BITS)); /* High-order bits */ \
+ /* Low-order bits, scaled to [0,1): */ \
+ sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \
+ int j = 0; \
+ BEGINNING; CONVOLVE(n); END; \
+ } \
+ fifo_read(&p->fifo, at.integer, NULL); \
+ p->at.whole = at.fraction; \
+ p->at.fix.ls = at.fix.ls; }
+
+
+
+#define stdPrecCore(n) { \
+ int64p_t at; at.all = p->at.whole; \
+ for (i = 0; at.parts.ms < num_in; ++i, at.all += p->step.whole) { \
+ sample_t const * const __restrict in = input + at.parts.ms; \
+ uint32_t const frac = at.parts.ls; \
+ int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ \
+ /* Low-order bits, scaled to [0,1): */ \
+ sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \
+ int j = 0; \
+ BEGINNING; CONVOLVE(n); END; \
+ } \
+ fifo_read(&p->fifo, at.parts.ms, NULL); \
+ p->at.whole = at.parts.ls; }
+
+
+
+#if WITH_FLOAT_STD_PREC_CLOCK
+ #define SPCORE floatPrecCore
+#else
+ #define SPCORE stdPrecCore
+#endif
+
+
+
+#if WITH_HI_PREC_CLOCK
+ #define core(n) if (p->use_hi_prec_clock) highPrecCore(n) else SPCORE(n)
+#else
+ #define core(n) SPCORE(n)
+#endif
+
+
+
static void FUNCTION(stage_t * p, fifo_t * output_fifo)
{
sample_t const * input = stage_read_p(p);
- int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
- sample_t * output = fifo_reserve(output_fifo, max_num_out);
+ int num_in = min(stage_occupancy(p), p->input_size);
+ int i, max_num_out = 1 + (int)(num_in * p->out_in_ratio);
+ sample_t * const __restrict output = fifo_reserve(output_fifo, max_num_out);
-#if defined HI_PREC_CLOCK
-#if FLOAT_HI_PREC_CLOCK
- if (p->use_hi_prec_clock) {
- float_step_t at = p->at.flt;
- for (i = 0; (int)at < num_in; ++i, at += p->step.flt) {
- sample_t const * in = input + (int)at;
- float_step_t frac = at - (int)at;
- int phase = (int)(frac * (1 << PHASE_BITS));
-#if COEF_INTERP > 0
- sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase);
-#endif
- sample_t sum = 0;
- int j = 0;
- CONVOLVE
- output[i] = sum;
- }
- fifo_read(&p->fifo, (int)at, NULL);
- p->at.flt = at - (int)at;
- } else
-#else
- if (p->use_hi_prec_clock) {
- for (i = 0; p->at.integer < num_in; ++i,
- p->at.fix.ls.all += p->step.fix.ls.all,
- p->at.whole += p->step.whole + (p->at.fix.ls.all < p->step.fix.ls.all)) {
- sample_t const * in = input + p->at.integer;
- uint32_t frac = p->at.fraction;
- int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */
-#if COEF_INTERP > 0 /* low-order bits, scaled to [0,1) */
- sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32));
-#endif
- sample_t sum = 0;
- int j = 0;
- CONVOLVE
- output[i] = sum;
- }
- fifo_read(&p->fifo, p->at.integer, NULL);
- p->at.integer = 0;
- } else
-#endif
-#endif
- {
- for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
- sample_t const * in = input + p->at.integer;
- uint32_t frac = p->at.fraction;
- int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */
-#if COEF_INTERP > 0 /* low-order bits, scaled to [0,1) */
- sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32));
-#endif
- sample_t sum = 0;
- int j = 0;
- CONVOLVE
- output[i] = sum;
- }
- fifo_read(&p->fifo, p->at.integer, NULL);
- p->at.integer = 0;
- }
+ CORE(N);
assert(max_num_out - i >= 0);
fifo_trim_by(output_fifo, max_num_out - i);
}
+
+
#undef _
#undef a
#undef b
#undef c
#undef d
+#undef CORE
+#undef cc
+#undef core
#undef COEF_INTERP
+#undef N
+#undef BEGINNING
+#undef END
#undef CONVOLVE
#undef FIR_LENGTH
#undef FUNCTION
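The three *PrecCore variants above differ only in how they advance the input
position and split it into an integer sample index, a polyphase index and an
interpolation fraction. A sketch of the split stdPrecCore performs on the
32-bit fractional part (the PHASE_BITS value here is illustrative; MULT32 is
2^32, as defined in rate.h):

    #include <stdint.h>

    #define PHASE_BITS 10            /* illustrative; chosen per filter */
    #define MULT32 (65536. * 65536.) /* 2^32 */

    /* Split a Q32 fraction into FIR phase + sub-phase x in [0,1). */
    static void split_position(uint32_t frac, int *phase, double *x)
    {
      *phase = (int)(frac >> (32 - PHASE_BITS));        /* high-order bits */
      *x = (double)(frac << PHASE_BITS) * (1 / MULT32); /* remaining bits  */
    }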
diff --git a/soxr/src/poly-fir0.h b/soxr/src/poly-fir0.h
index 52d85b3..76fca2d 100644
--- a/soxr/src/poly-fir0.h
+++ b/soxr/src/poly-fir0.h
@@ -1,32 +1,56 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-/* Resample using a non-interpolated poly-phase FIR with length LEN.*/
-/* Input must be followed by LEN-1 samples. */
+/* Resample using a non-interpolated poly-phase FIR with length LEN. */
+/* Input must be followed by FIR_LENGTH-1 samples. */
-#define _ sum += (coef(p->shared->poly_fir_coefs, 0, FIR_LENGTH, rem, 0, j)) *at[j], ++j;
+#if SIMD_AVX || SIMD_SSE || SIMD_NEON
+ #define N (FIR_LENGTH>>2)
+ #define BEGINNING v4_t sum = vZero(); \
+ v4_t const * const __restrict coefs = (v4_t *)COEFS + N * rem;
+ #define _ sum = vMac(vLdu(at+j*4), coefs[j], sum), ++j;
+ #define END vStorSum(output+i, sum)
+ #define cc(n) case n: core(n); break
+ #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);}
+#else
+ #define N FIR_LENGTH
+ #define BEGINNING sample_t sum = 0; \
+ sample_t const * const __restrict coefs = (sample_t *)COEFS + N * rem;
+ #define _ sum += coefs[j]*at[j], ++j;
+ #define END output[i] = sum
+ #define CORE(n) core(n)
+#endif
+
+#define core(n) \
+ for (i = 0; at < num_in * p->L; ++i, at += step) { \
+ int const div = at / p->L, rem = at % p->L; \
+ sample_t const * const __restrict at = input + div; \
+ int j = 0; BEGINNING; CONVOLVE(n); END;}
static void FUNCTION(stage_t * p, fifo_t * output_fifo)
{
- sample_t const * input = stage_read_p(p);
- int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
- sample_t * output = fifo_reserve(output_fifo, max_num_out);
+ int num_in = min(stage_occupancy(p), p->input_size);
+ if (num_in) {
+ sample_t const * input = stage_read_p(p);
+ int at = p->at.integer, step = p->step.integer;
+ int i, num_out = (num_in * p->L - at + step - 1) / step;
+ sample_t * __restrict output = fifo_reserve(output_fifo, num_out);
- for (i = 0; p->at.integer < num_in * p->L; ++i, p->at.integer += p->step.integer) {
- int div = p->at.integer / p->L, rem = p->at.integer % p->L;
- sample_t const * at = input + div;
- sample_t sum = 0;
- int j = 0;
- CONVOLVE
- output[i] = sum;
+ CORE(N);
+ assert(i == num_out);
+ fifo_read(&p->fifo, at / p->L, NULL);
+ p->at.integer = at % p->L;
}
- assert(max_num_out - i >= 0);
- fifo_trim_by(output_fifo, max_num_out - i);
- fifo_read(&p->fifo, p->at.integer / p->L, NULL);
- p->at.integer = p->at.integer % p->L;
}
#undef _
+#undef CORE
+#undef cc
+#undef core
+#undef N
+#undef BEGINNING
+#undef MIDDLE
+#undef END
#undef CONVOLVE
#undef FIR_LENGTH
#undef FUNCTION
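In the rewritten core() above, the position at runs on the L-times upsampled
grid, so integer division and remainder recover the input sample index and
the coefficient phase. A small worked illustration with made-up values:

    /* L = 3, step = M = 2 (a 3:2 rate change), illustration only:
       output i taps input from at/L using coefficient phase at%L. */
    int L = 3, step = 2, at, i;
    for (i = 0, at = 0; i < 5; ++i, at += step) {
      int div = at / L, rem = at % L;
      /* i=0 -> in[0] phase 0; i=1 -> in[0] phase 2; i=2 -> in[1] phase 1 */
      (void)div; (void)rem;
    }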
diff --git a/soxr/src/rate.h b/soxr/src/rate.h
deleted file mode 100644
index f6d055a..0000000
--- a/soxr/src/rate.h
+++ /dev/null
@@ -1,726 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-14 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#include <assert.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "filter.h"
-
-#if defined SOXR_LIB
-#include "internal.h"
-
-typedef void (* fn_t)(void);
-extern fn_t RDFT_CB[11];
-
-#define rdft_forward_setup (*(void * (*)(int))RDFT_CB[0])
-#define rdft_backward_setup (*(void * (*)(int))RDFT_CB[1])
-#define rdft_delete_setup (*(void (*)(void *))RDFT_CB[2])
-#define rdft_forward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[3])
-#define rdft_oforward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[4])
-#define rdft_backward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[5])
-#define rdft_obackward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[6])
-#define rdft_convolve (*(void (*)(int, void *, sample_t *, sample_t const *))RDFT_CB[7])
-#define rdft_convolve_portion (*(void (*)(int, sample_t *, sample_t const *))RDFT_CB[8])
-#define rdft_multiplier (*(int (*)(void))RDFT_CB[9])
-#define rdft_reorder_back (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[10])
-
-#endif
-
-#if RATE_SIMD /* Align for SIMD: */
- #include "simd.h"
-#if 0 /* Not using this yet. */
- #define RATE_SIMD_POLY 1
- #define num_coefs4 ((num_coefs + 3) & ~3)
- #define coefs4_check(i) ((i) < num_coefs)
-#else
- #define RATE_SIMD_POLY 0
- #define num_coefs4 num_coefs
- #define coefs4_check(i) 1
-#endif
-
- #define aligned_free _soxr_simd_aligned_free
- #define aligned_malloc _soxr_simd_aligned_malloc
- #define aligned_calloc _soxr_simd_aligned_calloc
-#if 0
- #define FIFO_REALLOC aligned_realloc
- #define FIFO_MALLOC aligned_malloc
- #define FIFO_FREE aligned_free
-
- static void * aligned_realloc(void * q, size_t nb_bytes, size_t copy_bytes) {
- void * p = aligned_malloc(nb_bytes);
- if (p) memcpy(p, q, copy_bytes);
- aligned_free(q);
- return p;
- }
-#endif
-#else
- #define RATE_SIMD_POLY 0
- #define num_coefs4 num_coefs
- #define coefs4_check(i) 1
-
- #define aligned_free free
- #define aligned_malloc malloc
- #define aligned_calloc calloc
-#endif
-
-#define FIFO_SIZE_T int
-#include "fifo.h"
-
-typedef union { /* Int64 in parts */
- #if WORDS_BIGENDIAN
- struct {int32_t ms; uint32_t ls;} parts;
- #else
- struct {uint32_t ls; int32_t ms;} parts;
- #endif
- int64_t all;
-} int64p_t;
-
-typedef union { /* Uint64 in parts */
- #if WORDS_BIGENDIAN
- struct {uint32_t ms, ls;} parts;
- #else
- struct {uint32_t ls, ms;} parts;
- #endif
- uint64_t all;
-} uint64p_t;
-
-#define FLOAT_HI_PREC_CLOCK 0 /* Non-float hi-prec has ~96 bits. */
-#define float_step_t long double /* __float128 is also a (slow) option */
-
-#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) coef_p[(fir_len) * ((interp_order) + 1) * (phase_num) + ((interp_order) + 1) * (fir_coef_num) + (interp_order - coef_interp_num)]
-
-#define raw_coef_t double
-
-static sample_t * prepare_coefs(raw_coef_t const * coefs, int num_coefs,
- int num_phases, int interp_order, double multiplier)
-{
- int i, j, length = num_coefs4 * num_phases;
- sample_t * result = malloc((size_t)(length * (interp_order + 1)) * sizeof(*result));
- double fm1 = coefs[0], f1 = 0, f2 = 0;
-
- for (i = num_coefs4 - 1; i >= 0; --i)
- for (j = num_phases - 1; j >= 0; --j) {
- double f0 = fm1, b = 0, c = 0, d = 0; /* = 0 to kill compiler warning */
- int pos = i * num_phases + j - 1;
- fm1 = coefs4_check(i) && pos > 0 ? coefs[pos - 1] * multiplier : 0;
- switch (interp_order) {
- case 1: b = f1 - f0; break;
- case 2: b = f1 - (.5 * (f2+f0) - f1) - f0; c = .5 * (f2+f0) - f1; break;
- case 3: c=.5*(f1+fm1)-f0;d=(1/6.)*(f2-f1+fm1-f0-4*c);b=f1-f0-d-c; break;
- default: if (interp_order) assert(0);
- }
- #define coef_coef(x) \
- coef(result, interp_order, num_coefs4, j, x, num_coefs4 - 1 - i)
- coef_coef(0) = (sample_t)f0;
- if (interp_order > 0) coef_coef(1) = (sample_t)b;
- if (interp_order > 1) coef_coef(2) = (sample_t)c;
- if (interp_order > 2) coef_coef(3) = (sample_t)d;
- #undef coef_coef
- f2 = f1, f1 = f0;
- }
- return result;
-}
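The coef() macro above addresses a flat array laid out per phase, then per
FIR tap, with the interp_order+1 polynomial terms of each tap stored
highest-order first; prepare_coefs() fills it in exactly that order. For
example, with interp_order == 3:

    /* coef(p, 3, fir_len, phase, term, tap)
         == p[fir_len*4*phase + 4*tap + (3 - term)]
       so one tap's cubic terms land as {d, c, b, a}: term 3 (d) at
       offset 0 and term 0 (a, the constant) at offset 3, matching the
       coef_coef(0..3) stores in prepare_coefs(). */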
-
-typedef struct {
- int dft_length, num_taps, post_peak;
- void * dft_forward_setup, * dft_backward_setup;
- sample_t * coefs;
-} dft_filter_t;
-
-typedef struct { /* So generated filter coefs may be shared between channels */
- sample_t * poly_fir_coefs;
- dft_filter_t dft_filter[2];
-} rate_shared_t;
-
-typedef enum {
- irrational_stage = 1,
- cubic_stage,
- dft_stage,
- half_stage,
- rational_stage
-} stage_type_t;
-
-struct stage;
-typedef void (* stage_fn_t)(struct stage * input, fifo_t * output);
-#define MULT32 (65536. * 65536.)
-
-typedef union { /* Fixed point arithmetic */
- struct {uint64p_t ls; int64p_t ms;} fix;
- float_step_t flt;
-} step_t;
-
-typedef struct stage {
- /* Common to all stage types: */
- stage_type_t type;
- stage_fn_t fn;
- fifo_t fifo;
- int pre; /* Number of past samples to store */
- int pre_post; /* pre + number of future samples to store */
- int preload; /* Number of zero samples to pre-load the fifo */
- double out_in_ratio; /* For buffer management. */
-
- /* For a stage with variable (run-time generated) filter coefs: */
- rate_shared_t * shared;
- unsigned dft_filter_num; /* Which, if any, of the 2 DFT filters to use */
- sample_t * dft_scratch, * dft_out;
-
- /* For a stage with variable L/M: */
- step_t at, step;
- bool use_hi_prec_clock;
- int L, remM;
- int n, phase_bits, block_len;
- double mult, phase0;
-} stage_t;
-
-#define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
-#define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
-
-static void cubic_stage_fn(stage_t * p, fifo_t * output_fifo)
-{
- int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
- sample_t const * input = stage_read_p(p);
- sample_t * output = fifo_reserve(output_fifo, max_num_out);
-
-#define integer fix.ms.parts.ms
-#define fraction fix.ms.parts.ls
-#define whole fix.ms.all
- for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
- sample_t const * s = input + p->at.integer;
- double x = p->at.fraction * (1 / MULT32);
- double b = .5*(s[1]+s[-1])-*s, a = (1/6.)*(s[2]-s[1]+s[-1]-*s-4*b);
- double c = s[1]-*s-a-b;
- output[i] = (sample_t)(p->mult * (((a*x + b)*x + c)*x + *s));
- }
- assert(max_num_out - i >= 0);
- fifo_trim_by(output_fifo, max_num_out - i);
- fifo_read(&p->fifo, p->at.integer, NULL);
- p->at.integer = 0;
-}
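A quick check that the cubic above really interpolates: with
c = s[1] - s[0] - a - b, the polynomial P(x) = ((a*x + b)*x + c)*x + s[0]
gives P(0) = s[0] and P(1) = a + b + c + s[0] = s[1], so the curve passes
through the two samples it straddles. Sketch of the per-sample arithmetic,
for illustration only:

    static double cubic_ref(double const *s, double x) /* s points at s[0] */
    {
      double b = .5*(s[1] + s[-1]) - s[0];
      double a = (1/6.)*(s[2] - s[1] + s[-1] - s[0] - 4*b);
      double c = s[1] - s[0] - a - b;
      return ((a*x + b)*x + c)*x + s[0];
    }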
-
-#if RATE_SIMD
- #define dft_out p->dft_out
-#else
- #define dft_out output
-#endif
-
-static void dft_stage_fn(stage_t * p, fifo_t * output_fifo)
-{
- sample_t * output;
- int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
- rate_shared_t const * s = p->shared;
- dft_filter_t const * f = &s->dft_filter[p->dft_filter_num];
- int const overlap = f->num_taps - 1;
-
- while (p->at.integer + p->L * num_in >= f->dft_length) {
- div_t divd = div(f->dft_length - overlap - p->at.integer + p->L - 1, p->L);
- sample_t const * input = fifo_read_ptr(&p->fifo);
- fifo_read(&p->fifo, divd.quot, NULL);
- num_in -= divd.quot;
-
- output = fifo_reserve(output_fifo, f->dft_length);
-
- if (lsx_is_power_of_2(p->L)) { /* F-domain */
- int portion = f->dft_length / p->L;
- memcpy(dft_out, input, (unsigned)portion * sizeof(*dft_out));
- rdft_oforward(portion, f->dft_forward_setup, dft_out, p->dft_scratch);
- for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
- dft_out[i] = dft_out[(portion << 1) - i],
- dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
- dft_out[portion] = dft_out[1];
- dft_out[portion + 1] = 0;
- dft_out[1] = dft_out[0];
-
- for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) {
- memcpy(dft_out + i, dft_out, (size_t)portion * sizeof(*dft_out));
- dft_out[i + 1] = 0;
- }
- if (p->step.integer > 0)
- rdft_reorder_back(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
- } else {
- if (p->L == 1)
- memcpy(dft_out, input, (size_t)f->dft_length * sizeof(*dft_out));
- else {
- memset(dft_out, 0, (size_t)f->dft_length * sizeof(*dft_out));
- for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
- dft_out[i] = input[j];
- p->at.integer = p->L - 1 - divd.rem;
- }
- if (p->step.integer > 0)
- rdft_forward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
- else
- rdft_oforward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
- }
-
- if (p->step.integer > 0) {
- rdft_convolve(f->dft_length, f->dft_backward_setup, dft_out, f->coefs);
- rdft_backward(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
-#if RATE_SIMD
- if (p->step.integer == 1)
- memcpy(output, dft_out, (size_t)f->dft_length * sizeof(sample_t));
-#endif
- if (p->step.integer != 1) {
- for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
- i += p->step.integer)
- output[j] = dft_out[i];
- p->remM = i - (f->dft_length - overlap);
- fifo_trim_by(output_fifo, f->dft_length - j);
- }
- else fifo_trim_by(output_fifo, overlap);
- }
- else { /* F-domain */
- int m = -p->step.integer;
- rdft_convolve_portion(f->dft_length >> m, dft_out, f->coefs);
- rdft_obackward(f->dft_length >> m, f->dft_backward_setup, dft_out, p->dft_scratch);
-#if RATE_SIMD
- memcpy(output, dft_out, (size_t)(f->dft_length >> m) * sizeof(sample_t));
-#endif
- fifo_trim_by(output_fifo, (((1 << m) - 1) * f->dft_length + overlap) >>m);
- }
- }
-}
-
-#undef dft_out
-
-/* Set to 4 x nearest power of 2 */
-/* or half of that if danger of causing too many cache misses. */
-static int set_dft_length(int num_taps, int min, int large)
-{
- double d = log((double)num_taps) / log(2.);
- return 1 << range_limit((int)(d + 2.77), min, max((int)(d + 1.77), large));
-}
-
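A worked example of the sizing rule in set_dft_length(): for num_taps = 1000,
d = log2(1000) is about 9.97, so the unclamped result is
1 << (int)(9.97 + 2.77) = 1 << 12 = 4096, roughly 4 x 1024 (the nearest
power of 2); the 'large' argument can lower the exponent floor to
(int)(9.97 + 1.77) = 11, giving 2048 when the larger size would cause too
many cache misses. (Illustrative arithmetic only.)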
-static void dft_stage_init(
- unsigned instance, double Fp, double Fs, double Fn, double att,
- double phase, stage_t * p, int L, int M, double * multiplier,
- int min_dft_size, int large_dft_size)
-{
- dft_filter_t * f = &p->shared->dft_filter[instance];
- int num_taps = 0, dft_length = f->dft_length, i;
- bool f_domain_m = abs(3-M) == 1 && Fs <= 1;
-
- if (!dft_length) {
- int k = phase == 50 && lsx_is_power_of_2(L) && Fn == L? L << 1 : 4;
- double * h = lsx_design_lpf(Fp, Fs, Fn, att, &num_taps, -k, -1.);
-
- if (phase != 50)
- lsx_fir_to_phase(&h, &num_taps, &f->post_peak, phase);
- else f->post_peak = num_taps / 2;
-
- dft_length = set_dft_length(num_taps, min_dft_size, large_dft_size);
- f->coefs = aligned_calloc((size_t)dft_length, sizeof(*f->coefs));
- for (i = 0; i < num_taps; ++i)
- f->coefs[(i + dft_length - num_taps + 1) & (dft_length - 1)]
- = (sample_t)(h[i] * ((1. / dft_length) * rdft_multiplier() * L * *multiplier));
- free(h);
- }
-
-#if RATE_SIMD
- p->dft_out = aligned_malloc(sizeof(sample_t) * (size_t)dft_length);
-#endif
-#if 1 /* In fact, currently, only pffft needs this. */
- p->dft_scratch = aligned_malloc(2 * sizeof(sample_t) * (size_t)dft_length);
-#endif
-
- if (!f->dft_length) {
- void * coef_setup = rdft_forward_setup(dft_length);
- int Lp = lsx_is_power_of_2(L)? L : 1;
- int Mp = f_domain_m? M : 1;
- f->dft_forward_setup = rdft_forward_setup(dft_length / Lp);
- f->dft_backward_setup = rdft_backward_setup(dft_length / Mp);
- if (Mp == 1)
- rdft_forward(dft_length, coef_setup, f->coefs, p->dft_scratch);
- else
- rdft_oforward(dft_length, coef_setup, f->coefs, p->dft_scratch);
- rdft_delete_setup(coef_setup);
- f->num_taps = num_taps;
- f->dft_length = dft_length;
- lsx_debug("fir_len=%i dft_length=%i Fp=%g Fs=%g Fn=%g att=%g %i/%i",
- num_taps, dft_length, Fp, Fs, Fn, att, L, M);
- }
- *multiplier = 1;
- p->out_in_ratio = (double)L / M;
- p->type = dft_stage;
- p->fn = dft_stage_fn;
- p->preload = f->post_peak / L;
- p->at.integer = f->post_peak % L;
- p->L = L;
- p->step.integer = f_domain_m? -M/2 : M;
- p->dft_filter_num = instance;
- p->block_len = f->dft_length - (f->num_taps - 1);
- p->phase0 = p->at.integer / p->L;
-}
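
[editor's note] dft_stage_fn above is an overlap-save FFT convolver: of every dft_length-point transform, num_taps - 1 samples are overlap carried between blocks, and only block_len samples are fresh output (this is the p->block_len set in dft_stage_init). A quick illustration of the bookkeeping, reusing the 1000-tap / 4096-point figures from the previous sketch:

    /* Illustrative only: overlap-save throughput per FFT block. */
    #include <stdio.h>

    int main(void)
    {
      int num_taps = 1000, dft_length = 4096;
      int overlap = num_taps - 1;           /* samples re-used next block */
      int block_len = dft_length - overlap; /* fresh output per transform */
      printf("overlap=%i fresh=%i efficiency=%.0f%%\n",
          overlap, block_len, 100. * block_len / dft_length);
      return 0;
    }
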
-
-#include "filters.h"
-
-typedef struct {
- double factor;
- uint64_t samples_in, samples_out;
- int num_stages;
- stage_t * stages;
-} rate_t;
-
-#define pre_stage p->stages[shift]
-#define arb_stage p->stages[shift + have_pre_stage]
-#define post_stage p->stages[shift + have_pre_stage + have_arb_stage]
-#define have_pre_stage (preM * preL != 1)
-#define have_arb_stage (arbM * arbL != 1)
-#define have_post_stage (postM * postL != 1)
-
-#define TO_3dB(a) ((1.6e-6*a-7.5e-4)*a+.646)
-#define LOW_Q_BW0 (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
-
-typedef enum {
- rolloff_none, rolloff_small /* <= 0.01 dB */, rolloff_medium /* <= 0.35 dB */
-} rolloff_t;
-
-
-static char const * rate_init(
- /* Private work areas (to be supplied by the client): */
- rate_t * p, /* Per audio channel. */
- rate_shared_t * shared, /* Between channels (undergoing same rate change)*/
-
- /* Public parameters: Typically */
- double factor, /* Input rate divided by output rate. */
- double bits, /* Required bit-accuracy (pass + stop) 16|20|28 */
- double phase, /* Linear/minimum etc. filter phase. 50 */
- double passband_end, /* 0dB pt. bandwidth to preserve; nyquist=1 0.913*/
- double stopband_begin, /* Aliasing/imaging control; > passband_end 1 */
- rolloff_t rolloff, /* Pass-band roll-off small */
- bool maintain_3dB_pt, /* true */
- double multiplier, /* Linear gain to apply during conversion. 1 */
-
- /* Primarily for test/development purposes: */
- bool use_hi_prec_clock, /* Increase irrational ratio accuracy. false */
- int interpolator, /* Force a particular coef interpolator. -1 */
- size_t max_coefs_size, /* k bytes of coefs to try to keep below. 400 */
- bool noSmallIntOpt, /* Disable small integer optimisations. false */
- int log2_min_dft_size,
- int log2_large_dft_size)
-{
- double att = (bits + 1) * linear_to_dB(2.), attArb = att; /* pass + stop */
- double tbw0 = 1 - passband_end, Fs_a = stopband_begin;
- double arbM = factor, tbw_tighten = 1;
- int n = 0, i, preL = 1, preM = 1, shift = 0, arbL = 1, postL = 1, postM = 1;
- bool upsample = false, rational = false, iOpt = !noSmallIntOpt;
- int mode = rolloff > rolloff_small? factor > 1 || passband_end > LOW_Q_BW0:
- (int)ceil(2 + (bits - 17) / 4);
- stage_t * s;
-
- assert(factor > 0);
- assert(!bits || (15 <= bits && bits <= 33));
- assert(0 <= phase && phase <= 100);
- assert(.53 <= passband_end);
- assert(stopband_begin <= 1.2);
- assert(passband_end + .005 < stopband_begin);
-
- p->factor = factor;
- if (bits) while (!n++) { /* Determine stages: */
- int try, L, M, x, maxL = interpolator > 0? 1 : mode? 2048 :
- (int)ceil((double)max_coefs_size * 1000. / (U100_l * sizeof(sample_t)));
- double d, epsilon = 0, frac;
- upsample = arbM < 1;
- for (i = (int)(arbM * .5), shift = 0; i >>= 1; arbM *= .5, ++shift);
- preM = upsample || (arbM > 1.5 && arbM < 2);
- postM = 1 + (arbM > 1 && preM), arbM /= postM;
- preL = 1 + (!preM && arbM < 2) + (upsample && mode), arbM *= preL;
- if ((frac = arbM - (int)arbM))
- epsilon = fabs((uint32_t)(frac * MULT32 + .5) / (frac * MULT32) - 1);
- for (i = 1, rational = !frac; i <= maxL && !rational; ++i) {
- d = frac * i, try = (int)(d + .5);
- if ((rational = fabs(try / d - 1) <= epsilon)) { /* No long doubles! */
- if (try == i)
- arbM = ceil(arbM), shift += arbM > 2, arbM /= 1 + (arbM > 2);
- else arbM = i * (int)arbM + try, arbL = i;
- }
- }
- L = preL * arbL, M = (int)(arbM * postM), x = (L|M)&1, L >>= !x, M >>= !x;
- if (iOpt && postL == 1 && (d = preL * arbL / arbM) > 4 && d != 5) {
- for (postL = 4, i = (int)(d / 16); (i >>= 1) && postL < 256; postL <<= 1);
- arbM = arbM * postL / arbL / preL, arbL = 1, n = 0;
- } else if (rational && (max(L, M) < 3 + 2 * iOpt || L * M < 6 * iOpt))
- preL = L, preM = M, arbM = arbL = postM = 1;
- if (!mode && (!rational || !n))
- ++mode, n = 0;
- }
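
[editor's note] The stage-determination loop above searches for a rational approximation L/M to the conversion ratio so that cheap integer-ratio stages can do most of the work. A simplified standalone version of that search (hypothetical; the real code also weighs the epsilon tolerance and coefficient-table size): for 44100 Hz to 48000 Hz, factor = 44100/48000 and the exact fraction 147/160 is recovered.

    /* Simplified rational-ratio search (cf. the loop above). */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
      double factor = 44100. / 48000;        /* input rate / output rate */
      int L, M = 0;
      for (L = 1; L <= 2048; ++L) {
        double d = factor * L;
        M = (int)(d + .5);
        if (fabs(M / d - 1) < 1e-12)         /* close enough: rational   */
          break;
      }
      printf("factor=%.6f ~= %i/%i\n", factor, M, L);
      return 0;
    }
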
-
- p->num_stages = shift + have_pre_stage + have_arb_stage + have_post_stage;
- if (!p->num_stages && multiplier != 1) {
- bits = arbL = 0; /* Use cubic_stage in this case. */
- ++p->num_stages;
- }
- p->stages = calloc((size_t)p->num_stages + 1, sizeof(*p->stages));
- for (i = 0; i < p->num_stages; ++i)
- p->stages[i].shared = shared;
-
- if ((n = p->num_stages) > 1) { /* Att. budget: */
- if (have_arb_stage)
- att += linear_to_dB(2.), attArb = att, --n;
- att += linear_to_dB((double)n);
- }
-
- for (n = 0; (size_t)n + 1 < array_length(half_firs) && att > half_firs[n].att; ++n);
- for (i = 0, s = p->stages; i < shift; ++i, ++s) {
- s->type = half_stage;
- s->fn = half_firs[n].fn;
- s->pre_post = 4 * half_firs[n].num_coefs;
- s->preload = s->pre = s->pre_post >> 1;
- }
-
- if (have_pre_stage) {
- if (maintain_3dB_pt && have_post_stage) { /* Trans. bands overlapping. */
- double tbw3 = tbw0 * TO_3dB(att); /* FFS: consider Fs_a. */
- double x = ((2.1429e-4 - 5.2083e-7 * att) * att - .015863) * att + 3.95;
- x = att * pow((tbw0 - tbw3) / (postM / (factor * postL) - 1 + tbw0), x);
- if (x > .035) {
- tbw_tighten = ((4.3074e-3 - 3.9121e-4 * x) * x - .040009) * x + 1.0014;
- lsx_debug("x=%g tbw_tighten=%g", x, tbw_tighten);
- }
- }
- dft_stage_init(0, 1 - tbw0 * tbw_tighten, Fs_a, preM? max(preL, preM) :
- arbM / arbL, att, phase, &pre_stage, preL, max(preM, 1), &multiplier,
- log2_min_dft_size, log2_large_dft_size);
- }
-
- if (!bits && have_arb_stage) { /* `Quick' cubic arb stage: */
- arb_stage.type = cubic_stage;
- arb_stage.fn = cubic_stage_fn;
- arb_stage.mult = multiplier, multiplier = 1;
- arb_stage.step.whole = (int64_t)(arbM * MULT32 + .5);
- arb_stage.pre_post = max(3, arb_stage.step.integer);
- arb_stage.preload = arb_stage.pre = 1;
- arb_stage.out_in_ratio = MULT32 / (double)arb_stage.step.whole;
- }
- else if (have_arb_stage) { /* Higher quality arb stage: */
- poly_fir_t const * f = &poly_firs[6*(upsample + !!preM) + mode - !upsample];
- int order, num_coefs = (int)f->interp[0].scalar, phase_bits, phases;
- size_t coefs_size;
- double x = .5, at, Fp, Fs, Fn, mult = upsample? 1 : arbL / arbM;
- poly_fir1_t const * f1;
-
- Fn = !upsample && preM? x = arbM / arbL : 1;
- Fp = !preM? mult : mode? .5 : 1;
- Fs = 2 - Fp; /* Ignore Fs_a; it would have little benefit here. */
- Fp *= 1 - tbw0;
- if (rolloff > rolloff_small && mode)
- Fp = !preM? mult * .5 - .125 : mult * .05 + .1;
- else if (rolloff == rolloff_small)
- Fp = Fs - (Fs - .148 * x - Fp * .852) * (.00813 * bits + .973);
-
- i = (interpolator < 0? !rational : max(interpolator, !rational)) - 1;
- do {
- f1 = &f->interp[++i];
- assert(f1->fn);
- if (i)
- arbM /= arbL, arbL = 1, rational = false;
- phase_bits = (int)ceil(f1->scalar + log(mult)/log(2.));
- phases = !rational? (1 << phase_bits) : arbL;
- if (!f->interp[0].scalar) {
- int phases0 = max(phases, 19), n0 = 0;
- lsx_design_lpf(Fp, Fs, -Fn, attArb, &n0, phases0, f->beta);
- num_coefs = n0 / phases0 + 1, num_coefs += num_coefs & !preM;
- }
- if ((num_coefs & 1) && rational && (arbL & 1))
- phases <<= 1, arbL <<= 1, arbM *= 2;
- at = arbL * (arb_stage.phase0 = .5 * (num_coefs & 1));
- order = i + (i && mode > 4);
- coefs_size = (size_t)(num_coefs4 * phases * (order + 1)) * sizeof(sample_t);
- } while (interpolator < 0 && i < 2 && f->interp[i+1].fn &&
- coefs_size / 1000 > max_coefs_size);
-
- if (!arb_stage.shared->poly_fir_coefs) {
- int num_taps = num_coefs * phases - 1;
- raw_coef_t * coefs = lsx_design_lpf(
- Fp, Fs, Fn, attArb, &num_taps, phases, f->beta);
- arb_stage.shared->poly_fir_coefs = prepare_coefs(
- coefs, num_coefs, phases, order, multiplier);
- lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%.3gk",
- num_coefs, phases, order, (double)coefs_size / 1000.);
- free(coefs);
- }
- multiplier = 1;
- arb_stage.type = rational? rational_stage : irrational_stage;
- arb_stage.fn = f1->fn;
- arb_stage.pre_post = num_coefs4 - 1;
- arb_stage.preload = ((num_coefs - 1) >> 1) + (num_coefs4 - num_coefs);
- arb_stage.n = num_coefs4;
- arb_stage.phase_bits = phase_bits;
- arb_stage.L = arbL;
- arb_stage.use_hi_prec_clock = mode > 1 && use_hi_prec_clock && !rational;
-#if FLOAT_HI_PREC_CLOCK
- if (arb_stage.use_hi_prec_clock) {
- arb_stage.at.flt = at;
- arb_stage.step.flt = arbM;
- arb_stage.out_in_ratio = (double)(arbL / arb_stage.step.flt);
- } else
-#endif
- {
- arb_stage.at.whole = (int64_t)(at * MULT32 + .5);
-#if !FLOAT_HI_PREC_CLOCK
- if (arb_stage.use_hi_prec_clock) {
- arb_stage.at.fix.ls.parts.ms = 0x80000000ul;
- arbM *= MULT32;
- arb_stage.step.whole = (int64_t)arbM;
- arbM -= (double)arb_stage.step.whole;
- arbM *= MULT32 * MULT32;
- arb_stage.step.fix.ls.all = (uint64_t)arbM;
- } else
-#endif
- arb_stage.step.whole = (int64_t)(arbM * MULT32 + .5);
- arb_stage.out_in_ratio = MULT32 * arbL / (double)arb_stage.step.whole;
- }
- }
-
- if (have_post_stage)
- dft_stage_init(1, 1 - (1 - (1 - tbw0) *
- (upsample? factor * postL / postM : 1)) * tbw_tighten, Fs_a,
- (double)max(postL, postM), att, phase, &post_stage, postL, postM,
- &multiplier, log2_min_dft_size, log2_large_dft_size);
-
-
- lsx_debug("%g: »%i⋅%i/%i⋅%i/%g⋅%i/%i",
- 1/factor, shift, preL, preM, arbL, arbM, postL, postM);
- for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) {
- fifo_create(&s->fifo, (int)sizeof(sample_t));
- memset(fifo_reserve(&s->fifo, s->preload), 0, sizeof(sample_t) * (size_t)s->preload);
- lsx_debug("%5i|%-5i preload=%i remL=%i o/i=%g",
- s->pre, s->pre_post - s->pre, s->preload, s->at.integer, s->out_in_ratio);
- }
- fifo_create(&s->fifo, (int)sizeof(sample_t));
- return 0;
-}
-
-static void rate_process(rate_t * p)
-{
- stage_t * stage = p->stages;
- int i;
- for (i = 0; i < p->num_stages; ++i, ++stage)
- stage->fn(stage, &(stage+1)->fifo);
-}
-
-static sample_t * rate_input(rate_t * p, sample_t const * samples, size_t n)
-{
- p->samples_in += n;
- return fifo_write(&p->stages[0].fifo, (int)n, samples);
-}
-
-static sample_t const * rate_output(rate_t * p, sample_t * samples, size_t * n)
-{
- fifo_t * fifo = &p->stages[p->num_stages].fifo;
- p->samples_out += *n = min(*n, (size_t)fifo_occupancy(fifo));
- return fifo_read(fifo, (int)*n, samples);
-}
-
-static void rate_flush(rate_t * p)
-{
- fifo_t * fifo = &p->stages[p->num_stages].fifo;
-#if defined _MSC_VER && _MSC_VER == 1200
- uint64_t samples_out = (uint64_t)(int64_t)((double)(int64_t)p->samples_in / p->factor + .5);
-#else
- uint64_t samples_out = (uint64_t)((double)p->samples_in / p->factor + .5);
-#endif
- size_t remaining = (size_t)(samples_out - p->samples_out);
-
- if ((size_t)fifo_occupancy(fifo) < remaining) {
- uint64_t samples_in = p->samples_in;
- sample_t * buff = calloc(1024, sizeof(*buff));
-
- while ((size_t)fifo_occupancy(fifo) < remaining) {
- rate_input(p, buff, 1024);
- rate_process(p);
- }
- fifo_trim_to(fifo, (int)remaining);
- p->samples_in = samples_in;
- free(buff);
- }
-}
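
[editor's note] rate_input, rate_process, rate_output and rate_flush together form the per-channel pipeline: input is queued into stage 0's fifo, each stage then pumps into its successor's fifo, and flushing zero-pads the input until the expected samples_in / factor outputs have emerged. A minimal driver sketch (hypothetical; assumes a rate_t already set up by rate_init):

    /* Hypothetical one-shot driver for a single channel. */
    static size_t resample_all(rate_t * p, sample_t const * in, size_t ilen,
        sample_t * out, size_t max_olen)
    {
      size_t odone = max_olen;
      rate_input(p, in, ilen);      /* queue into stage 0's fifo          */
      rate_process(p);              /* pump every stage once              */
      rate_flush(p);                /* zero-pad until all output is ready */
      rate_output(p, out, &odone);  /* odone <- samples actually read     */
      return odone;
    }
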
-
-static void rate_close(rate_t * p)
-{
- rate_shared_t * shared = p->stages[0].shared;
- int i;
-
- for (i = 0; i <= p->num_stages; ++i) {
- stage_t * s = &p->stages[i];
- aligned_free(s->dft_scratch);
- aligned_free(s->dft_out);
- fifo_delete(&s->fifo);
- }
- if (shared) {
- for (i = 0; i < 2; ++i) {
- dft_filter_t * f= &shared->dft_filter[i];
- aligned_free(f->coefs);
- rdft_delete_setup(f->dft_forward_setup);
- rdft_delete_setup(f->dft_backward_setup);
- }
- free(shared->poly_fir_coefs);
- memset(shared, 0, sizeof(*shared));
- }
- free(p->stages);
-}
-
-#if defined SOXR_LIB
-static double rate_delay(rate_t * p)
-{
-#if defined _MSC_VER && _MSC_VER == 1200
- double samples_out = (double)(int64_t)p->samples_in / p->factor;
- return max(0, samples_out - (double)(int64_t)p->samples_out);
-#else
- double samples_out = (double)p->samples_in / p->factor;
- return max(0, samples_out - (double)p->samples_out);
-#endif
-}
-
-static void rate_sizes(size_t * shared, size_t * channel)
-{
- *shared = sizeof(rate_shared_t);
- *channel = sizeof(rate_t);
-}
-
-#include "soxr.h"
-
-static char const * rate_create(
- void * channel,
- void * shared,
- double io_ratio,
- soxr_quality_spec_t * q_spec,
- soxr_runtime_spec_t * r_spec,
- double scale)
-{
- return rate_init(
- channel, shared,
- io_ratio,
- q_spec->precision,
- q_spec->phase_response,
- q_spec->passband_end,
- q_spec->stopband_begin,
- "\1\2\0"[q_spec->flags & 3],
- !!(q_spec->flags & SOXR_MAINTAIN_3DB_PT),
- scale,
- !!(q_spec->flags & SOXR_HI_PREC_CLOCK),
- (int)(r_spec->flags & 3) - 1,
- r_spec->coef_size_kbytes,
- !!(r_spec->flags & SOXR_NOSMALLINTOPT),
- (int)r_spec->log2_min_dft_size,
- (int)r_spec->log2_large_dft_size);
-}
-
-static char const * id(void)
-{
- return RATE_ID;
-}
-
-fn_t RATE_CB[] = {
- (fn_t)rate_input,
- (fn_t)rate_process,
- (fn_t)rate_output,
- (fn_t)rate_flush,
- (fn_t)rate_close,
- (fn_t)rate_delay,
- (fn_t)rate_sizes,
- (fn_t)rate_create,
- (fn_t)0,
- (fn_t)id,
-};
-#endif
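
[editor's note] RATE_CB publishes the engine's entry points as an ordered array of generic function pointers; the core casts each slot back to its true signature (the resampler_* macros removed from soxr.c later in this patch do exactly that). A sketch of the dispatch, with the slot index matching the table above:

    /* Illustrative dispatch through a control-block table. */
    typedef void (* fn_t)(void);
    extern fn_t _soxr_rate32_cb[];           /* single-precision engine */

    typedef double (* delay_fn_t)(void *);   /* shape of slot 5         */

    static double query_delay(void * channel)
    {
      return ((delay_fn_t)_soxr_rate32_cb[5])(channel);  /* rate_delay */
    }
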
diff --git a/soxr/src/rate32.c b/soxr/src/rate32.c
deleted file mode 100644
index d6dd3b9..0000000
--- a/soxr/src/rate32.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#define sample_t float
-#define RATE_SIMD 0
-#define RDFT_CB _soxr_rdft32_cb
-#define RATE_CB _soxr_rate32_cb
-#define RATE_ID "single-precision"
-#include "rate.h"
diff --git a/soxr/src/rate32s.c b/soxr/src/rate32s.c
deleted file mode 100644
index 26a371a..0000000
--- a/soxr/src/rate32s.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#define sample_t float
-#define RATE_SIMD 1
-#define RDFT_CB _soxr_rdft32s_cb
-#define RATE_CB _soxr_rate32s_cb
-#define RATE_ID "single-precision-SIMD"
-#include "rate.h"
diff --git a/soxr/src/rate64.c b/soxr/src/rate64.c
deleted file mode 100644
index 6289911..0000000
--- a/soxr/src/rate64.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#define sample_t double
-#define RATE_SIMD 0
-#define RDFT_CB _soxr_rdft64_cb
-#define RATE_CB _soxr_rate64_cb
-#define RATE_ID "double-precision"
-#include "rate.h"
diff --git a/soxr/src/rdft.h b/soxr/src/rdft.h
index 59ba174..4ecd247 100644
--- a/soxr/src/rdft.h
+++ b/soxr/src/rdft.h
@@ -1,9 +1,11 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-void ORDERED_CONVOLVE(int n, void * not_used, DFT_FLOAT * a, const DFT_FLOAT * b)
+void ORDERED_CONVOLVE(int n, void * not_used, void * A, const void * B)
{
int i;
+ DFT_FLOAT* a = A;
+ const DFT_FLOAT* b = B;
a[0] *= b[0];
a[1] *= b[1];
for (i = 2; i < n; i += 2) {
@@ -14,9 +16,11 @@ void ORDERED_CONVOLVE(int n, void * not_used, DFT_FLOAT * a, const DFT_FLOAT * b
(void)not_used;
}
-void ORDERED_PARTIAL_CONVOLVE(int n, DFT_FLOAT * a, const DFT_FLOAT * b)
+void ORDERED_PARTIAL_CONVOLVE(int n, void * A, const void * B)
{
int i;
+ DFT_FLOAT* a = A;
+ const DFT_FLOAT* b = B;
a[0] *= b[0];
for (i = 2; i < n; i += 2) {
DFT_FLOAT tmp = a[i];
diff --git a/soxr/src/rdft_t.h b/soxr/src/rdft_t.h
new file mode 100644
index 0000000..7e44134
--- /dev/null
+++ b/soxr/src/rdft_t.h
@@ -0,0 +1,40 @@
+/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+typedef struct {
+ void * (* forward_setup)(int);
+ void * (* backward_setup)(int);
+ void (* delete_setup)(void *);
+ void (* forward)(int, void *, void *, void *);
+ void (* oforward)(int, void *, void *, void *);
+ void (* backward)(int, void *, void *, void *);
+ void (* obackward)(int, void *, void *, void *);
+ void (* convolve)(int, void *, void *, void const *);
+ void (* convolve_portion)(int, void *, void const *);
+ int (* multiplier)(void);
+ void (* reorder_back)(int, void *, void *, void *);
+ void * (* malloc)(size_t);
+ void * (* calloc)(size_t, size_t);
+ void (* free)(void *);
+ int (* flags)(void);
+} rdft_cb_table;
+
+#define rdft_forward_setup RDFT_CB->forward_setup
+#define rdft_backward_setup RDFT_CB->backward_setup
+#define rdft_delete_setup RDFT_CB->delete_setup
+#define rdft_forward RDFT_CB->forward
+#define rdft_oforward RDFT_CB->oforward
+#define rdft_backward RDFT_CB->backward
+#define rdft_obackward RDFT_CB->obackward
+#define rdft_convolve RDFT_CB->convolve
+#define rdft_convolve_portion RDFT_CB->convolve_portion
+#define rdft_multiplier RDFT_CB->multiplier
+#define rdft_reorder_back RDFT_CB->reorder_back
+#define rdft_malloc RDFT_CB->malloc
+#define rdft_calloc RDFT_CB->calloc
+#define rdft_free RDFT_CB->free
+#define rdft_flags RDFT_CB->flags
+
+/* Flag templates: */
+#define RDFT_IS_SIMD 1
+#define RDFT_NEEDS_SCRATCH 2
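
[editor's note] The new rdft_t.h decouples the rate code from any particular FFT backend: call sites use the rdft_* macros, which resolve through whatever rdft_cb_table the including translation unit names as RDFT_CB. Registering a backend is then just filling the table; a stub sketch (names hypothetical, unfilled slots left 0 for brevity, whereas a real table such as _soxr_rdft32_cb populates them all):

    /* Stub backend wired into the vtable above (illustrative only). */
    #include <stdlib.h>

    static void * stub_setup(int n) {(void)n; return 0;}
    static void stub_fft(int n, void * setup, void * buf, void * scratch)
      {(void)n; (void)setup; (void)buf; (void)scratch;}
    static int stub_multiplier(void) {return 2;}

    rdft_cb_table const stub_rdft_cb = {
      stub_setup, stub_setup, 0,               /* setups, delete_setup    */
      stub_fft, stub_fft, stub_fft, stub_fft,  /* (o)forward, (o)backward */
      0, 0, stub_multiplier, 0,                /* convolves, mult, reorder*/
      malloc, calloc, free, 0,                 /* memory hooks, flags     */
    };
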
diff --git a/soxr/src/rint-clip.h b/soxr/src/rint-clip.h
index 06764a8..bfb6458 100644
--- a/soxr/src/rint-clip.h
+++ b/soxr/src/rint-clip.h
@@ -1,9 +1,9 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#if defined DITHER
-#define DITHERING (1./32)*(int)(((ran1>>=3)&31)-((ran2>>=3)&31))
+#define DITHERING + (1./32)*(int)(((ran1>>=3)&31)-((ran2>>=3)&31))
#define DITHER_RAND (seed = 1664525UL * seed + 1013904223UL) >> 3
#define DITHER_VARS unsigned long ran1 = DITHER_RAND, ran2 = DITHER_RAND
#define SEED_ARG , unsigned long * seed0
@@ -12,10 +12,11 @@
#define COPY_SEED1 unsigned long seed1 = seed
#define PASS_SEED1 , &seed1
#define PASS_SEED , &seed
+#define FLOATD double
#else
-#define DITHERING 0
+#define DITHERING
#define DITHER_VARS
#define SEED_ARG
#define SAVE_SEED
@@ -23,9 +24,12 @@
#define COPY_SEED1
#define PASS_SEED1
#define PASS_SEED
+#define FLOATD FLOATX
#endif
+#define DO_16 _;_;_;_;_;_;_;_;_;_;_;_;_;_;_;_
+
#if defined FE_INVALID && defined FPU_RINT
@@ -35,8 +39,8 @@ static void RINT_CLIP(RINT_T * const dest, FLOATX const * const src,
COPY_SEED
DITHER_VARS;
for (; i < n; ++i) {
- double d = src[i] + DITHERING;
- dest[stride * i] = RINT(d);
+ FLOATD const d = src[i] DITHERING;
+ RINT(dest[stride * i], d);
if (fe_test_invalid()) {
fe_clear_invalid();
dest[stride * i] = d > 0? RINT_MAX : -RINT_MAX - 1;
@@ -56,29 +60,29 @@ static size_t LSX_RINT_CLIP(void * * const dest0, FLOATX const * const src,
RINT_T * dest = *dest0;
COPY_SEED
#if defined FE_INVALID && defined FPU_RINT
-#define _ dest[i] = RINT(src[i] + DITHERING), ++i,
- fe_clear_invalid();
- for (i = 0; i < (n & ~7u);) {
+#define _ RINT(dest[i], src[i] DITHERING); ++i
+ for (i = 0; i < (n & ~15u);) {
COPY_SEED1;
DITHER_VARS;
- _ _ _ _ _ _ _ _ (void)0;
+ DO_16;
if (fe_test_invalid()) {
fe_clear_invalid();
- RINT_CLIP(dest, src, 1, i - 8, i, &clips PASS_SEED1);
+ RINT_CLIP(dest, src, 1, i - 16, i, &clips PASS_SEED1);
}
}
RINT_CLIP(dest, src, 1, i, n, &clips PASS_SEED);
#else
-#define _ d = src[i] + DITHERING, dest[i++] = (RINT_T)(d > 0? d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5),
+#define _ d = src[i] DITHERING, dest[i++] = (RINT_T)(d > 0? \
+ d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5)
const double N = 1. + RINT_MAX;
double d;
- for (i = 0; i < (n & ~7u);) {
+ for (i = 0; i < (n & ~15u);) {
DITHER_VARS;
- _ _ _ _ _ _ _ _ (void)0;
+ DO_16;
}
{
DITHER_VARS;
- for (; i < n; _ (void)0);
+ for (; i < n; _);
}
#endif
SAVE_SEED;
@@ -97,34 +101,34 @@ static size_t LSX_RINT_CLIP_2(void * * dest0, FLOATX const * const * srcs,
RINT_T * dest = *dest0;
COPY_SEED
#if defined FE_INVALID && defined FPU_RINT
-#define _ dest[stride * i] = RINT(src[i] + DITHERING), ++i,
- fe_clear_invalid();
+#define _ RINT(dest[stride * i], src[i] DITHERING); ++i
for (j = 0; j < stride; ++j, ++dest) {
FLOATX const * const src = srcs[j];
- for (i = 0; i < (n & ~7u);) {
+ for (i = 0; i < (n & ~15u);) {
COPY_SEED1;
DITHER_VARS;
- _ _ _ _ _ _ _ _ (void)0;
+ DO_16;
if (fe_test_invalid()) {
fe_clear_invalid();
- RINT_CLIP(dest, src, stride, i - 8, i, &clips PASS_SEED1);
+ RINT_CLIP(dest, src, stride, i - 16, i, &clips PASS_SEED1);
}
}
RINT_CLIP(dest, src, stride, i, n, &clips PASS_SEED);
}
#else
-#define _ d = src[i] + DITHERING, dest[stride * i++] = (RINT_T)(d > 0? d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5),
+#define _ d = src[i] DITHERING, dest[stride * i++] = (RINT_T)(d > 0? \
+ d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5)
const double N = 1. + RINT_MAX;
double d;
for (j = 0; j < stride; ++j, ++dest) {
FLOATX const * const src = srcs[j];
- for (i = 0; i < (n & ~7u);) {
+ for (i = 0; i < (n & ~15u);) {
DITHER_VARS;
- _ _ _ _ _ _ _ _ (void)0;
+ DO_16;
}
{
DITHER_VARS;
- for (; i < n; _ (void)0);
+ for (; i < n; _);
}
}
#endif
@@ -134,6 +138,7 @@ static size_t LSX_RINT_CLIP_2(void * * dest0, FLOATX const * const * srcs,
}
#undef _
+#undef FLOATD
#undef PASS_SEED
#undef PASS_SEED1
#undef COPY_SEED1
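
[editor's note] The changes above widen the unrolled conversion loop from 8 to 16 samples (the DO_16 macro) and keep using the FPU "invalid" flag as a cheap clip detector: the fast path converts blindly, and only if the flag fires is the last block of 16 re-run through RINT_CLIP (hence i - 16). The idiom in portable C99 (the library itself prefers the inline-asm equivalents in rint.h where available):

    /* Standalone illustration of flag-based overflow detection. */
    #include <fenv.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
      double big = 1e30;                /* far outside long's range      */
      long y = 0;
      feclearexcept(FE_INVALID);
      y = lrint(big);                   /* raises FE_INVALID on overflow */
      if (fetestexcept(FE_INVALID))
        puts("overflow -> take the slow, clipping path");
      else
        printf("ok: %ld\n", y);
      return 0;
    }
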
diff --git a/soxr/src/rint.h b/soxr/src/rint.h
index 529e4bb..2f1dfbe 100644
--- a/soxr/src/rint.h
+++ b/soxr/src/rint.h
@@ -1,68 +1,102 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#if !defined soxr_rint_included
#define soxr_rint_included
-#include "soxr-config.h"
-
-
-
-#if HAVE_LRINT && LONG_MAX == 2147483647L
- #include <math.h>
- #define FPU_RINT32
- #define rint32 lrint
-#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
- #define FPU_RINT32
- static __inline int32_t rint32(double input) {
- int32_t result;
- __asm__ __volatile__("fistpl %0": "=m"(result): "t"(input): "st");
- return result;
- }
-#elif defined __GNUC__ && defined __arm__
- #define FPU_RINT32
- static __inline int32_t rint32(double input) {
- register int32_t result;
- __asm__ __volatile__ ("ftosid %0, %P1": "=w"(result): "w"(input));
- return result;
- }
-#elif defined _MSC_VER && defined _M_IX86 /* FIXME need solution for MSVC x64 */
- #define FPU_RINT32
- static __inline int32_t rint32(double input) {
- int32_t result;
- _asm {
- fld input
- fistp result
- }
- return result;
- }
-#else
- #define rint32(x) (int32_t)((x) < 0? x - .5 : x + .5)
-#endif
-
+#include "std-types.h"
+/* For x86, compiler-supplied versions of these functions (where available)
+ * can have poor performance (e.g. mingw32), so prefer these asm versions: */
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ #define FPU_RINT32
#define FPU_RINT16
- static __inline int16_t rint16(double input) {
- int16_t result;
- __asm__ __volatile__("fistps %0": "=m"(result): "t"(input): "st");
- return result;
+ #define rint32D(a,b) __asm__ __volatile__("fistpl %0": "=m"(a): "t"(b): "st")
+ #define rint16D(a,b) __asm__ __volatile__("fistps %0": "=m"(a): "t"(b): "st")
+ #define rint32F rint32D
+ #define rint16F rint16D
+ #define FE_INVALID 1
+ static __inline int fe_test_invalid(void) {
+ int status_word;
+ __asm__ __volatile__("fnstsw %%ax": "=a"(status_word));
+ return status_word & FE_INVALID;
}
-#elif defined _MSC_VER && defined _M_IX86 /* FIXME need solution for MSVC x64 */
+ static __inline int fe_clear_invalid(void) {
+ int32_t status[7];
+ __asm__ __volatile__("fnstenv %0": "=m"(status));
+ status[1] &= ~FE_INVALID;
+ __asm__ __volatile__("fldenv %0": : "m"(*status));
+ return 0;
+ }
+#elif defined _MSC_VER && defined _M_IX86
+ #define FPU_RINT32
#define FPU_RINT16
- static __inline int16_t rint16(double input) {
- int16_t result;
- _asm {
- fld input
- fistp result
- }
- return result;
+ #define rint_fn(N,Y,X) \
+ static __inline void N(Y *y, X x) {Y t; {__asm fld x __asm fistp t} *y=t;}
+ rint_fn(rint32d, int32_t, double)
+ rint_fn(rint32f, int32_t, float )
+ rint_fn(rint16d, int16_t, double)
+ rint_fn(rint16f, int16_t, float )
+ #define rint32D(y,x) rint32d(&(y),x)
+ #define rint32F(y,x) rint32f(&(y),x)
+ #define rint16D(y,x) rint16d(&(y),x)
+ #define rint16F(y,x) rint16f(&(y),x)
+ #define FE_INVALID 1
+ static __inline int fe_test_invalid(void) {
+ short status_word;
+ __asm fnstsw status_word
+ return status_word & FE_INVALID;
}
-#else
- #define rint16(x) (int16_t)((x) < 0? x - .5 : x + .5)
+ static __inline int fe_clear_invalid(void) {
+ int32_t status[7];
+ __asm fnstenv status
+ status[1] &= ~FE_INVALID;
+ __asm fldenv status
+ return 0;
+ }
+#elif defined _MSC_VER && defined _M_X64
+ #include <emmintrin.h>
+ #include <float.h>
+ #define FPU_RINT32
+ #define FPU_RINT16
+ static __inline void rint32d(int32_t *y, double x) {
+ *y = _mm_cvtsd_si32(_mm_load_sd(&x));}
+ static __inline void rint32f(int32_t *y, float x) {
+ *y = _mm_cvtss_si32(_mm_load_ss(&x));}
+ static __inline void rint16d(int16_t *y, double x) {
+ x = x*65536+32768; *y = (int16_t)(_mm_cvtsd_si32(_mm_load_sd(&x)) >> 16);}
+ #define rint32D(y,x) rint32d(&(y),x)
+ #define rint32F(y,x) rint32f(&(y),x)
+ #define rint16D(y,x) rint16d(&(y),x)
+ #define rint16F(y,x) rint16d(&(y),(double)(x))
+ #define FE_INVALID 1
+ #define fe_test_invalid() (_statusfp() & _SW_INVALID)
+ #define fe_clear_invalid _clearfp /* Note: clears all. */
+#elif HAVE_LRINT && LONG_MAX == 2147483647L && HAVE_FENV_H
+ #include <math.h>
+ #include <fenv.h>
+ #define FPU_RINT32
+ #define rint32D(y,x) ((y)=lrint(x))
+ #define rint32F(y,x) ((y)=lrintf(x))
+ #define fe_test_invalid() fetestexcept(FE_INVALID)
+ #define fe_clear_invalid() feclearexcept(FE_INVALID)
#endif
+#if !defined FPU_RINT32
+ #define rint32D(y,x) ((y)=(int32_t)((x) < 0? x - .5 : x + .5))
+ #define rint32F(y,x) rint32D(y,(double)(x))
+#endif
+#if !defined FPU_RINT16
+ #define rint16D(y,x) ((y)=(int16_t)((x) < 0? x - .5 : x + .5))
+ #define rint16F(y,x) rint16D(y,(double)(x))
+#endif
+
+static __inline int32_t rint32(double input) {
+ int32_t result; rint32D(result, input); return result;}
+
+static __inline int16_t rint16(double input) {
+ int16_t result; rint16D(result, input); return result;}
#endif
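
[editor's note] Post-patch, the rounders are statement-like macros that write through their first argument (rint32D/rint16D for double input, the F variants for float), so one call site works across the x87-asm, SSE2 and C99-lrint paths; the old value-returning rint32/rint16 survive as thin wrappers. A usage sketch (rounding at exactly .5 is to-nearest-even on the FPU paths, half-away-from-zero on the plain-C fallback):

    /* Usage sketch for the macros above; compile within soxr/src. */
    #include <stdio.h>
    #include "rint.h"

    int main(void)
    {
      int32_t a; int16_t b;
      rint32D(a, 2.5);                   /* FPU paths: a == 2  */
      rint16D(b, -3.5);                  /* FPU paths: b == -4 */
      printf("%d %d %d\n", (int)a, (int)b, (int)rint16(1234.4));
      return 0;
    }
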
diff --git a/soxr/src/simd-dev.h b/soxr/src/simd-dev.h
deleted file mode 100644
index 019325c..0000000
--- a/soxr/src/simd-dev.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#define PFFT_MACROS_ONLY
-#include "pffft.c"
diff --git a/soxr/src/simd.h b/soxr/src/simd.h
deleted file mode 100644
index 71eefc6..0000000
--- a/soxr/src/simd.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#if !defined simd_included
-#define simd_included
-
-#include <stddef.h>
-
-void * _soxr_simd_aligned_malloc(size_t);
-void * _soxr_simd_aligned_calloc(size_t, size_t);
-void _soxr_simd_aligned_free(void *);
-
-void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float * b);
-void _soxr_ordered_partial_convolve_simd(int n, float * a, const float * b);
-
-#endif
diff --git a/soxr/src/soxr-lsr.c b/soxr/src/soxr-lsr.c
new file mode 100644
index 0000000..58ab50a
--- /dev/null
+++ b/soxr/src/soxr-lsr.c
@@ -0,0 +1,198 @@
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+/* Wrapper mostly compatible with `libsamplerate'. */
+
+#include <assert.h>
+#include <stdlib.h>
+#include "soxr.h"
+#include "soxr-lsr.h"
+#include "rint.h"
+
+
+
+SRC_STATE *src_new(SRC_SRCTYPE id, int channels, SRC_ERROR * error)
+{
+ return src_callback_new(0, id, channels, error, 0);
+}
+
+
+
+SRC_ERROR src_process(SRC_STATE *p, SRC_DATA * io)
+{
+ size_t idone, odone;
+
+ if (!p || !io) return -1;
+
+ soxr_set_error(
+ p, soxr_set_io_ratio(p, 1/io->src_ratio, (size_t)io->output_frames));
+
+ soxr_process(p, io->data_in, /* hack: */
+ (size_t)(io->end_of_input? ~io->input_frames : io->input_frames),
+ &idone, io->data_out, (size_t)io->output_frames, &odone);
+
+ io->input_frames_used = (long)idone, io->output_frames_gen = (long)odone;
+ return -!!soxr_error(p);
+}
+
+
+
+SRC_ERROR src_set_ratio(SRC_STATE * p, double oi_ratio)
+{
+ return -!!soxr_set_io_ratio(p, 1/oi_ratio, 0);
+}
+
+
+
+SRC_ERROR src_reset(SRC_STATE * p)
+{
+ return -!!soxr_clear(p);
+}
+
+
+
+SRC_ERROR src_error(SRC_STATE * p)
+{
+ return -!!soxr_error(p);
+}
+
+
+
+SRC_STATE * src_delete(SRC_STATE * p)
+{
+ soxr_delete(p);
+ return 0;
+}
+
+
+
+SRC_STATE *src_callback_new(src_callback_t fn,
+ SRC_SRCTYPE id, int channels, SRC_ERROR * error0, void * p)
+{
+ soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + (unsigned)id, 0);
+ char const * e = getenv("SOXR_LSR_NUM_THREADS");
+ soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1));
+ soxr_error_t error;
+ soxr_t soxr = 0;
+
+ assert (channels > 0);
+ soxr = soxr_create(0, 0, (unsigned)channels, &error, 0, &q_spec, &r_spec);
+
+ if (soxr)
+ error = soxr_set_input_fn(soxr, (soxr_input_fn_t)fn, p, 0);
+
+ if (error0)
+ *error0 = -!!error;
+
+ return soxr;
+}
+
+
+
+long src_callback_read(SRC_STATE *p, double oi_ratio, long olen, float * obuf)
+{
+ if (!p || olen < 0) return -1;
+
+ soxr_set_error(p, soxr_set_io_ratio(p, 1/oi_ratio, (size_t)olen));
+ return (long)soxr_output(p, obuf, (size_t)olen);
+}
+
+
+
+SRC_ERROR src_simple(SRC_DATA * io, SRC_SRCTYPE id, int channels)
+{
+ size_t idone, odone;
+ soxr_error_t error;
+ soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + (unsigned)id, 0);
+ char const * e = getenv("SOXR_LSR_NUM_THREADS");
+ soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1));
+
+ if (!io || channels<=0 || io->input_frames<0 || io->output_frames<0) return -1;
+
+ error = soxr_oneshot(1, io->src_ratio, (unsigned)channels, io->data_in,
+ (size_t)io->input_frames, &idone, io->data_out, (size_t)io->output_frames,
+ &odone, 0, &q_spec, &r_spec);
+
+ io->input_frames_used = (long)idone, io->output_frames_gen = (long)odone;
+
+ return -!!error;
+}
+
+
+
+char const * src_get_name(SRC_SRCTYPE id)
+{
+ static char const * const names[] = {
+ "LSR best sinc", "LSR medium sinc", "LSR fastest sinc",
+ "LSR ZOH", "LSR linear", "SoX VHQ"};
+
+ return (unsigned)id < 5u + !getenv("SOXR_LSR_STRICT")? names[id] : 0;
+}
+
+
+
+char const * src_get_description(SRC_SRCTYPE id)
+{
+ return src_get_name(id);
+}
+
+
+
+char const * src_get_version(void)
+{
+ return soxr_version();
+}
+
+
+
+char const * src_strerror(SRC_ERROR error)
+{
+ return error == 1? "Placeholder." : error ? "soxr error" : soxr_strerror(0);
+}
+
+
+
+int src_is_valid_ratio(double oi_ratio)
+{
+ return getenv("SOXR_LSR_STRICT")?
+ oi_ratio >= 1./256 && oi_ratio <= 256 : oi_ratio > 0;
+}
+
+
+
+void src_short_to_float_array(short const * src, float * dest, int len)
+{
+ assert (src && dest);
+
+ while (len--) dest[len] = (float)(src[len] * (1 / (1. + SHRT_MAX)));
+}
+
+
+
+void src_float_to_short_array(float const * src, short * dest, int len)
+{
+ double d, N = 1. + SHRT_MAX;
+ assert (src && dest);
+
+ while (len--) d = src[len] * N, dest[len] =
+ (short)(d > N - 1? (short)(N - 1) : d < -N? (short)-N : rint16(d));
+}
+
+
+
+void src_int_to_float_array(int const * src, float * dest, int len)
+{
+ assert (src && dest);
+ while (len--) dest[len] = (float)(src[len] * (1 / (32768. * 65536.)));
+}
+
+
+
+void src_float_to_int_array(float const * src, int * dest, int len)
+{
+ double d, N = 32768. * 65536.; /* N.B. int32, not int! (Also above fn.) */
+ assert (src && dest);
+
+ while (len--) d = src[len] * N, dest[len] =
+ d >= N - 1? (int)(N - 1) : d < -N? (int)(-N) : rint32(d);
+}
diff --git a/soxr/src/soxr-lsr.h b/soxr/src/soxr-lsr.h
index c0923aa..b1cc247 100644
--- a/soxr/src/soxr-lsr.h
+++ b/soxr/src/soxr-lsr.h
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
@@ -37,13 +37,12 @@
#endif
typedef float SRC_SAMPLE;
-#if !defined SOXR_LIB
enum SRC_SRCTYPE_e {SRC_SINC_BEST_QUALITY, SRC_SINC_MEDIUM_QUALITY,
SRC_SINC_FASTEST, SRC_ZERO_ORDER_HOLD, SRC_LINEAR};
typedef int SRC_SRCTYPE;
typedef int SRC_ERROR;
typedef long (* src_callback_t)(void *, SRC_SAMPLE * *);
-typedef struct SRC_STATE SRC_STATE;
+typedef struct soxr SRC_STATE;
typedef struct SRC_DATA {
SRC_SAMPLE * data_in, * data_out;
long input_frames, output_frames;
@@ -51,7 +50,6 @@ typedef struct SRC_DATA {
int end_of_input;
double src_ratio;
} SRC_DATA;
-#endif
SOXR SRC_STATE * src_new(SRC_SRCTYPE, int num_channels, SRC_ERROR *);
SOXR SRC_ERROR src_process (SRC_STATE *, SRC_DATA *);
SOXR SRC_ERROR src_set_ratio(SRC_STATE *, double);
diff --git a/soxr/src/soxr.c b/soxr/src/soxr.c
index 5acace1..cdbfb9a 100644
--- a/soxr/src/soxr.c
+++ b/soxr/src/soxr.c
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
 #include <math.h>
@@ -10,6 +10,30 @@
#include "data-io.h"
#include "internal.h"
+#if AVUTIL_FOUND
+ #include <libavutil/cpu.h>
+#endif
+
+
+
+#if WITH_DEV_TRACE
+
+#include <stdarg.h>
+#include <stdio.h>
+
+int _soxr_trace_level;
+
+void _soxr_trace(char const * fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ vfprintf(stderr, fmt, args);
+ fputc('\n', stderr);
+ va_end(args);
+}
+
+#endif
+
char const * soxr_version(void)
@@ -19,21 +43,9 @@ char const * soxr_version(void)
+#include "cb_t.h"
+
typedef void sample_t; /* float or double */
-typedef void (* fn_t)(void);
-typedef fn_t control_block_t[10];
-
-#define resampler_input (*(sample_t * (*)(void *, sample_t * samples, size_t n))p->control_block[0])
-#define resampler_process (*(void (*)(void *, size_t))p->control_block[1])
-#define resampler_output (*(sample_t const * (*)(void *, sample_t * samples, size_t * n))p->control_block[2])
-#define resampler_flush (*(void (*)(void *))p->control_block[3])
-#define resampler_close (*(void (*)(void *))p->control_block[4])
-#define resampler_delay (*(double (*)(void *))p->control_block[5])
-#define resampler_sizes (*(void (*)(size_t * shared, size_t * channel))p->control_block[6])
-#define resampler_create (*(char const * (*)(void * channel, void * shared, double io_ratio, soxr_quality_spec_t * q_spec, soxr_runtime_spec_t * r_spec, double scale))p->control_block[7])
-#define resampler_set_io_ratio (*(void (*)(void *, double io_ratio, size_t len))p->control_block[8])
-#define resampler_id (*(char const * (*)(void))p->control_block[9])
-
typedef void * resampler_t; /* For one channel. */
typedef void * resampler_shared_t; /* Between channels. */
typedef void (* deinterleave_t)(sample_t * * dest,
@@ -67,45 +79,52 @@ struct soxr {
-#define RESET_ON_CLEAR (1u<<31)
+#if WITH_CR32 || WITH_CR32S || WITH_CR64 || WITH_CR64S
+ #include "filter.h"
+#else
+ #define lsx_to_3dB(x) ((x)/(x))
+#endif
+
-/* TODO: these should not be here. */
-#define TO_3dB(a) ((1.6e-6*a-7.5e-4)*a+.646)
-#define LOW_Q_BW0 (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
soxr_quality_spec_t soxr_quality_spec(unsigned long recipe, unsigned long flags)
{
soxr_quality_spec_t spec, * p = &spec;
- unsigned quality = recipe & 0xf;
+ unsigned q = recipe & 0xf; /* TODO: move to soxr-lsr.c: */
+ unsigned quality = q > SOXR_LSR2Q+2? SOXR_VHQ : q > SOXR_LSR2Q? SOXR_QQ : q;
double rej;
memset(p, 0, sizeof(*p));
- if (quality > 13) {
+ if (quality > SOXR_PRECISIONQ) {
p->e = "invalid quality type";
return spec;
}
- flags |= quality < SOXR_LSR0Q? RESET_ON_CLEAR : 0;
- if (quality == 13)
- quality = 6;
- else if (quality > 10)
- quality = 0;
- p->phase_response = "\62\31\144"[(recipe & 0x30) >> 4];
+ flags |= quality < SOXR_LSR0Q ? RESET_ON_CLEAR : 0;
+ p->phase_response = "\62\31\144"[(recipe & 0x30)>>4];
p->stopband_begin = 1;
- p->precision = !quality? 0: quality < 3? 16 : quality < 8? 4 + quality * 4 : 55 - quality * 4;
+ p->precision =
+ quality == SOXR_QQ ? 0 :
+ quality <= SOXR_16_BITQ ? 16 :
+ quality <= SOXR_32_BITQ ? 4 + quality * 4 :
+ quality <= SOXR_LSR2Q ? 55 - quality * 4 : /* TODO: move to soxr-lsr.c */
+ 0;
rej = p->precision * linear_to_dB(2.);
p->flags = flags;
- if (quality < 8) {
- p->passband_end = quality == 1? LOW_Q_BW0 : 1 - .05 / TO_3dB(rej);
+ if (quality <= SOXR_32_BITQ || quality == SOXR_PRECISIONQ) {
+ #define LOW_Q_BW0 (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
+ p->passband_end = quality == 1? LOW_Q_BW0 : 1 - .05 / lsx_to_3dB(rej);
if (quality <= 2)
p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM;
}
- else {
+ else { /* TODO: move to soxr-lsr.c */
static float const bw[] = {.931f, .832f, .663f};
- p->passband_end = bw[quality - 8];
- if (quality - 8 == 2)
- p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM;
+ p->passband_end = bw[quality - SOXR_LSR0Q];
+ if (quality == SOXR_LSR2Q) {
+ p->flags &= ~SOXR_ROLLOFF_NONE;
+ p->flags |= SOXR_ROLLOFF_LSR2Q | SOXR_PROMOTE_TO_LQ;
+ }
}
if (recipe & SOXR_STEEP_FILTER)
- p->passband_end = 1 - .01 / TO_3dB(rej);
+ p->passband_end = 1 - .01 / lsx_to_3dB(rej);
return spec;
}
@@ -163,39 +182,165 @@ soxr_io_spec_t soxr_io_spec(
-#if HAVE_SIMD
-static bool cpu_has_simd(void)
-{
-#if defined __x86_64__ || defined _M_X64
- return true;
-#elif defined __GNUC__ && defined i386
- uint32_t eax, ebx, ecx, edx;
- __asm__ __volatile__ (
- "pushl %%ebx \n\t"
- "cpuid \n\t"
- "movl %%ebx, %1\n\t"
- "popl %%ebx \n\t"
- : "=a"(eax), "=r"(ebx), "=c"(ecx), "=d"(edx)
- : "a"(1)
- : "cc" );
- return !!(edx & 0x06000000);
-#elif defined _MSC_VER && defined _M_IX86
- uint32_t d;
- __asm {
- xor eax, eax
- inc eax
- push ebx
- cpuid
- pop ebx
- mov d, edx
- }
- return !!(d & 0x06000000);
-#endif
- return false;
-}
+#if (WITH_CR32S && WITH_CR32) || (WITH_CR64S && WITH_CR64)
+ #if defined __GNUC__ && defined __x86_64__
+ #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+ __asm__ __volatile__ ( \
+ "cpuid \n\t" \
+ : "=a" (eax_), "=b" (ebx_), "=c" (ecx_), "=d" (edx_) \
+ : "a" (type), "c" (0));
+ #elif defined __GNUC__ && defined __i386__
+ #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+ __asm__ __volatile__ ( \
+ "mov %%ebx, %%edi \n\t" \
+ "cpuid \n\t" \
+ "xchg %%edi, %%ebx \n\t" \
+ : "=a" (eax_), "=D" (ebx_), "=c" (ecx_), "=d" (edx_) \
+ : "a" (type), "c" (0));
+ #elif defined _M_X64 && defined _MSC_VER && _MSC_VER > 1500
+ void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue);
+ #pragma intrinsic(__cpuidex)
+ #define CPUID(type, eax_, ebx_, ecx_, edx_) do { \
+ int regs[4]; \
+ __cpuidex(regs, type, 0); \
+ eax_ = regs[0], ebx_ = regs[1], ecx_ = regs[2], edx_ = regs[3]; \
+ } while(0)
+ #elif defined _M_X64 && defined _MSC_VER
+ void __cpuidex(int CPUInfo[4], int info_type);
+ #pragma intrinsic(__cpuidex)
+ #define CPUID(type, eax_, ebx_, ecx_, edx_) do { \
+ int regs[4]; \
+ __cpuidex(regs, type); \
+ eax_ = regs[0], ebx_ = regs[1], ecx_ = regs[2], edx_ = regs[3]; \
+ } while(0)
+ #elif defined _M_IX86 && defined _MSC_VER
+ #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+ __asm pushad \
+ __asm mov eax, type \
+ __asm xor ecx, ecx \
+ __asm cpuid \
+ __asm mov eax_, eax \
+ __asm mov ebx_, ebx \
+ __asm mov ecx_, ecx \
+ __asm mov edx_, edx \
+ __asm popad
+ #endif
#endif
-extern control_block_t _soxr_rate32s_cb, _soxr_rate32_cb, _soxr_rate64_cb, _soxr_vr32_cb;
+
+
+#if WITH_CR32S && WITH_CR32
+ static bool cpu_has_simd32(void)
+ {
+ #if defined __x86_64__ || defined _M_X64
+ return true;
+ #elif defined __i386__ || defined _M_IX86
+ enum {SSE = 1 << 25, SSE2 = 1 << 26};
+ unsigned eax_, ebx_, ecx_, edx_;
+ CPUID(1, eax_, ebx_, ecx_, edx_);
+ return (edx_ & (SSE|SSE2)) != 0;
+ #elif defined AV_CPU_FLAG_NEON
+ return !!(av_get_cpu_flags() & AV_CPU_FLAG_NEON);
+ #else
+ return false;
+ #endif
+ }
+
+ static bool should_use_simd32(void)
+ {
+ char const * e;
+ return ((e = getenv("SOXR_USE_SIMD" )))? !!atoi(e) :
+ ((e = getenv("SOXR_USE_SIMD32")))? !!atoi(e) : cpu_has_simd32();
+ }
+#else
+ #define should_use_simd32() true
+#endif
+
+
+
+#if WITH_CR64S && WITH_CR64
+ #if defined __GNUC__
+ #define XGETBV(type, eax_, edx_) \
+ __asm__ __volatile__ ( \
+ ".byte 0x0f, 0x01, 0xd0\n" \
+ : "=a"(eax_), "=d"(edx_) : "c" (type));
+ #elif defined _M_X64 && defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219
+ #include <immintrin.h>
+ #define XGETBV(type, eax_, edx_) do { \
+ union {uint64_t x; uint32_t y[2];} a = {_xgetbv(0)}; \
+ eax_ = a.y[0], edx_ = a.y[1]; \
+ } while(0)
+ #elif defined _M_IX86 && defined _MSC_VER
+ #define XGETBV(type, eax_, edx_) \
+ __asm pushad \
+ __asm mov ecx, type \
+ __asm _emit 0x0f \
+ __asm _emit 0x01 \
+ __asm _emit 0xd0 \
+ __asm mov eax_, eax \
+ __asm mov edx_, edx \
+ __asm popad
+ #else
+ #define XGETBV(type, eax_, edx_) eax_ = edx_ = 0
+ #endif
+
+ static bool cpu_has_simd64(void)
+ {
+ enum {OSXSAVE = 1 << 27, AVX = 1 << 28};
+ unsigned eax_, ebx_, ecx_, edx_;
+ CPUID(1, eax_, ebx_, ecx_, edx_);
+ if ((ecx_ & (OSXSAVE|AVX)) == (OSXSAVE|AVX)) {
+ XGETBV(0, eax_, edx_);
+ return (eax_ & 6) == 6;
+ }
+ return false;
+ }
+
+ static bool should_use_simd64(void)
+ {
+ char const * e;
+ return ((e = getenv("SOXR_USE_SIMD" )))? !!atoi(e) :
+ ((e = getenv("SOXR_USE_SIMD64")))? !!atoi(e) : cpu_has_simd64();
+ }
+#else
+ #define should_use_simd64() true
+#endif
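
[editor's note] cpu_has_simd64 requires both the CPUID OSXSAVE and AVX bits and, via XGETBV, bits 1 and 2 of XCR0 (mask 6: the OS saves and restores XMM and YMM state) before trusting AVX. On GCC or Clang the same test can be expressed with a builtin; shown for comparison only, this is not what the library uses:

    /* GCC/Clang-only comparison; the builtin is expected to fold in the
     * OS XSAVE/XCR0 check on current compilers. */
    #if defined __GNUC__ && (defined __x86_64__ || defined __i386__)
    static int cpu_has_avx_builtin(void)
    {
      return __builtin_cpu_supports("avx");
    }
    #endif
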
+
+
+
+extern control_block_t
+ _soxr_rate32_cb,
+ _soxr_rate32s_cb,
+ _soxr_rate64_cb,
+ _soxr_rate64s_cb,
+ _soxr_vr32_cb;
+
+
+
+static void runtime_num(char const * env_name,
+ int min, int max, unsigned * field)
+{
+ char const * e = getenv(env_name);
+ if (e) {
+ int i = atoi(e);
+ if (i >= min && i <= max)
+ *field = (unsigned)i;
+ }
+}
+
+
+
+static void runtime_flag(char const * env_name,
+ unsigned n_bits, unsigned n_shift, unsigned long * flags)
+{
+ char const * e = getenv(env_name);
+ if (e) {
+ int i = atoi(e);
+ unsigned long mask = (1UL << n_bits) - 1;
+ if (i >= 0 && i <= (int)mask)
+ *flags &= ~(mask << n_shift), *flags |= ((unsigned long)i << n_shift);
+ }
+}
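
[editor's note] runtime_num and runtime_flag let environment variables override fields of soxr_runtime_spec at soxr_create time (the registrations appear a few hunks below), so tuning experiments need no recompile. A hypothetical harness:

    /* Hypothetical tuning harness; setenv is POSIX. */
    #include <stdlib.h>
    #include "soxr.h"

    static soxr_t make_tuned(double irate, double orate, soxr_error_t * err)
    {
      setenv("SOXR_MIN_DFT_SIZE", "12", 1);  /* log2: DFTs of >= 4096 pts */
      setenv("SOXR_COEFS_SIZE", "200", 1);   /* kbytes of coef tables     */
      return soxr_create(irate, orate, 1, err, 0, 0, 0);
    }
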
@@ -207,11 +352,30 @@ soxr_t soxr_create(
soxr_quality_spec_t const * q_spec,
soxr_runtime_spec_t const * runtime_spec)
{
- double io_ratio = output_rate? input_rate? input_rate / output_rate : -1 : input_rate? -1 : 0;
+ double io_ratio = output_rate!=0? input_rate!=0?
+ input_rate / output_rate : -1 : input_rate!=0? -1 : 0;
static const float datatype_full_scale[] = {1, 1, 65536.*32768, 32768};
soxr_t p = 0;
soxr_error_t error = 0;
+#if WITH_DEV_TRACE
+#define _(x) (char)(sizeof(x)>=10? 'a'+(char)(sizeof(x)-10):'0'+(char)sizeof(x))
+ char const * e = getenv("SOXR_TRACE");
+ _soxr_trace_level = e? atoi(e) : 0;
+ {
+ static char const arch[] = {_(char), _(short), _(int), _(long), _(long long)
+ , ' ', _(float), _(double), _(long double)
+ , ' ', _(int *), _(int (*)(int))
+ , ' ', HAVE_BIGENDIAN ? 'B' : 'L'
+#if defined _OPENMP
+ , ' ', 'O', 'M', 'P'
+#endif
+ , 0};
+#undef _
+ lsx_debug("arch: %s", arch);
+ }
+#endif
+
if (q_spec && q_spec->e) error = q_spec->e;
else if (io_spec && (io_spec->itype | io_spec->otype) >= SOXR_SPLIT * 2)
error = "invalid io datatype(s)";
@@ -219,6 +383,8 @@ soxr_t soxr_create(
if (!error && !(p = calloc(sizeof(*p), 1))) error = "malloc failed";
if (p) {
+ control_block_t * control_block;
+
p->q_spec = q_spec? *q_spec : soxr_quality_spec(SOXR_HQ, 0);
if (q_spec) { /* Backwards compatibility with original API: */
@@ -236,35 +402,59 @@ soxr_t soxr_create(
p->io_spec.scale = 1;
p->runtime_spec = runtime_spec? *runtime_spec : soxr_runtime_spec(1);
+
+ runtime_num("SOXR_MIN_DFT_SIZE", 8, 15, &p->runtime_spec.log2_min_dft_size);
+ runtime_num("SOXR_LARGE_DFT_SIZE", 8, 20, &p->runtime_spec.log2_large_dft_size);
+ runtime_num("SOXR_COEFS_SIZE", 100, 800, &p->runtime_spec.coef_size_kbytes);
+ runtime_num("SOXR_NUM_THREADS", 0, 64, &p->runtime_spec.num_threads);
+ runtime_flag("SOXR_COEF_INTERP", 2, 0, &p->runtime_spec.flags);
+
+ runtime_flag("SOXR_STRICT_BUF", 1, 2, &p->runtime_spec.flags);
+ runtime_flag("SOXR_NOSMALLINTOPT", 1, 3, &p->runtime_spec.flags);
+
p->io_spec.scale *= datatype_full_scale[p->io_spec.otype & 3] /
datatype_full_scale[p->io_spec.itype & 3];
+
p->seed = (unsigned long)time(0) ^ (unsigned long)(size_t)p;
-#if HAVE_SINGLE_PRECISION
- if (!HAVE_DOUBLE_PRECISION || (p->q_spec.precision <= 20 && !(p->q_spec.flags & SOXR_DOUBLE_PRECISION))
- || (p->q_spec.flags & SOXR_VR)) {
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
+ if (0
+#if WITH_VR32
+ || ((!WITH_CR32 && !WITH_CR32S) || (p->q_spec.flags & SOXR_VR))
+#endif
+#if WITH_CR32 || WITH_CR32S
+ || !(WITH_CR64 || WITH_CR64S) || (p->q_spec.precision <= 20 && !(p->q_spec.flags & SOXR_DOUBLE_PRECISION))
+#endif
+ ) {
p->deinterleave = (deinterleave_t)_soxr_deinterleave_f;
p->interleave = (interleave_t)_soxr_interleave_f;
- memcpy(&p->control_block,
- (p->q_spec.flags & SOXR_VR)? &_soxr_vr32_cb :
-#if HAVE_SIMD
- cpu_has_simd()? &_soxr_rate32s_cb :
+ control_block =
+#if WITH_VR32
+ ((!WITH_CR32 && !WITH_CR32S) || (p->q_spec.flags & SOXR_VR))? &_soxr_vr32_cb :
#endif
- &_soxr_rate32_cb, sizeof(p->control_block));
+#if WITH_CR32S
+ !WITH_CR32 || should_use_simd32()? &_soxr_rate32s_cb :
+#endif
+ &_soxr_rate32_cb;
}
-#if HAVE_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
else
#endif
#endif
-#if HAVE_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
{
p->deinterleave = (deinterleave_t)_soxr_deinterleave;
p->interleave = (interleave_t)_soxr_interleave;
- memcpy(&p->control_block, &_soxr_rate64_cb, sizeof(p->control_block));
+ control_block =
+#if WITH_CR64S
+ !WITH_CR64 || should_use_simd64()? &_soxr_rate64s_cb :
+#endif
+ &_soxr_rate64_cb;
}
#endif
+ memcpy(&p->control_block, control_block, sizeof(p->control_block));
- if (p->num_channels && io_ratio)
+ if (p->num_channels && io_ratio!=0)
error = soxr_set_io_ratio(p, io_ratio, 0);
}
if (error)
@@ -307,7 +497,8 @@ static void soxr_delete0(soxr_t p)
double soxr_delay(soxr_t p)
{
- return (p && !p->error && p->resamplers)? resampler_delay(p->resamplers[0]) : 0;
+ return
+ (p && !p->error && p->resamplers)? resampler_delay(p->resamplers[0]) : 0;
}
@@ -375,13 +566,13 @@ soxr_error_t soxr_set_io_ratio(soxr_t p, double io_ratio, size_t slew_len)
p->io_ratio = io_ratio;
return initialise(p);
}
- if (p->control_block[8]) {
+ if (resampler_set_io_ratio) {
for (i = 0; !error && i < p->num_channels; ++i)
resampler_set_io_ratio(p->resamplers[i], io_ratio, slew_len);
return error;
}
return fabs(p->io_ratio - io_ratio) < 1e-15? 0 :
- "Varying O/I ratio is not supported with this quality level";
+ "varying O/I ratio is not supported with this quality level";
}
@@ -406,7 +597,7 @@ soxr_error_t soxr_clear(soxr_t p) /* TODO: this, properly. */
p->io_spec = tmp.io_spec;
p->num_channels = tmp.num_channels;
p->input_fn_state = tmp.input_fn_state;
- memcpy(p->control_block, tmp.control_block, sizeof(p->control_block));
+ memcpy(&p->control_block, &tmp.control_block, sizeof(p->control_block));
p->deinterleave = tmp.deinterleave;
p->interleave = tmp.interleave;
return (p->q_spec.flags & RESET_ON_CLEAR)?
@@ -481,13 +672,8 @@ static size_t soxr_output_no_callback(soxr_t p, soxr_buf_t out, size_t len)
done = done1;
} else
#endif
- {
- if (p->num_channels > 1) {
- for (u = 0; u < p->num_channels; ++u)
- done = soxr_output_1ch(p, u, ((soxr_bufs_t)out)[u], len, separated);
- } else
- done = soxr_output_1ch(p, 0, out, len, separated);
- }
+ for (u = 0; u < p->num_channels; ++u)
+ done = soxr_output_1ch(p, u, ((soxr_bufs_t)out)[u], len, separated);
if (!separated)
p->clips += (p->interleave)(p->io_spec.otype, &out, (sample_t const * const *)p->channel_ptrs,
@@ -616,7 +802,7 @@ soxr_error_t soxr_oneshot(
soxr_quality_spec_t const * q_spec,
soxr_runtime_spec_t const * runtime_spec)
{
- soxr_t resampler = NULL;
+ soxr_t resampler;
soxr_error_t error = q_spec? q_spec->e : 0;
if (!error) {
soxr_quality_spec_t q_spec1;
diff --git a/soxr/src/soxr.h b/soxr/src/soxr.h
index 8d9622d..09ec7c4 100644
--- a/soxr/src/soxr.h
+++ b/soxr/src/soxr.h
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
@@ -65,8 +65,8 @@ input or output (e.g. ilen, olen). */
/* E.g. #if SOXR_THIS_VERSION >= SOXR_VERSION(0,1,1) ... */
#define SOXR_VERSION(x,y,z) (((x)<<16)|((y)<<8)|(z))
-#define SOXR_THIS_VERSION SOXR_VERSION(0,1,2)
-#define SOXR_THIS_VERSION_STR "0.1.2"
+#define SOXR_THIS_VERSION SOXR_VERSION(0,1,3)
+#define SOXR_THIS_VERSION_STR "0.1.3"
@@ -173,7 +173,7 @@ SOXR size_t /*odone*/ soxr_output(/* Resample and output a block of data.*/
SOXR soxr_error_t soxr_error(soxr_t); /* Query error status. */
SOXR size_t * soxr_num_clips(soxr_t); /* Query int. clip counter (for R/W). */
SOXR double soxr_delay(soxr_t); /* Query current delay in output samples.*/
-SOXR char const * soxr_engine(soxr_t p); /* Query resampling engine name. */
+SOXR char const * soxr_engine(soxr_t); /* Query resampling engine name. */
SOXR soxr_error_t soxr_clear(soxr_t); /* Ready for fresh signal, same config. */
SOXR void soxr_delete(soxr_t); /* Free resources. */
@@ -249,7 +249,6 @@ struct soxr_quality_spec { /* Typically */
#define SOXR_ROLLOFF_MEDIUM 1u /* <= 0.35 dB */
#define SOXR_ROLLOFF_NONE 2u /* For Chebyshev bandwidth. */
-#define SOXR_MAINTAIN_3DB_PT 4u /* Reserved for internal use. */
#define SOXR_HI_PREC_CLOCK 8u /* Increase `irrational' ratio accuracy. */
#define SOXR_DOUBLE_PRECISION 16u /* Use D.P. calcs even if precision <= 20. */
#define SOXR_VR 32u /* Variable-rate resampling. */
@@ -257,21 +256,18 @@ struct soxr_quality_spec { /* Typically */
struct soxr_runtime_spec { /* Typically */
- unsigned log2_min_dft_size;/* For DFT efficiency. [8,15] 10 */
- unsigned log2_large_dft_size;/* For DFT efficiency. [16,20] 17 */
- unsigned coef_size_kbytes; /* For SOXR_COEF_INTERP_AUTO (below). 400 */
- unsigned num_threads; /* If built so. 0 means `automatic'. 1 */
- void * e; /* Reserved for internal use. 0 */
- unsigned long flags; /* Per the following #defines. 0 */
+ unsigned log2_min_dft_size; /* For DFT efficiency. [8,15] 10 */
+ unsigned log2_large_dft_size; /* For DFT efficiency. [8,20] 17 */
+ unsigned coef_size_kbytes; /* For SOXR_COEF_INTERP_AUTO (below). 400 */
+ unsigned num_threads; /* 0: per OMP_NUM_THREADS; 1: 1 thread. 1 */
+ void * e; /* Reserved for internal use. 0 */
+ unsigned long flags; /* Per the following #defines. 0 */
};
/* For `irrational' ratios only: */
#define SOXR_COEF_INTERP_AUTO 0u /* Auto select coef. interpolation. */
#define SOXR_COEF_INTERP_LOW 2u /* Man. select: less CPU, more memory. */
#define SOXR_COEF_INTERP_HIGH 3u /* Man. select: more CPU, less memory. */
-#define SOXR_STRICT_BUFFERING 4u /* Reserved for future use. */
-#define SOXR_NOSMALLINTOPT 8u /* For test purposes only. */
-
/* -------------------------- API type constructors ------------------------- */
@@ -296,7 +292,7 @@ SOXR soxr_quality_spec_t soxr_quality_spec(
#define SOXR_24_BITQ 5
#define SOXR_28_BITQ 6
#define SOXR_32_BITQ 7
- /* Libsamplerate equivalent qualities: */
+ /* Reserved for internal use (to be removed): */
#define SOXR_LSR0Q 8 /* 'Best sinc'. */
#define SOXR_LSR1Q 9 /* 'Medium sinc'. */
#define SOXR_LSR2Q 10 /* 'Fast sinc'. */
@@ -304,8 +300,8 @@ SOXR soxr_quality_spec_t soxr_quality_spec(
#define SOXR_LINEAR_PHASE 0x00
#define SOXR_INTERMEDIATE_PHASE 0x10
#define SOXR_MINIMUM_PHASE 0x30
+
#define SOXR_STEEP_FILTER 0x40
-#define SOXR_ALLOW_ALIASING 0x80 /* Reserved for future use. */
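
[editor's note] A recipe is composed by OR-ing one quality preset from the low nibble with a phase constant (bits 4-5) and optionally SOXR_STEEP_FILTER (bit 6); e.g. a one-shot, very-high-quality, linear-phase 2:1 decimation:

    /* One-shot use of the recipe constants above. */
    #include <stdio.h>
    #include "soxr.h"

    int main(void)
    {
      float in[100] = {0}, out[50];
      size_t idone, odone;
      soxr_quality_spec_t q = soxr_quality_spec(SOXR_VHQ | SOXR_LINEAR_PHASE, 0);
      soxr_error_t e = soxr_oneshot(2, 1, 1,      /* 2:1 ratio, 1 channel */
          in, 100, &idone, out, 50, &odone, 0, &q, 0);
      printf("%s: %zu in, %zu out\n", e? e : "ok", idone, odone);
      return 0;
    }
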
diff --git a/soxr/src/sse2neon.h b/soxr/src/sse2neon.h
deleted file mode 100644
index 65efed3..0000000
--- a/soxr/src/sse2neon.h
+++ /dev/null
@@ -1,6292 +0,0 @@
-#ifndef SSE2NEON_H
-#define SSE2NEON_H
-
-// This header file provides a simple API translation layer
-// between SSE intrinsics and their corresponding Arm/Aarch64 NEON versions
-//
-// This header file does not yet translate all of the SSE intrinsics.
-//
-// Contributors to this work are:
-// John W. Ratcliff
-// Brandon Rowlett
-// Ken Fast
-// Eric van Beurden
-// Alexander Potylitsin
-// Hasindu Gamaarachchi
-// Jim Huang
-// Mark Cheng
-// Malcolm James MacLeod
-// Devin Hussey (easyaspi314)
-// Sebastian Pop
-// Developer Ecosystem Engineering
-// Danila Kutenin
-// François Turban (JishinMaster)
-// Pei-Hsuan Hung
-// Yang-Hao Yuan
-
-/*
- * sse2neon is freely redistributable under the MIT License.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/* Tunable configurations */
-
-/* Enable precise implementation of _mm_min_ps and _mm_max_ps
- * This would slow down the computation a bit, but gives consistent result with
- * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result)
- */
-#ifndef SSE2NEON_PRECISE_MINMAX
-#define SSE2NEON_PRECISE_MINMAX (0)
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-#pragma push_macro("FORCE_INLINE")
-#pragma push_macro("ALIGN_STRUCT")
-#define FORCE_INLINE static inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
-#else
-#error "Macro name collisions may happen with unsupported compiler."
-#ifdef FORCE_INLINE
-#undef FORCE_INLINE
-#endif
-#define FORCE_INLINE static inline
-#ifndef ALIGN_STRUCT
-#define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
-#endif
-
-#include <stdint.h>
-#include <stdlib.h>
-
-/* Architecture-specific build options */
-/* FIXME: #pragma GCC push_options is only available on GCC */
-#if defined(__GNUC__)
-#if defined(__arm__) && __ARM_ARCH == 7
-/* According to ARM C Language Extensions Architecture specification,
- * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
- * architecture supported.
- */
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
-#endif
-#pragma GCC push_options
-#pragma GCC target("fpu=neon")
-#elif defined(__aarch64__)
-#pragma GCC push_options
-#pragma GCC target("+simd")
-#else
-#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
-#endif
-#endif
-
-#include <arm_neon.h>
-
-/* Rounding functions require either Aarch64 instructions or a libm fallback */
-#if !defined(__aarch64__)
-#include <math.h>
-#endif
-
-/* "__has_builtin" can be used to query support for built-in functions
- * provided by gcc/clang and other compilers that support it.
- */
-#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
-/* Compatibility with gcc <= 9 */
-#if __GNUC__ <= 9
-#define __has_builtin(x) HAS##x
-#define HAS__builtin_popcount 1
-#define HAS__builtin_popcountll 1
-#else
-#define __has_builtin(x) 0
-#endif
-#endif
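-// Editorial note on the shim above: on gcc <= 9,
-// __has_builtin(__builtin_popcount) token-pastes to HAS__builtin_popcount,
-// which is defined to 1; a builtin with no matching HAS* macro expands to an
-// undefined identifier, which the preprocessor evaluates as 0 inside #if.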
-
-/**
- * MACRO for shuffle parameter for _mm_shuffle_ps().
- * Argument fp3 is a digit[0123] that represents the fp from argument "b"
- * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
- * for fp2 in result. fp1 is a digit[0123] that represents the fp from
- * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
- * fp0 is the same for fp0 of result.
- */
-#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
- (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
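-// Editorial worked example (illustrative): _MM_SHUFFLE(3, 2, 1, 0) packs to
-// (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4, the identity selector: each
-// two-bit field picks the lane with its own index.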
-
-/* Rounding mode macros. */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
-#define _MM_FROUND_NO_EXC 0x08
-
-/* indicate immediate constant argument in a given range */
-#define __constrange(a, b) const
-
-/* A few intrinsics accept traditional data types like ints or floats, but
- * most operate on data types that are specific to SSE.
- * If a vector type ends in d, it contains doubles, and if it does not have
- * a suffix, it contains floats. An integer vector type can contain any type
- * of integer, from chars to shorts to unsigned long longs.
- */
-typedef int64x1_t __m64;
-typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
-// On the 32-bit ARM architecture, float64x2_t is not supported, so the
-// __m128d type must be represented differently for the related intrinsic
-// conversions.
-#if defined(__aarch64__)
-typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
-#else
-typedef float32x4_t __m128d;
-#endif
-typedef int64x2_t __m128i; /* 128-bit vector containing integers */
-
-/* type-safe casting between types */
-
-#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
-#define vreinterpretq_m128_f32(x) (x)
-#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
-
-#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
-#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
-#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
-#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
-#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
-#define vreinterpretq_f32_m128(x) (x)
-#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
-
-#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
-#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
-#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
-#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
-#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
-#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
-#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
-#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
-#define vreinterpretq_m128i_s64(x) (x)
-
-#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
-#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
-#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
-#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
-
-#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
-#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
-#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
-#define vreinterpretq_s64_m128i(x) (x)
-
-#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
-#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
-#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
-#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
-
-#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
-#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
-#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
-#define vreinterpret_m64_s64(x) (x)
-
-#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
-#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
-#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
-#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
-
-#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
-#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
-#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
-
-#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
-#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
-#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
-#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
-
-#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
-#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
-#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
-#define vreinterpret_s64_m64(x) (x)
-
-#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
-
-#if defined(__aarch64__)
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
-
-#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
-#define vreinterpretq_m128d_f64(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
-
-#define vreinterpretq_f64_m128d(x) (x)
-#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
-#else
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
-#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128d_f32(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_f32_m128d(x) (x)
-#endif
-
-// A struct is defined in this header file called 'SIMDVec' which can be used
-// by applications which attempt to access the contents of an __m128 struct
-// directly. Note that Microsoft considers accessing the __m128 struct
-// directly to be bad coding practice; see:
-// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
-//
-// However, some legacy source code may try to access the contents of an __m128
-// struct directly so the developer can use the SIMDVec as an alias for it. Any
-// casting must be done manually by the developer, as you cannot cast or
-// otherwise alias the base NEON data type for intrinsic operations.
-//
-// This union is intended to allow direct access to an __m128 variable using
-// the names that the MSVC compiler provides. It should really only be used when
-// trying to access the members of the vector as integer values. GCC/clang
-// allow native access to the float members through a simple array access
-// operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally, direct access to SIMD vectors should be avoided, since it can cause
-// a performance hit. If it really is needed, however, the original __m128
-// variable can be aliased with a pointer to this union and used to access
-// individual components. The use of this union should be hidden behind a macro
-// that is used throughout the codebase to access the members instead of always
-// declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec {
- float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
- int8_t m128_i8[16]; // as signed 8-bit integers.
- int16_t m128_i16[8]; // as signed 16-bit integers.
- int32_t m128_i32[4]; // as signed 32-bit integers.
- int64_t m128_i64[2]; // as signed 64-bit integers.
- uint8_t m128_u8[16]; // as unsigned 8-bit integers.
- uint16_t m128_u16[8]; // as unsigned 16-bit integers.
- uint32_t m128_u32[4]; // as unsigned 32-bit integers.
- uint64_t m128_u64[2]; // as unsigned 64-bit integers.
-} SIMDVec;
-
-// casting using SIMDVec
-#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
-#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
-#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
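-// Editorial usage sketch (illustrative only) for the casting macros above,
-// extracting one lane through the union:
-//   __m128i v = _mm_set1_epi32(7);
-//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0); /* lane0 == 7 */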
-
-/* Backwards compatibility for compilers lacking support for specific types */
-
-// Older gcc versions do not define the vld1q_u8_x4 intrinsic
-#if defined(__GNUC__) && !defined(__clang__)
-#if __GNUC__ <= 9
-FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
-{
- uint8x16x4_t ret;
- ret.val[0] = vld1q_u8(p + 0);
- ret.val[1] = vld1q_u8(p + 16);
- ret.val[2] = vld1q_u8(p + 32);
- ret.val[3] = vld1q_u8(p + 48);
- return ret;
-}
-#endif
-#endif
-
-/* Function Naming Conventions
- * The naming convention of SSE intrinsics is straightforward. A generic SSE
- * intrinsic function is given as follows:
- *   _mm_<name>_<data_type>
- *
- * The parts of this format are given as follows:
- * 1. <name> describes the operation performed by the intrinsic
- * 2. <data_type> identifies the data type of the function's primary arguments
- *
- * This last part, <data_type>, is a little complicated. It identifies the
- * content of the input values, and can be set to any of the following values:
- * + ps - vectors contain floats (ps stands for packed single-precision)
- * + pd - vectors contain doubles (pd stands for packed double-precision)
- * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- * signed integers
- * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- * unsigned integers
- * + si128 - unspecified 128-bit vector or 256-bit vector
- * + m128/m128i/m128d - identifies input vector types when they are different
- * than the type of the returned vector
- *
- * For example, in _mm_setzero_ps the _mm prefix implies that the function
- * returns a 128-bit vector, and the _ps suffix implies that the argument
- * vectors contain floats.
- *
- * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
- * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
- * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
- * // Set packed 8-bit integers
- * // 128 bits, 16 chars, per 8 bits
- * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
- * 4, 5, 12, 13, 6, 7, 14, 15);
- * // Shuffle packed 8-bit integers
- * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
- *
- * Data (Number, Binary, Byte Index):
- +------+------+------+------+------+------+------+------+
- | 1 | 2 | 3 | 4 | Number
- +------+------+------+------+------+------+------+------+
- | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
- +------+------+------+------+------+------+------+------+
- | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
- +------+------+------+------+------+------+------+------+
-
- +------+------+------+------+------+------+------+------+
- | 5 | 6 | 7 | 8 | Number
- +------+------+------+------+------+------+------+------+
- | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
- +------+------+------+------+------+------+------+------+
- | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
- +------+------+------+------+------+------+------+------+
- * Index (Byte Index):
- +------+------+------+------+------+------+------+------+
- | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
- +------+------+------+------+------+------+------+------+
-
- +------+------+------+------+------+------+------+------+
- | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
- +------+------+------+------+------+------+------+------+
- * Result:
- +------+------+------+------+------+------+------+------+
- | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
- +------+------+------+------+------+------+------+------+
- | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
- +------+------+------+------+------+------+------+------+
- | 256 | 2 | 5 | 6 | Number
- +------+------+------+------+------+------+------+------+
-
- +------+------+------+------+------+------+------+------+
- | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
- +------+------+------+------+------+------+------+------+
- | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
- +------+------+------+------+------+------+------+------+
- | 3 | 7 | 4 | 8 | Number
- +------+------+------+------+------+------+------+------+
- */
-
-/* Set/get methods */
-
-/* Constants for use with _mm_prefetch. */
-enum _mm_hint {
- _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
- _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
- _MM_HINT_T1 = 2, /* load data to L2 cache only */
- _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
- _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
- _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
- _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
- _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
-};
-
-// Loads one cache line of data from address p to a location closer to the
-// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
-FORCE_INLINE void _mm_prefetch(const void *p, int i)
-{
- (void) i;
- __builtin_prefetch(p);
-}
-
-// Copy the lower single-precision (32-bit) floating-point element of a to dst.
-//
-// dst[31:0] := a[31:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-{
- return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-//
-// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
-#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-//
-// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
-FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
-{
-#if defined(__aarch64__)
- return (int64_t) vgetq_lane_s32(
- vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-#else
- float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- float32_t diff = data - floor(data);
- if (diff > 0.5)
- return (int64_t) ceil(data);
- if (diff == 0.5) {
- int64_t f = (int64_t) floor(data);
- int64_t c = (int64_t) ceil(data);
- return c & 1 ? f : c;
- }
- return (int64_t) floor(data);
-#endif
-}
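-// Editorial note: the libm fallback above implements round-half-to-even
-// ("banker's rounding") to match the default x86 rounding mode, e.g.
-// 2.5 -> 2 but 3.5 -> 4; vcvtnq_s32_f32 rounds the same way in hardware,
-// though converting through int32 limits the result to the int32 range.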
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := 32*j
-// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
-FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
-{
- return vreinterpret_m64_s32(
- vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-//
-// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
-FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
-{
- return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := 32*j
-// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
-#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-//
-// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
-#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-//
-// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
-FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
-{
- return vgetq_lane_s64(
- vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0);
-}
-
-// Sets the 128-bit value to zero
-// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_setzero_si128(void)
-{
- return vreinterpretq_m128i_s32(vdupq_n_s32(0));
-}
-
-// Clears the four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setzero_ps(void)
-{
- return vreinterpretq_m128_f32(vdupq_n_f32(0));
-}
-
-// Return vector of type __m128d with all elements set to zero.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
-FORCE_INLINE __m128d _mm_setzero_pd(void)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vdupq_n_f64(0));
-#else
- return vreinterpretq_m128d_f32(vdupq_n_f32(0));
-#endif
-}
-
-// Sets the four single-precision, floating-point values to w.
-//
-// r0 := r1 := r2 := r3 := w
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
-{
- return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Sets the four single-precision, floating-point values to w.
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
-{
- return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Sets the four single-precision, floating-point values to the four inputs.
-// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
-{
- float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
- return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Copy single-precision (32-bit) floating-point element a to the lower element
-// of dst, and zero the upper 3 elements.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
-FORCE_INLINE __m128 _mm_set_ss(float a)
-{
- float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
- return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Sets the four single-precision, floating-point values to the four inputs in
-// reverse order.
-// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
-{
- float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
- return vreinterpretq_m128_f32(vld1q_f32(data));
-}
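-// Editorial lane-order example (illustrative): _mm_set_ps(4, 3, 2, 1) and
-// _mm_setr_ps(1, 2, 3, 4) produce the same register, with lane 0 == 1.0f and
-// lane 3 == 4.0f; "setr" simply takes its arguments in memory order.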
-
-// Sets the 8 signed 16-bit integer values in reverse order.
-//
-// Return Value
-// r0 := w0
-// r1 := w1
-// ...
-// r7 := w7
-FORCE_INLINE __m128i _mm_setr_epi16(short w0,
- short w1,
- short w2,
- short w3,
- short w4,
- short w5,
- short w6,
- short w7)
-{
- int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
- return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
-}
-
-// Sets the 4 signed 32-bit integer values in reverse order
-// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
-{
- int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
- return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Set packed 64-bit integers in dst with the supplied values in reverse order.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
-FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
-{
- return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
-}
-
-// Sets the 16 signed 8-bit integer values to b.
-//
-// r0 := b
-// r1 := b
-// ...
-// r15 := b
-//
-// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
-{
- return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-}
-
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
-FORCE_INLINE __m128d _mm_set1_pd(double d)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vdupq_n_f64(d));
-#else
- return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
-#endif
-}
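-// Editorial note: on ARMv7 the double is duplicated as a raw 64-bit bit
-// pattern (the *(int64_t *) &d pun) because float64x2_t is unavailable, so
-// the resulting __m128d is only a bit-pattern container, not arithmetic-ready.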
-
-// Sets the 8 signed 16-bit integer values to w.
-//
-// r0 := w
-// r1 := w
-// ...
-// r7 := w
-//
-// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
-{
- return vreinterpretq_m128i_s16(vdupq_n_s16(w));
-}
-
-// Sets the 16 signed 8-bit integer values.
-// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
- signed char b14,
- signed char b13,
- signed char b12,
- signed char b11,
- signed char b10,
- signed char b9,
- signed char b8,
- signed char b7,
- signed char b6,
- signed char b5,
- signed char b4,
- signed char b3,
- signed char b2,
- signed char b1,
- signed char b0)
-{
- int8_t ALIGN_STRUCT(16)
- data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
- (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
- (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
- (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
- return (__m128i) vld1q_s8(data);
-}
-
-// Sets the 8 signed 16-bit integer values.
-// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set_epi16(short i7,
- short i6,
- short i5,
- short i4,
- short i3,
- short i2,
- short i1,
- short i0)
-{
- int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
- return vreinterpretq_m128i_s16(vld1q_s16(data));
-}
-
-// Sets the 16 signed 8-bit integer values in reverse order.
-// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
- signed char b1,
- signed char b2,
- signed char b3,
- signed char b4,
- signed char b5,
- signed char b6,
- signed char b7,
- signed char b8,
- signed char b9,
- signed char b10,
- signed char b11,
- signed char b12,
- signed char b13,
- signed char b14,
- signed char b15)
-{
- int8_t ALIGN_STRUCT(16)
- data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
- (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
- (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
- (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
- return (__m128i) vld1q_s8(data);
-}
-
-// Sets the 4 signed 32-bit integer values to i.
-//
-// r0 := i
-// r1 := i
-// r2 := i
-// r3 := i
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
-{
- return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
-}
-
-// Sets the 2 signed 64-bit integer values to i.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
-FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
-{
- return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
-}
-
-// Sets the 2 signed 64-bit integer values to i.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
-FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
-{
- return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
-}
-
-// Sets the 4 signed 32-bit integer values.
-// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
- int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
- return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Returns the __m128i structure with its two 64-bit integer values
-// initialized to the values of the two 64-bit integers passed in.
-// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
-{
- int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
- return vreinterpretq_m128i_s64(vld1q_s64(data));
-}
-
-// Returns the __m128i structure with its two 64-bit integer values
-// initialized to the values of the two 64-bit integers passed in.
-// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
-{
- return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
-}
-
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
-FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
-{
- double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
-#else
- return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
-#endif
-}
-
-// Stores four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
-{
- vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
-{
- vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores four 32-bit integer values (as a __m128i value) at the address p.
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
-{
- vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Stores four 32-bit integer values (as a __m128i value) at the address p.
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
-{
- vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Stores the lower single-precision, floating-point value.
-// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
-{
- vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-}
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
-FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
- vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
-#else
- vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
-FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
- float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
- vst1q_f64((float64_t *) mem_addr,
- vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
-#else
- float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
- vst1q_f32((float32_t *) mem_addr,
- vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
-#define _mm_store1_pd _mm_store_pd1
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
-FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
-{
- _mm_store_pd(mem_addr, a);
-}
-
-// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
-// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
-{
- uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
- uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
- *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
-}
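-// Editorial note: unlike the x86 instruction, which writes only 64 bits to
-// memory, this emulation rewrites the full 128-bit destination while
-// preserving its upper 64 bits, so the destination must be a whole __m128i.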
-
-// Stores the lower two single-precision floating point values of a to the
-// address p.
-//
-// *p0 := a0
-// *p1 := a1
-//
-// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
-FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
-{
- *p = vreinterpret_m64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Stores the upper two single-precision, floating-point values of a to the
-// address p.
-//
-// *p0 := a2
-// *p1 := a3
-//
-// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
-FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
-{
- *p = vreinterpret_m64_f32(vget_high_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Loads a single single-precision, floating-point value, copying it into all
-// four words
-// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load1_ps(const float *p)
-{
- return vreinterpretq_m128_f32(vld1q_dup_f32(p));
-}
-
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
-//
-// dst[31:0] := MEM[mem_addr+31:mem_addr]
-// dst[63:32] := MEM[mem_addr+31:mem_addr]
-// dst[95:64] := MEM[mem_addr+31:mem_addr]
-// dst[127:96] := MEM[mem_addr+31:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
-#define _mm_load_ps1 _mm_load1_ps
-
-// Sets the lower two single-precision, floating-point values with 64
-// bits of data loaded from the address p; the upper two values are passed
-// through from a.
-//
-// Return Value
-// r0 := *p0
-// r1 := *p1
-// r2 := a2
-// r3 := a3
-//
-// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
-{
- return vreinterpretq_m128_f32(
- vcombine_f32(vld1_f32((const float32_t *) p),
- vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Load 4 single-precision (32-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-//
-// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
-// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
-// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
-// dst[127:96] := MEM[mem_addr+31:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
-FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
-{
- float32x4_t v = vrev64q_f32(vld1q_f32(p));
- return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
-}
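-// Editorial step-through (illustrative): for memory {1, 2, 3, 4}, vld1q_f32
-// yields lanes (1, 2, 3, 4); vrev64q_f32 swaps within each 64-bit half,
-// giving (2, 1, 4, 3); vextq_f32(v, v, 2) rotates by two lanes: (4, 3, 2, 1).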
-
-// Sets the upper two single-precision, floating-point values with 64
-// bits of data loaded from the address p; the lower two values are passed
-// through from a.
-//
-// r0 := a0
-// r1 := a1
-// r2 := *p0
-// r3 := *p1
-//
-// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
-FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
-{
- return vreinterpretq_m128_f32(
- vcombine_f32(vget_low_f32(vreinterpretq_f32_m128(a)),
- vld1_f32((const float32_t *) p)));
-}
-
-// Loads four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load_ps(const float *p)
-{
- return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Loads four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
-{
- // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
- // are equivalent.
- return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Load unaligned 16-bit integer from memory into the first element of dst.
-//
-// dst[15:0] := MEM[mem_addr+15:mem_addr]
-// dst[MAX:16] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
-FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
-{
- return vreinterpretq_m128i_s16(
- vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
-}
-
-// Load unaligned 64-bit integer from memory into the first element of dst.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[MAX:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
-FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
-{
- return vreinterpretq_m128i_s64(
- vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower of dst, and zero the upper element. mem_addr does not need to be
-// aligned on any particular boundary.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
-FORCE_INLINE __m128d _mm_load_sd(const double *p)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
-#else
- const float *fp = (const float *) p;
- float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
- return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Loads two double-precision, floating-point values from 16-byte aligned
-// memory.
-//
-// dst[127:0] := MEM[mem_addr+127:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
-FORCE_INLINE __m128d _mm_load_pd(const double *p)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vld1q_f64(p));
-#else
- const float *fp = (const float *) p;
- float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
- return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Loads two double-precision, floating-point values from unaligned memory.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
-FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
-{
- return _mm_load_pd(p);
-}
-
-// Loads a single-precision, floating-point value into the low word and
-// clears the upper three words.
-// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_load_ss(const float *p)
-{
- return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
-}
-
-FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
-{
- /* Load the lower 64 bits of the value pointed to by p into the
- * lower 64 bits of the result, zeroing the upper 64 bits of the result.
- */
- return vreinterpretq_m128i_s32(
- vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower element of dst, and copy the upper element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := a[127:64]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
-FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(
- vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
-#else
- return vreinterpretq_m128d_f32(
- vcombine_f32(vld1_f32((const float *) p),
- vget_high_f32(vreinterpretq_f32_m128d(a))));
-#endif
-}
-
-// Load 2 double-precision (64-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-//
-// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
-FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
-{
-#if defined(__aarch64__)
- float64x2_t v = vld1q_f64(p);
- return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
-#else
- int64x2_t v = vld1q_s64((const int64_t *) p);
- return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
-#endif
-}
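-// Editorial note: vextq_*(v, v, 1) on a two-lane vector rotates it by one
-// lane, i.e. swaps the halves, so memory {e0, e1} loads as the register
-// (e1, e0).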
-
-// Sets the low word to the single-precision, floating-point value of b
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
-FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
- vreinterpretq_f32_m128(a), 0));
-}
-
-// Move the lower double-precision (64-bit) floating-point element from b to the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-//
-// dst[63:0] := b[63:0]
-// dst[127:64] := a[127:64]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
-FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
-{
- return vreinterpretq_m128d_f32(
- vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
- vget_high_f32(vreinterpretq_f32_m128d(a))));
-}
-
-// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
-// upper element.
-//
-// dst[63:0] := a[63:0]
-// dst[127:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
-FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
-{
- return vreinterpretq_m128i_s64(
- vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
-}
-
-/* Logic/Binary operations */
-
-// Computes the bitwise AND-NOT of the four single-precision, floating-point
-// values of a and b.
-//
-// r0 := ~a0 & b0
-// r1 := ~a1 & b1
-// r2 := ~a2 & b2
-// r3 := ~a3 & b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_s32(
- vbicq_s32(vreinterpretq_s32_m128(b),
- vreinterpretq_s32_m128(a))); // *NOTE* argument swap
-}
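-// Editorial note: vbicq_s32(x, y) computes x & ~y (it clears in x the bits
-// set in y), so swapping the arguments as above yields the SSE semantics
-// b & ~a.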
-
-// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := j*64
-// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
-FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
-{
- // *NOTE* argument swap
- return vreinterpretq_m128d_s64(
- vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
-}
-
-// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
-// 128-bit value in a.
-//
-// r := (~a) & b
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vbicq_s32(vreinterpretq_s32_m128i(b),
- vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
-}
-
-// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
-// b.
-//
-// r := a & b
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the bitwise AND of the four single-precision, floating-point values
-// of a and b.
-//
-// r0 := a0 & b0
-// r1 := a1 & b1
-// r2 := a2 & b2
-// r3 := a3 & b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_s32(
- vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Compute the bitwise AND of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := j*64
-// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
-FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
-{
- return vreinterpretq_m128d_s64(
- vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Computes the bitwise OR of the four single-precision, floating-point values
-// of a and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_s32(
- vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Computes bitwise EXOR (exclusive-or) of the four single-precision,
-// floating-point values of a and b.
-// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_s32(
- veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := j*64
-// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
-FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
-{
- return vreinterpretq_m128d_s64(
- veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise OR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
-FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
-{
- return vreinterpretq_m128d_s64(
- vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
-//
-// r := a | b
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
-// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Duplicate odd-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
-FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
-{
-#if __has_builtin(__builtin_shufflevector)
- return vreinterpretq_m128_f32(__builtin_shufflevector(
- vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
-#else
- float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
- float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
- float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
- return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-// Duplicate even-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
-FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
-{
-#if __has_builtin(__builtin_shufflevector)
- return vreinterpretq_m128_f32(__builtin_shufflevector(
- vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
-#else
- float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
- float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
- return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-// Moves the upper two values of B into the lower two values of A.
-//
-// r3 := a3
-// r2 := a2
-// r1 := b3
-// r0 := b2
-FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
-{
- float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
- float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
- return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
-}
-
-// Moves the lower two values of B into the upper two values of A.
-//
-// r3 := b1
-// r2 := b0
-// r1 := a1
-// r0 := a0
-FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
-{
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
- float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
- return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
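-// Editorial worked example (illustrative): with a = (a0, a1, a2, a3) and
-// b = (b0, b1, b2, b3), _mm_movehl_ps(a, b) yields (b2, b3, a2, a3) and
-// _mm_movelh_ps(a, b) yields (a0, a1, b0, b1).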
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 3
-// i := j*32
-// dst[i+31:i] := ABS(a[i+31:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
-FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
-{
- return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 7
-// i := j*16
-// dst[i+15:i] := ABS(a[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
-FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
-{
- return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 15
-// i := j*8
-// dst[i+7:i] := ABS(a[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
-FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
-{
- return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 1
-// i := j*32
-// dst[i+31:i] := ABS(a[i+31:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
-FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
-{
- return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
-}
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := ABS(a[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
-FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
-{
- return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := ABS(a[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
-FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-{
- return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
-}
-
-// Takes the upper 64 bits of a and places it in the low end of the result;
-// takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
- float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
- float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
-}
-
-// Takes the lower two 32-bit values from a, swaps them, and places them in
-// the low end of the result; takes the upper two 32-bit values from b, swaps
-// them, and places them in the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
-{
- float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
- float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
- return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
-{
- float32x2_t a21 = vget_high_f32(
- vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
- float32x2_t b03 = vget_low_f32(
- vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
- return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
-{
- float32x2_t a03 = vget_low_f32(
- vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
- float32x2_t b21 = vget_high_f32(
- vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
- return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
-{
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
- float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
-{
- float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
- float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
-{
- float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
- float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
- return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
-}
-
-// Keeps the low 64 bits of a in the low end of the result and puts the high
-// 64 bits of b in the high end.
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
-{
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
- float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
-{
- float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
- float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
- return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
-{
- float32x2_t a22 =
- vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
- float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
- return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
-{
- float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
- float32x2_t b22 =
- vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
- return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
-{
- float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- float32x2_t a22 =
- vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
- float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
- float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
-{
- float32x2_t a33 =
- vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
- float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
- return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
-{
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
- float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
- float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
- float32x2_t b20 = vset_lane_f32(b2, b00, 1);
- return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
-{
- float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
- float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
- float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
- float32x2_t b20 = vset_lane_f32(b2, b00, 1);
- return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
-{
- float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
- float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
- float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
- float32x2_t b20 = vset_lane_f32(b2, b00, 1);
- return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
-}
-
-// NEON does not support a general purpose permute intrinsic
-// Selects four specific single-precision, floating-point values from a and b,
-// based on the mask i.
-//
-// C equivalent:
-// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
-// __constrange(0, 255) int imm) {
-// __m128 ret;
-// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
-// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
-// return ret;
-// }
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-#define _mm_shuffle_ps_default(a, b, imm) \
- __extension__({ \
- float32x4_t ret; \
- ret = vmovq_n_f32( \
- vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
- ret = vsetq_lane_f32( \
- vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
- ret, 1); \
- ret = vsetq_lane_f32( \
- vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
- ret, 2); \
- ret = vsetq_lane_f32( \
- vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
- ret, 3); \
- vreinterpretq_m128_f32(ret); \
- })
-
-// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
-// int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_ps(a, b, imm) \
- __extension__({ \
- float32x4_t _input1 = vreinterpretq_f32_m128(a); \
- float32x4_t _input2 = vreinterpretq_f32_m128(b); \
- float32x4_t _shuf = __builtin_shufflevector( \
- _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
- (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
- vreinterpretq_m128_f32(_shuf); \
- })
-#else // generic
-#define _mm_shuffle_ps(a, b, imm) \
- __extension__({ \
- __m128 ret; \
- switch (imm) { \
- case _MM_SHUFFLE(1, 0, 3, 2): \
- ret = _mm_shuffle_ps_1032((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 3, 0, 1): \
- ret = _mm_shuffle_ps_2301((a), (b)); \
- break; \
- case _MM_SHUFFLE(0, 3, 2, 1): \
- ret = _mm_shuffle_ps_0321((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 1, 0, 3): \
- ret = _mm_shuffle_ps_2103((a), (b)); \
- break; \
- case _MM_SHUFFLE(1, 0, 1, 0): \
- ret = _mm_movelh_ps((a), (b)); \
- break; \
- case _MM_SHUFFLE(1, 0, 0, 1): \
- ret = _mm_shuffle_ps_1001((a), (b)); \
- break; \
- case _MM_SHUFFLE(0, 1, 0, 1): \
- ret = _mm_shuffle_ps_0101((a), (b)); \
- break; \
- case _MM_SHUFFLE(3, 2, 1, 0): \
- ret = _mm_shuffle_ps_3210((a), (b)); \
- break; \
- case _MM_SHUFFLE(0, 0, 1, 1): \
- ret = _mm_shuffle_ps_0011((a), (b)); \
- break; \
- case _MM_SHUFFLE(0, 0, 2, 2): \
- ret = _mm_shuffle_ps_0022((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 2, 0, 0): \
- ret = _mm_shuffle_ps_2200((a), (b)); \
- break; \
- case _MM_SHUFFLE(3, 2, 0, 2): \
- ret = _mm_shuffle_ps_3202((a), (b)); \
- break; \
- case _MM_SHUFFLE(3, 2, 3, 2): \
- ret = _mm_movehl_ps((b), (a)); \
- break; \
- case _MM_SHUFFLE(1, 1, 3, 3): \
- ret = _mm_shuffle_ps_1133((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 0, 1, 0): \
- ret = _mm_shuffle_ps_2010((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 0, 0, 1): \
- ret = _mm_shuffle_ps_2001((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 0, 3, 2): \
- ret = _mm_shuffle_ps_2032((a), (b)); \
- break; \
- default: \
- ret = _mm_shuffle_ps_default((a), (b), (imm)); \
- break; \
- } \
- ret; \
- })
-#endif
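-// Editorial usage sketch (illustrative): reversing a vector with the macro
-// above.
-//   __m128 v = _mm_setr_ps(1, 2, 3, 4);
-//   __m128 r = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3)); /* (4,3,2,1) */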
-
-// Takes the upper 64 bits of a and places it in the low end of the result;
-// takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
-{
- int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
- int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
- return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
-}
-
-// Takes the lower two 32-bit values from a, swaps them, and places them in
-// the low end of the result; takes the upper two 32-bit values from a, swaps
-// them, and places them in the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
-{
- int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
- int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
- return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
-}
-
-// Rotates the least significant 32 bits into the most significant 32 bits,
-// and shifts the rest down.
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
-{
- return vreinterpretq_m128i_s32(
- vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
-}
-
-// Rotates the most significant 32 bits into the least significant 32 bits,
-// and shifts the rest up.
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
-{
- return vreinterpretq_m128i_s32(
- vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
-}
-
-// Duplicates the lower 64 bits of a into both the lower and the upper 64 bits
-// of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
-{
- int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
- return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
-}
-
-// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places the
-// result in the lower 64 bits; places the unswapped lower 64 bits of a in the
-// upper 64 bits.
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
-{
- int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
- int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
- return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
-}
-
-// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places the
-// result in both the lower and the upper 64 bits.
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
-{
- int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
- return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
-{
- int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
- int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
- return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
-{
- int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
- int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
- return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-{
- int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
- int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
- return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
-FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
-{
- int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
- uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
- uint8x16_t idx_masked =
- vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
-#elif defined(__GNUC__)
- int8x16_t ret;
- // %e and %f represent the even and odd D registers
- // respectively.
- __asm__ __volatile__(
- "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
- "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
- : [ret] "=&w"(ret)
- : [tbl] "w"(tbl), [idx] "w"(idx_masked));
- return vreinterpretq_m128i_s8(ret);
-#else
-    // generic ARMv7 fallback: split the table and do two vtbl2 lookups
- int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
- return vreinterpretq_m128i_s8(
- vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
- vtbl2_s8(a_split, vget_high_u8(idx_masked))));
-#endif
-}
-
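-// Illustrative note (not from the original source): each control byte of b
-// selects one source byte of a, and a control byte with its most significant
-// bit set zeroes the destination byte; masking with 0x8F keeps exactly the
-// bits the table lookups need. For example, a control vector of
-// {15, 14, 13, ..., 1, 0} reverses the bytes of a, and a control vector of
-// all 0x80 bytes yields all zeros.
-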
-// C equivalent:
-// __m128i _mm_shuffle_epi32_default(__m128i a,
-// __constrange(0, 255) int imm) {
-// __m128i ret;
-// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
-// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
-// return ret;
-// }
-#define _mm_shuffle_epi32_default(a, imm) \
- __extension__({ \
- int32x4_t ret; \
- ret = vmovq_n_s32( \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
- ret = vsetq_lane_s32( \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
- ret, 1); \
- ret = vsetq_lane_s32( \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
- ret, 2); \
- ret = vsetq_lane_s32( \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
- ret, 3); \
- vreinterpretq_m128i_s32(ret); \
- })
-
-// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
-// int imm)
-#if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm) \
- __extension__({ \
- vreinterpretq_m128i_s32( \
- vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
- })
-#else
-#define _mm_shuffle_epi32_splat(a, imm) \
- __extension__({ \
- vreinterpretq_m128i_s32( \
- vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
- })
-#endif
-
-// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
-// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
-// __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_epi32(a, imm) \
- __extension__({ \
- int32x4_t _input = vreinterpretq_s32_m128i(a); \
- int32x4_t _shuf = __builtin_shufflevector( \
- _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
- ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
- vreinterpretq_m128i_s32(_shuf); \
- })
-#else // generic
-#define _mm_shuffle_epi32(a, imm) \
- __extension__({ \
- __m128i ret; \
- switch (imm) { \
- case _MM_SHUFFLE(1, 0, 3, 2): \
- ret = _mm_shuffle_epi_1032((a)); \
- break; \
- case _MM_SHUFFLE(2, 3, 0, 1): \
- ret = _mm_shuffle_epi_2301((a)); \
- break; \
- case _MM_SHUFFLE(0, 3, 2, 1): \
- ret = _mm_shuffle_epi_0321((a)); \
- break; \
- case _MM_SHUFFLE(2, 1, 0, 3): \
- ret = _mm_shuffle_epi_2103((a)); \
- break; \
- case _MM_SHUFFLE(1, 0, 1, 0): \
- ret = _mm_shuffle_epi_1010((a)); \
- break; \
- case _MM_SHUFFLE(1, 0, 0, 1): \
- ret = _mm_shuffle_epi_1001((a)); \
- break; \
- case _MM_SHUFFLE(0, 1, 0, 1): \
- ret = _mm_shuffle_epi_0101((a)); \
- break; \
- case _MM_SHUFFLE(2, 2, 1, 1): \
- ret = _mm_shuffle_epi_2211((a)); \
- break; \
- case _MM_SHUFFLE(0, 1, 2, 2): \
- ret = _mm_shuffle_epi_0122((a)); \
- break; \
- case _MM_SHUFFLE(3, 3, 3, 2): \
- ret = _mm_shuffle_epi_3332((a)); \
- break; \
- case _MM_SHUFFLE(0, 0, 0, 0): \
- ret = _mm_shuffle_epi32_splat((a), 0); \
- break; \
- case _MM_SHUFFLE(1, 1, 1, 1): \
- ret = _mm_shuffle_epi32_splat((a), 1); \
- break; \
- case _MM_SHUFFLE(2, 2, 2, 2): \
- ret = _mm_shuffle_epi32_splat((a), 2); \
- break; \
- case _MM_SHUFFLE(3, 3, 3, 3): \
- ret = _mm_shuffle_epi32_splat((a), 3); \
- break; \
- default: \
- ret = _mm_shuffle_epi32_default((a), (imm)); \
- break; \
- } \
- ret; \
- })
-#endif
-
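-// Illustrative example (hypothetical values, assuming the usual
-// _mm_setr_epi32 from this header):
-//
-//   __m128i v = _mm_setr_epi32(10, 11, 12, 13);
-//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); // imm = 0x1B
-//
-// r holds {13, 12, 11, 10}; the special-cased imm values above map common
-// shuffles onto cheaper NEON rev/ext/dup/combine sequences.
-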
-// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
-// by imm.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
-// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
-// __constrange(0,255) int
-// imm)
-#define _mm_shufflelo_epi16_function(a, imm) \
- __extension__({ \
- int16x8_t ret = vreinterpretq_s16_m128i(a); \
- int16x4_t lowBits = vget_low_s16(ret); \
- ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
- ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
- 1); \
- ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
- 2); \
- ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
- 3); \
- vreinterpretq_m128i_s16(ret); \
- })
-
-// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
-// __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shufflelo_epi16(a, imm) \
- __extension__({ \
- int16x8_t _input = vreinterpretq_s16_m128i(a); \
- int16x8_t _shuf = __builtin_shufflevector( \
- _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
- (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
- vreinterpretq_m128i_s16(_shuf); \
- })
-#else // generic
-#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
-#endif
-
-// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
-// by imm.
-// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
-// __constrange(0,255) int
-// imm)
-#define _mm_shufflehi_epi16_function(a, imm) \
- __extension__({ \
- int16x8_t ret = vreinterpretq_s16_m128i(a); \
- int16x4_t highBits = vget_high_s16(ret); \
- ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
- ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
- 5); \
- ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
- 6); \
- ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
- 7); \
- vreinterpretq_m128i_s16(ret); \
- })
-
-// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
-// __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shufflehi_epi16(a, imm) \
- __extension__({ \
- int16x8_t _input = vreinterpretq_s16_m128i(a); \
- int16x8_t _shuf = __builtin_shufflevector( \
- _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
- (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
- (((imm) >> 6) & 0x3) + 4); \
- vreinterpretq_m128i_s16(_shuf); \
- })
-#else // generic
-#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
-#endif
-
-// Blend packed 16-bit integers from a and b using control mask imm8, and store
-// the results in dst.
-//
-// FOR j := 0 to 7
-// i := j*16
-// IF imm8[j]
-// dst[i+15:i] := b[i+15:i]
-// ELSE
-// dst[i+15:i] := a[i+15:i]
-// FI
-// ENDFOR
-// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
-// __constrange(0,255) int imm)
-#define _mm_blend_epi16(a, b, imm) \
- __extension__({ \
- const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
- uint16x8_t _mask_vec = vld1q_u16(_mask); \
- uint16x8_t _a = vreinterpretq_u16_m128i(a); \
- uint16x8_t _b = vreinterpretq_u16_m128i(b); \
- vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
- })
-
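-// Illustrative note (hypothetical values): bit j of imm selects lane j of b,
-// so imm = 0x0F takes the low four 16-bit lanes from b and the high four from
-// a. When imm is a compile-time constant the mask table typically folds to a
-// constant, and vbslq_u16 then performs the per-lane select in a single
-// instruction.
-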
-// Blend packed 8-bit integers from a and b using mask, and store the results in
-// dst.
-//
-// FOR j := 0 to 15
-// i := j*8
-// IF mask[i+7]
-// dst[i+7:i] := b[i+7:i]
-// ELSE
-// dst[i+7:i] := a[i+7:i]
-// FI
-// ENDFOR
-FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
-{
- // Use a signed shift right to create a mask with the sign bit
- uint8x16_t mask =
- vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
- uint8x16_t a = vreinterpretq_u8_m128i(_a);
- uint8x16_t b = vreinterpretq_u8_m128i(_b);
- return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
-}
-
-/* Shifts */
-
-// Shift packed 16-bit integers in a right by imm while shifting in sign
-// bits, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
-FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
-{
- const int count = (imm & ~15) ? 15 : imm;
- return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
-}
-
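-// Illustrative example (hypothetical values): an arithmetic shift replicates
-// the sign bit, so a lane holding -4 (0xFFFC) shifted right by 1 yields -2
-// (0xFFFE). Counts of 16 or more are clamped to 15, so they produce 0 or -1
-// depending on the lane's sign, matching PSRAW.
-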
-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
-// shifting in zeros.
-//
-// r0 := a0 << count
-// r1 := a1 << count
-// ...
-// r7 := a7 << count
-//
-// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
-#define _mm_slli_epi16(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) <= 0) { \
- ret = a; \
- } else if ((imm) > 15) { \
- ret = _mm_setzero_si128(); \
- } else { \
- ret = vreinterpretq_m128i_s16( \
- vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
- } \
- ret; \
- })
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
-// shifting in zeros.
-// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
-// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
-FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
-{
- if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
- return a;
- if (imm > 31) /* TODO: add unlikely macro */
- return _mm_setzero_si128();
- return vreinterpretq_m128i_s32(
- vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
-}
-
-// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
-{
- if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
- return a;
- if (imm > 63) /* TODO: add unlikely macro */
- return _mm_setzero_si128();
- return vreinterpretq_m128i_s64(
- vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
-}
-
-// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-//
-// FOR j := 0 to 7
-// i := j*16
-// IF imm8[7:0] > 15
-// dst[i+15:i] := 0
-// ELSE
-// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) == 0) { \
- ret = a; \
- } else if (0 < (imm) && (imm) < 16) { \
- ret = vreinterpretq_m128i_u16( \
- vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
- } else { \
- ret = _mm_setzero_si128(); \
- } \
- ret; \
- })
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-//
-// FOR j := 0 to 3
-// i := j*32
-// IF imm8[7:0] > 31
-// dst[i+31:i] := 0
-// ELSE
-// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
-// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) == 0) { \
- ret = a; \
- } else if (0 < (imm) && (imm) < 32) { \
- ret = vreinterpretq_m128i_u32( \
- vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
- } else { \
- ret = _mm_setzero_si128(); \
- } \
- ret; \
- })
-
-// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-//
-// FOR j := 0 to 1
-// i := j*64
-// IF imm8[7:0] > 63
-// dst[i+63:i] := 0
-// ELSE
-// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
-#define _mm_srli_epi64(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) == 0) { \
- ret = a; \
- } else if (0 < (imm) && (imm) < 64) { \
- ret = vreinterpretq_m128i_u64( \
- vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
- } else { \
- ret = _mm_setzero_si128(); \
- } \
- ret; \
- })
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
-// and store the results in dst.
-//
-// FOR j := 0 to 3
-// i := j*32
-// IF imm8[7:0] > 31
-// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
-// ELSE
-// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
-// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) == 0) { \
- ret = a; \
- } else if (0 < (imm) && (imm) < 32) { \
- ret = vreinterpretq_m128i_s32( \
- vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
- } else { \
- ret = vreinterpretq_m128i_s32( \
- vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
- } \
- ret; \
- })
-
-// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
-// imm must be an immediate.
-//
-// r := srl(a, imm*8)
-//
-// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
-// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_si128(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) <= 0) { \
- ret = a; \
- } else if ((imm) > 15) { \
- ret = _mm_setzero_si128(); \
- } else { \
- ret = vreinterpretq_m128i_s8( \
- vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
- } \
- ret; \
- })
-
-// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
-// must be an immediate.
-//
-// r := a << (imm * 8)
-//
-// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_si128(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) <= 0) { \
- ret = a; \
- } else if ((imm) > 15) { \
- ret = _mm_setzero_si128(); \
- } else { \
- ret = vreinterpretq_m128i_s8(vextq_s8( \
- vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
- } \
- ret; \
- })
-
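-// Illustrative note (hypothetical values): a byte shift left by 4 moves each
-// 32-bit lane up one position and zero-fills the bottom, so 32-bit lanes
-// {1, 2, 3, 4} become {0, 1, 2, 3}. The vextq_s8 call extracts 16 bytes
-// starting (16 - imm) bytes into the concatenation of a zero vector and a,
-// which yields imm leading zero bytes followed by the low bytes of a.
-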
-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
-// shifting in zeros.
-//
-// r0 := a0 << count
-// r1 := a1 << count
-// ...
-// r7 := a7 << count
-//
-// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 15)
- return _mm_setzero_si128();
-
- int16x8_t vc = vdupq_n_s16((int16_t) c);
- return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
-}
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
-// shifting in zeros.
-//
-// r0 := a0 << count
-// r1 := a1 << count
-// r2 := a2 << count
-// r3 := a3 << count
-//
-// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 31)
- return _mm_setzero_si128();
-
- int32x4_t vc = vdupq_n_s32((int32_t) c);
- return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
-}
-
-// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
-// shifting in zeros.
-//
-// r0 := a0 << count
-// r1 := a1 << count
-//
-// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 63)
- return _mm_setzero_si128();
-
- int64x2_t vc = vdupq_n_s64((int64_t) c);
- return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
-}
-
-// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
-// while shifting in zeros.
-//
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-// ...
-// r7 := srl(a7, count)
-//
-// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 15)
- return _mm_setzero_si128();
-
- int16x8_t vc = vdupq_n_s16(-(int16_t) c);
- return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
-}
-
-// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
-// while shifting in zeros.
-//
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-// r2 := srl(a2, count)
-// r3 := srl(a3, count)
-//
-// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 31)
- return _mm_setzero_si128();
-
- int32x4_t vc = vdupq_n_s32(-(int32_t) c);
- return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
-}
-
-// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
-// while shifting in zeros.
-//
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-//
-// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 63)
- return _mm_setzero_si128();
-
- int64x2_t vc = vdupq_n_s64(-(int64_t) c);
- return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
-}
-
-// NEON does not provide a version of this function.
-// Creates a 16-bit mask from the most significant bits of the 16 signed or
-// unsigned 8-bit integers in a and zero extends the upper bits.
-// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_epi8(__m128i a)
-{
-#if defined(__aarch64__)
- uint8x16_t input = vreinterpretq_u8_m128i(a);
- const int8_t ALIGN_STRUCT(16)
- xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
- const uint8x16_t mask_and = vdupq_n_u8(0x80);
- const int8x16_t mask_shift = vld1q_s8(xr);
- const uint8x16_t mask_result =
- vshlq_u8(vandq_u8(input, mask_and), mask_shift);
- uint8x8_t lo = vget_low_u8(mask_result);
- uint8x8_t hi = vget_high_u8(mask_result);
-
- return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
-#else
- // Use increasingly wide shifts+adds to collect the sign bits
- // together.
- // Since the widening shifts would be rather confusing to follow in little
- // endian, everything will be illustrated in big endian order instead. This
- // has a different result - the bits would actually be reversed on a big
- // endian machine.
-
- // Starting input (only half the elements are shown):
- // 89 ff 1d c0 00 10 99 33
- uint8x16_t input = vreinterpretq_u8_m128i(a);
-
- // Shift out everything but the sign bits with an unsigned shift right.
- //
- // Bytes of the vector::
- // 89 ff 1d c0 00 10 99 33
- // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
- // | | | | | | | |
- // 01 01 00 01 00 00 01 00
- //
- // Bits of first important lane(s):
- // 10001001 (89)
- // \______
- // |
- // 00000001 (01)
- uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
-
- // Merge the even lanes together with a 16-bit unsigned shift right + add.
- // 'xx' represents garbage data which will be ignored in the final result.
- // In the important bytes, the add functions like a binary OR.
- //
- // 01 01 00 01 00 00 01 00
- // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
- // \| \| \| \|
- // xx 03 xx 01 xx 00 xx 02
- //
- // 00000001 00000001 (01 01)
- // \_______ |
- // \|
- // xxxxxxxx xxxxxx11 (xx 03)
- uint32x4_t paired16 =
- vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
-
- // Repeat with a wider 32-bit shift + add.
- // xx 03 xx 01 xx 00 xx 02
- // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
- // 14))
- // \| \|
- // xx xx xx 0d xx xx xx 02
- //
- // 00000011 00000001 (03 01)
- // \\_____ ||
- // '----.\||
- // xxxxxxxx xxxx1101 (xx 0d)
- uint64x2_t paired32 =
- vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
-
-    // Last, an even wider 64-bit shift + add to get our result in the low
-    // 8-bit lanes.
-    // xx xx xx 0d xx xx xx 02
-    //     \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
- // \|
- // xx xx xx xx xx xx xx d2
- //
- // 00001101 00000010 (0d 02)
- // \ \___ | |
- // '---. \| |
- // xxxxxxxx 11010010 (xx d2)
- uint8x16_t paired64 =
- vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
-
- // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
- // xx xx xx xx xx xx xx d2
- // || return paired64[0]
- // d2
- // Note: Little endian would return the correct value 4b (01001011) instead.
- return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
-#endif
-}
-
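-// Worked example (illustrative): if only bytes 0 and 15 of a have their sign
-// bit set, the aarch64 path above shifts those two 0x80 bits to positions 0
-// and 7 within their 64-bit halves, the horizontal adds produce 0x01 and
-// 0x80, and the result is 0x01 + (0x80 << 8) = 0x8001, matching PMOVMSKB.
-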
-// Copy the lower 64-bit integer in a to dst.
-//
-// dst[63:0] := a[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
-FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
-{
- return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
-}
-
-// Copy the 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-//
-// dst[63:0] := a[63:0]
-// dst[127:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
-FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
-{
- return vreinterpretq_m128i_s64(
- vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
-}
-
-// NEON does not provide this method
-// Creates a 4-bit mask from the most significant bits of the four
-// single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
-{
- uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__)
- static const int32x4_t shift = {0, 1, 2, 3};
- uint32x4_t tmp = vshrq_n_u32(input, 31);
- return vaddvq_u32(vshlq_u32(tmp, shift));
-#else
- // Uses the exact same method as _mm_movemask_epi8, see that for details.
- // Shift out everything but the sign bits with a 32-bit unsigned shift
- // right.
- uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
- // Merge the two pairs together with a 64-bit unsigned shift right + add.
- uint8x16_t paired =
- vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
- // Extract the result.
- return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
-#endif
-}
-
-// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
-// all 1's, and return 1 if the result is zero, otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
-FORCE_INLINE int _mm_test_all_ones(__m128i a)
-{
- return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
- ~(uint64_t) 0;
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and return 1 if the result is zero, otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
-FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
-{
- int64x2_t a_and_mask =
- vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
- return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
- : 1;
-}
-
-/* Math operations */
-
-// Subtracts the four single-precision, floating-point values of a and b.
-//
-// r0 := a0 - b0
-// r1 := a1 - b1
-// r2 := a2 - b2
-// r3 := a3 - b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_f32(
- vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Subtract the lower single-precision (32-bit) floating-point element in b from
-// the lower single-precision (32-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper 3 packed elements from
-// a to the upper elements of dst.
-//
-// dst[31:0] := a[31:0] - b[31:0]
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
-FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_sub_ps(a, b));
-}
-
-// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
-// and store the results in dst.
-// r0 := a0 - b0
-// r1 := a1 - b1
-FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s64(
- vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
-// unsigned 32-bit integers of a.
-//
-// r0 := a0 - b0
-// r1 := a1 - b1
-// r2 := a2 - b2
-// r3 := a3 - b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
-//
-// dst[63:0] := a[63:0] - b[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
-FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s64(
- vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
-// integers of a and saturates.
-// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
-// integers of a and saturates.
-//
-// r0 := UnsignedSaturate(a0 - b0)
-// r1 := UnsignedSaturate(a1 - b1)
-// ...
-// r15 := UnsignedSaturate(a15 - b15)
-//
-// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
-// of a and saturates.
-//
-// r0 := SignedSaturate(a0 - b0)
-// r1 := SignedSaturate(a1 - b1)
-// ...
-// r15 := SignedSaturate(a15 - b15)
-//
-// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
-// of a and saturates.
-//
-// r0 := SignedSaturate(a0 - b0)
-// r1 := SignedSaturate(a1 - b1)
-// ...
-// r7 := SignedSaturate(a7 - b7)
-//
-// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed
-// 8-bit integer in b is negative, and store the results in dst.
-// Elements in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-// for i in 0..15
-// if b[i] < 0
-// r[i] := -a[i]
-// else if b[i] == 0
-// r[i] := 0
-// else
-// r[i] := a[i]
-// fi
-// done
-FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
-{
- int8x16_t a = vreinterpretq_s8_m128i(_a);
- int8x16_t b = vreinterpretq_s8_m128i(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFF : 0
- uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
-
- // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
- int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
-#else
- int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
-    // 'a') based on ltMask
- int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
- // res = masked & (~zeroMask)
- int8x16_t res = vbicq_s8(masked, zeroMask);
-
- return vreinterpretq_m128i_s8(res);
-}
-
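-// Illustrative example (hypothetical values): with a = {5, 5, 5, ...} and
-// b = {-1, 0, 1, ...}, the first result byte is -5 (b < 0), the second is 0
-// (b == 0) and the third is 5 (b > 0); the shift and compare masks above
-// implement this three-way select without branches.
-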
-// Negate packed 16-bit integers in a when the corresponding signed
-// 16-bit integer in b is negative, and store the results in dst.
-// Elements in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-// for i in 0..7
-// if b[i] < 0
-// r[i] := -a[i]
-// else if b[i] == 0
-// r[i] := 0
-// else
-// r[i] := a[i]
-// fi
-// done
-FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
-{
- int16x8_t a = vreinterpretq_s16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFFFF : 0
- uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
- // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
- int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
-#else
- int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s16(a) equals negative
- // 'a') based on ltMask
- int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
- // res = masked & (~zeroMask)
- int16x8_t res = vbicq_s16(masked, zeroMask);
- return vreinterpretq_m128i_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed
-// 32-bit integer in b is negative, and store the results in dst.
-// Elements in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-// for i in 0..3
-// if b[i] < 0
-// r[i] := -a[i]
-// else if b[i] == 0
-// r[i] := 0
-// else
-// r[i] := a[i]
-// fi
-// done
-FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
-{
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFFFFFFFF : 0
- uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
-
- // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
- int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
-#else
- int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s32(a) equals negative
- // 'a') based on ltMask
- int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
- // res = masked & (~zeroMask)
- int32x4_t res = vbicq_s32(masked, zeroMask);
- return vreinterpretq_m128i_s32(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed 16-bit
-// integer in b is negative, and store the results in dst. Elements in dst are
-// zeroed out when the corresponding element in b is zero.
-//
-// FOR j := 0 to 3
-// i := j*16
-// IF b[i+15:i] < 0
-// dst[i+15:i] := -(a[i+15:i])
-// ELSE IF b[i+15:i] == 0
-// dst[i+15:i] := 0
-// ELSE
-// dst[i+15:i] := a[i+15:i]
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
-FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
-{
- int16x4_t a = vreinterpret_s16_m64(_a);
- int16x4_t b = vreinterpret_s16_m64(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFFFF : 0
- uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
-
- // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
- int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
-#else
- int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
-    // 'a') based on ltMask
- int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
- // res = masked & (~zeroMask)
- int16x4_t res = vbic_s16(masked, zeroMask);
-
- return vreinterpret_m64_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed 32-bit
-// integer in b is negative, and store the results in dst. Elements in dst are
-// zeroed out when the corresponding element in b is zero.
-//
-// FOR j := 0 to 1
-// i := j*32
-// IF b[i+31:i] < 0
-// dst[i+31:i] := -(a[i+31:i])
-// ELSE IF b[i+31:i] == 0
-// dst[i+31:i] := 0
-// ELSE
-// dst[i+31:i] := a[i+31:i]
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
-FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
-{
- int32x2_t a = vreinterpret_s32_m64(_a);
- int32x2_t b = vreinterpret_s32_m64(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFFFFFFFF : 0
- uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
-
- // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
- int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
-#else
- int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
-    // 'a') based on ltMask
- int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
- // res = masked & (~zeroMask)
- int32x2_t res = vbic_s32(masked, zeroMask);
-
- return vreinterpret_m64_s32(res);
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
-// in b is negative, and store the results in dst. Elements in dst are zeroed out
-// when the corresponding element in b is zero.
-//
-// FOR j := 0 to 7
-// i := j*8
-// IF b[i+7:i] < 0
-// dst[i+7:i] := -(a[i+7:i])
-// ELSE IF b[i+7:i] == 0
-// dst[i+7:i] := 0
-// ELSE
-// dst[i+7:i] := a[i+7:i]
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
-FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-{
- int8x8_t a = vreinterpret_s8_m64(_a);
- int8x8_t b = vreinterpret_s8_m64(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFF : 0
- uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
-
- // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
- int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
-#else
- int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
-    // 'a') based on ltMask
- int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
- // res = masked & (~zeroMask)
- int8x8_t res = vbic_s8(masked, zeroMask);
-
- return vreinterpret_m64_s8(res);
-}
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
-FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u16(
- vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
-FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u8(
- vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
-#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
-#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
-
-// Computes the average of the 16 unsigned 8-bit integers in a and the 16
-// unsigned 8-bit integers in b and rounds.
-//
-// r0 := (a0 + b0) / 2
-// r1 := (a1 + b1) / 2
-// ...
-// r15 := (a15 + b15) / 2
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Computes the average of the 8 unsigned 16-bit integers in a and the 8
-// unsigned 16-bit integers in b and rounds.
-//
-// r0 := (a0 + b0) / 2
-// r1 := (a1 + b1) / 2
-// ...
-// r7 := (a7 + b7) / 2
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
-{
- return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
- vreinterpretq_u16_m128i(b));
-}
-
-// Adds the four single-precision, floating-point values of a and b.
-//
-// r0 := a0 + b0
-// r1 := a1 + b1
-// r2 := a2 + b2
-// r3 := a3 + b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_f32(
- vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Add packed double-precision (64-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
-FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(
- vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
- double *da = (double *) &a;
- double *db = (double *) &b;
- double c[2];
- c[0] = da[0] + db[0];
- c[1] = da[1] + db[1];
- return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Add 64-bit integers a and b, and store the result in dst.
-//
-// dst[63:0] := a[63:0] + b[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
-FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s64(
- vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// adds the scalar single-precision floating point values of a and b.
-// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
- float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
- float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-    // the upper values in the result must be the remnants of a.
- return vreinterpretq_m128_f32(vaddq_f32(a, value));
-}
-
-// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
-// unsigned 64-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s64(
- vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
-// unsigned 32-bit integers in b.
-//
-// r0 := a0 + b0
-// r1 := a1 + b1
-// r2 := a2 + b2
-// r3 := a3 + b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
-// unsigned 16-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
-// unsigned 8-bit integers in b.
-// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
-// and saturates.
-//
-// r0 := SignedSaturate(a0 + b0)
-// r1 := SignedSaturate(a1 + b1)
-// ...
-// r7 := SignedSaturate(a7 + b7)
-//
-// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Add packed signed 8-bit integers in a and b using saturation, and store the
-// results in dst.
-//
-// FOR j := 0 to 15
-// i := j*8
-// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
-FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
-// b and saturates.
-// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
-// unsigned 16-bit integers from b.
-//
-// r0 := (a0 * b0)[15:0]
-// r1 := (a1 * b1)[15:0]
-// ...
-// r7 := (a7 * b7)[15:0]
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
-// unsigned 32-bit integers from b.
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// tmp[31:0] := a[i+15:i] * b[i+15:i]
-// dst[i+15:i] := tmp[31:16]
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
-#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
-
-// Multiplies the four single-precision, floating-point values of a and b.
-//
-// r0 := a0 * b0
-// r1 := a1 * b1
-// r2 := a2 * b2
-// r3 := a3 * b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_f32(
- vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Multiply packed double-precision (64-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
-FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(
- vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
- double *da = (double *) &a;
- double *db = (double *) &b;
- double c[2];
- c[0] = da[0] * db[0];
- c[1] = da[1] * db[1];
- return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Multiply the lower single-precision (32-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
-//
-// dst[31:0] := a[31:0] * b[31:0]
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
-FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_mul_ps(a, b));
-}
-
-// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
-// a and b, and store the unsigned 64-bit results in dst.
-//
-// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
-// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
-FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
-{
- // vmull_u32 upcasts instead of masking, so we downcast.
- uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
- uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
- return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
-}
-
-// Multiply the low unsigned 32-bit integers from a and b, and store the
-// unsigned 64-bit result in dst.
-//
-// dst[63:0] := a[31:0] * b[31:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
-FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u64(vget_low_u64(
- vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
-}
-
-// Multiply the low signed 32-bit integers from each packed 64-bit element in
-// a and b, and store the signed 64-bit results in dst.
-//
-// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
-// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
-FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
-{
- // vmull_s32 upcasts instead of masking, so we downcast.
- int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
- int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
- return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
-}
-
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
-// integers from b.
-//
-// r0 := (a0 * b0) + (a1 * b1)
-// r1 := (a2 * b2) + (a3 * b3)
-// r2 := (a4 * b4) + (a5 * b5)
-// r3 := (a6 * b6) + (a7 * b7)
-// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
-{
- int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
- vget_low_s16(vreinterpretq_s16_m128i(b)));
- int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
- vget_high_s16(vreinterpretq_s16_m128i(b)));
-
- int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
- int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
-
- return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
-// the packed 16-bit integers in dst.
-//
-// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
-// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
-// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
-// ...
-// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
-FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
-{
- // Has issues due to saturation
- // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
-
- // Multiply
- int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
- vget_low_s16(vreinterpretq_s16_m128i(b)));
- int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
- vget_high_s16(vreinterpretq_s16_m128i(b)));
-
- // Rounding narrowing shift right
- // narrow = (int16_t)((mul + 16384) >> 15);
- int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
- int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
-
- // Join together
- return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
-}
-
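-// Worked example in Q15 fixed point (illustrative): with one lane of a and b
-// both holding 16384 (0.5 in Q15), the 32-bit product is 0x10000000 and
-// (0x10000000 + 0x4000) >> 15 = 8192, i.e. 0.25 in Q15; vrshrn_n_s32
-// performs the "add 0x4000 then shift right by 15" rounding in one step.
-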
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
-// and pack the saturated results in dst.
-//
-// FOR j := 0 to 7
-// i := j*16
-// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
-// a[i+7:i]*b[i+7:i] )
-// ENDFOR
-FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
- uint8x16_t a = vreinterpretq_u8_m128i(_a);
- int8x16_t b = vreinterpretq_s8_m128i(_b);
- int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
- vmovl_s8(vget_low_s8(b)));
- int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
- vmovl_s8(vget_high_s8(b)));
- return vreinterpretq_m128i_s16(
- vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
-#else
- // This would be much simpler if x86 would choose to zero extend OR sign
- // extend, not both. This could probably be optimized better.
- uint16x8_t a = vreinterpretq_u16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-
- // Zero extend a
- int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
- int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
-
- // Sign extend by shifting left then shifting right.
- int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
- int16x8_t b_odd = vshrq_n_s16(b, 8);
-
- // multiply
- int16x8_t prod1 = vmulq_s16(a_even, b_even);
- int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
-
- // saturated add
- return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
-#endif
-}
-
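-// Illustrative example (hypothetical values): for each output lane j,
-// dst[j] = Saturate16(a[2j] * b[2j] + a[2j+1] * b[2j+1]) with a treated as
-// unsigned and b as signed, so byte pairs a = {255, 255} and b = {127, 127}
-// would sum to 64770 and saturate to 32767.
-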
-// Computes the fused multiply-add of 32-bit floating-point numbers.
-//
-// Return Value
-// Multiplies A and B, and adds C to the temporary result before returning it.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
-FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
- vreinterpretq_f32_m128(b),
- vreinterpretq_f32_m128(a)));
-#else
- return _mm_add_ps(_mm_mul_ps(a, b), c);
-#endif
-}
-
-// Alternatively add and subtract packed single-precision (32-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
-FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
-{
- __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
- return _mm_fmadd_ps(b, mask, a);
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce two
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of 64-bit elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
-FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
-{
- uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
- uint16_t r0 = t[0] + t[1] + t[2] + t[3];
- uint16_t r4 = t[4] + t[5] + t[6] + t[7];
- uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
- return (__m128i) vsetq_lane_u16(r4, r, 4);
-}
-
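-// Worked example (illustrative): with every byte of a equal to 255 and every
-// byte of b equal to 0, each 8-byte half sums to 8 * 255 = 2040, so the
-// result holds 2040 in 16-bit lanes 0 and 4 and zeros elsewhere, matching
-// PSADBW.
-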
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
-FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
-{
- uint16x4_t t =
- vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- uint16_t r0 = t[0] + t[1] + t[2] + t[3];
- return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
-// ENDFOR
-// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
-//              tmp[47:40] + tmp[55:48] + tmp[63:56]
-// dst[63:16] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
-#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
-
-// Divides the four single-precision, floating-point values of a and b.
-//
-// r0 := a0 / b0
-// r1 := a1 / b1
-// r2 := a2 / b2
-// r3 := a3 / b3
-//
-// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
- float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
- float32x4_t recip1 =
- vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
- return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
-#endif
-}
-
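-// Note on the ARMv7 path above (a sketch of the standard refinement, not a
-// statement of its exact error bound): vrecpsq_f32(x, b) computes 2 - x * b,
-// so recip1 = recip0 * (2 - recip0 * b) is one Newton-Raphson step that
-// roughly doubles the number of correct bits of the vrecpeq_f32 estimate
-// before the final multiply by a.
-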
-// Divides the scalar single-precision floating point value of a by b.
-// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
- float32_t value =
- vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Compute the approximate reciprocal of packed single-precision (32-bit)
-// floating-point elements in a, and store the results in dst. The maximum
-// relative error for this approximation is less than 1.5*2^-12.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vdivq_f32(vdupq_n_f32(1.0f), vreinterpretq_f32_m128(in)));
-#else
- float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
- recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
- return vreinterpretq_m128_f32(recip);
-#endif
-}
-
-// Compute the approximate reciprocal of the lower single-precision (32-bit)
-// floating-point element in a, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
-//
-// dst[31:0] := (1.0 / a[31:0])
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
-FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
-{
- return _mm_move_ss(a, _mm_rcp_ps(a));
-}
-
-// Computes the approximations of square roots of the four single-precision,
-// floating-point values of a. First computes reciprocal square roots and then
-// reciprocals of the four values.
-//
-// r0 := sqrt(a0)
-// r1 := sqrt(a1)
-// r2 := sqrt(a2)
-// r3 := sqrt(a3)
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
-#else
- float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
- float32x4_t sq = vrecpeq_f32(recipsq);
- // ??? use step versions of both sqrt and recip for better accuracy?
- return vreinterpretq_m128_f32(sq);
-#endif
-}
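-
-// One way to act on the "???" note above (a sketch, ours, not upstream):
-// refine the vrsqrteq_f32 estimate with one Newton-Raphson step before
-// multiplying back, instead of taking a second raw estimate via vrecpeq_f32
-// (the x == 0 edge case is ignored here):
-//   float32x4_t x = vreinterpretq_f32_m128(in);
-//   float32x4_t e = vrsqrteq_f32(x);                    /* ~1/sqrt(x) */
-//   e = vmulq_f32(e, vrsqrtsq_f32(vmulq_f32(x, e), e)); /* NR step    */
-//   return vreinterpretq_m128_f32(vmulq_f32(x, e));     /* x/sqrt(x)  */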
-
-// Computes the approximation of the square root of the scalar single-precision
-// floating point value of in.
-// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-{
- float32_t value =
- vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
-}
-
-// Computes the approximations of the reciprocal square roots of the four
-// single-precision floating point values of in.
-// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
-{
- return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
-}
-
-// Compute the approximate reciprocal square root of the lower single-precision
-// (32-bit) floating-point element in a, store the result in the lower element
-// of dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
-{
- return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
-FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s16(
- vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
-#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
-
-// Computes the maximums of the four single-precision, floating-point values of
-// a and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
- float32x4_t _a = vreinterpretq_f32_m128(a);
- float32x4_t _b = vreinterpretq_f32_m128(b);
- return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
-#else
- return vreinterpretq_m128_f32(
- vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u8(
- vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s16(
- vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-#define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
-// Computes the minima of the four single-precision, floating-point values of a
-// and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
- float32x4_t _a = vreinterpretq_f32_m128(a);
- float32x4_t _b = vreinterpretq_f32_m128(b);
- return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
-#else
- return vreinterpretq_m128_f32(
- vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u8(
- vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-#define _m_pminub(a, b) _mm_min_pu8(a, b)
-
-// Computes the maximum of the two lower scalar single-precision floating point
-// values of a and b.
-// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-{
- float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Computes the minimum of the two lower scalar single-precision floating point
-// values of a and b.
-// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-{
- float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
-// 16 unsigned 8-bit integers from b.
-// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
-// 16 unsigned 8-bit integers from b.
-// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
-// signed 16-bit integers from b.
-// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
-FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
-FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
-FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
-FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
-// signed 16-bit integers from b.
-// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// epi versions of min/max
-// Computes the pairwise maxima of the four signed 32-bit integer values of a
-// and b.
-//
-// A 128-bit parameter that can be defined with the following equations:
-// r0 := (a0 > b0) ? a0 : b0
-// r1 := (a1 > b1) ? a1 : b1
-// r2 := (a2 > b2) ? a2 : b2
-// r3 := (a3 > b3) ? a3 : b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the pairwise minima of the four signed 32-bit integer values of a
-// and b.
-//
-// A 128-bit parameter that can be defined with the following equations:
-// r0 := (a0 < b0) ? a0 : b0
-// r1 := (a1 < b1) ? a1 : b1
-// r2 := (a2 < b2) ? a2 : b2
-// r3 := (a3 < b3) ? a3 : b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u32(
- vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
-FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u32(
- vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
-FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u16(vshrn_n_u32(
- vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
-}
-
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
-// integers from b.
-//
-// r0 := (a0 * b0)[31:16]
-// r1 := (a1 * b1)[31:16]
-// ...
-// r7 := (a7 * b7)[31:16]
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
-{
- /* FIXME: issue with large values because of result saturation */
- // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
- // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
- // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
- int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
- int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
- int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
- int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
- int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
- int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
- uint16x8x2_t r =
- vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
- return vreinterpretq_m128i_u16(r.val[1]);
-}
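-
-// Worked example (illustrative, ours): 16384 * 16384 = 0x10000000, whose
-// high 16 bits are 0x1000, so lanes holding 0x4000 map to 0x1000. The
-// vuzpq_u16 above de-interleaves the 32-bit products so that r.val[1]
-// collects exactly those high 16-bit halves.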
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
-FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
-{
- uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
- uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
- uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__)
- uint32x4_t ab7654 =
- vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
- uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
- vreinterpretq_u16_u32(ab7654));
- return vreinterpretq_m128i_u16(r);
-#else
- uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
- uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
- uint32x4_t ab7654 = vmull_u16(a7654, b7654);
- uint16x8x2_t r =
- vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
- return vreinterpretq_m128i_u16(r.val[1]);
-#endif
-}
-
-// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
- float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
- float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
- float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(
- vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
-#endif
-}
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
-{
- int16x8_t a = vreinterpretq_s16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
-#else
- return vreinterpretq_m128i_s16(
- vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
- vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of single-precision (32-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
-FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(vsubq_f32(
- vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
- vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
-#else
- float32x4x2_t c =
- vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
- return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
-FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s16(
- vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
-FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s32(
- vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
-}
-
-// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and
-// pack the signed 16-bit results in dst.
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
-{
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
- // Interleave using vshrn/vmovn
- // [a0|a2|a4|a6|b0|b2|b4|b6]
- // [a1|a3|a5|a7|b1|b3|b5|b7]
- int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
- int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
- // Subtract
- return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
-}
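-
-// How the split above works (illustrative, ours): vmovn_s32 narrows each
-// 32-bit lane to its low 16 bits, i.e. the even-indexed elements 0,2,4,6 of
-// the original 16-bit vectors, while vshrn_n_s32(x, 16) keeps the high 16
-// bits, i.e. the odd-indexed elements 1,3,5,7. Subtracting ab1357 from
-// ab0246 therefore yields a0-a1, a2-a3, ..., b6-b7, as PHSUBW requires.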
-
-// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-// saturation, and pack the signed 16-bit results in dst.
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
- int16x8_t a = vreinterpretq_s16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
- // Interleave using vshrn/vmovn
- // [a0|a2|a4|a6|b0|b2|b4|b6]
- // [a1|a3|a5|a7|b1|b3|b5|b7]
- int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
- int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
- // Saturated add
- return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-// using saturation, and pack the signed 16-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
-FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
- int16x8_t a = vreinterpretq_s16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
- // Interleave using vshrn/vmovn
- // [a0|a2|a4|a6|b0|b2|b4|b6]
- // [a1|a3|a5|a7|b1|b3|b5|b7]
- int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
- int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
- // Saturated subtract
- return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
-#endif
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
-{
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
- return vreinterpretq_m128i_s32(
- vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
- vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
-}
-
-// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and
-// pack the signed 32-bit results in dst.
-FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
-{
- int64x2_t a = vreinterpretq_s64_m128i(_a);
- int64x2_t b = vreinterpretq_s64_m128i(_b);
- // Interleave using vshrn/vmovn
- // [a0|a2|b0|b2]
-// [a1|a3|b1|b3]
- int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
- int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
- // Subtract
- return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
-}
-
-// Kahan summation for accurate summation of floating-point numbers.
-// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
-FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y)
-{
- y -= *c;
- float t = *sum + y;
- *c = (t - *sum) - y;
- *sum = t;
-}
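-
-// Illustrative sketch (ours, not upstream sse2neon): accumulating an array
-// with sse2neon_kadd_f32 keeps the compensation term alongside the running
-// sum, which bounds the rounding error independently of n; the final fold of
-// the compensation mirrors the s += c step _mm_dp_ps below performs.
-FORCE_INLINE float sse2neon_demo_kahan_sum(const float *p, int n)
-{
-    float sum = 0.0f, comp = 0.0f; /* comp carries the lost low-order bits */
-    for (int i = 0; i < n; i++)
-        sse2neon_kadd_f32(&sum, &comp, p[i]);
-    return sum + comp;
-}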
-
-// Conditionally multiply the packed single-precision (32-bit) floating-point
-// elements in a and b using the high 4 bits of imm8, sum the four products,
-// and conditionally store the sum in dst using the low 4 bits of imm8.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
-FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
-{
-#if defined(__aarch64__)
- /* shortcuts */
- if (imm == 0xFF) {
- return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
- }
- if (imm == 0x7F) {
- float32x4_t m = _mm_mul_ps(a, b);
- m[3] = 0;
- return _mm_set1_ps(vaddvq_f32(m));
- }
-#endif
-
- float s = 0, c = 0;
- float32x4_t f32a = vreinterpretq_f32_m128(a);
- float32x4_t f32b = vreinterpretq_f32_m128(b);
-
- /* To improve the accuracy of floating-point summation, Kahan algorithm
- * is used for each operation.
- */
- if (imm & (1 << 4))
- sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
- if (imm & (1 << 5))
- sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
- if (imm & (1 << 6))
- sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
- if (imm & (1 << 7))
- sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
- s += c;
-
- float32x4_t res = {
- (imm & 0x1) ? s : 0,
- (imm & 0x2) ? s : 0,
- (imm & 0x4) ? s : 0,
- (imm & 0x8) ? s : 0,
- };
- return vreinterpretq_m128_f32(res);
-}
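-
-// Worked example (illustrative, ours): imm = 0x71 has bits 4..6 set, so only
-// the lane products a0*b0, a1*b1 and a2*b2 are summed, and bit 0 set, so the
-// sum lands in lane 0 only; with a = b = {1, 2, 3, 4} the result is
-// {1 + 4 + 9, 0, 0, 0} = {14, 0, 0, 0}.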
-
-/* Compare operations */
-
-// Compares for less than
-// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(
- vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for less than
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
-FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmplt_ps(a, b));
-}
-
-// Compares for greater than.
-//
-// r0 := (a0 > b0) ? 0xffffffff : 0x0
-// r1 := (a1 > b1) ? 0xffffffff : 0x0
-// r2 := (a2 > b2) ? 0xffffffff : 0x0
-// r3 := (a3 > b3) ? 0xffffffff : 0x0
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(
- vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for greater than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
-}
-
-// Compares for greater than or equal.
-// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(
- vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for greater than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpge_ps(a, b));
-}
-
-// Compares for less than or equal.
-//
-// r0 := (a0 <= b0) ? 0xffffffff : 0x0
-// r1 := (a1 <= b1) ? 0xffffffff : 0x0
-// r2 := (a2 <= b2) ? 0xffffffff : 0x0
-// r3 := (a3 <= b3) ? 0xffffffff : 0x0
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(
- vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for less than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
-FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmple_ps(a, b));
-}
-
-// Compares for equality.
-// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for equality.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
-}
-
-// Compares for inequality.
-// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(vmvnq_u32(
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compares for inequality.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
-}
-
-// Compares for not greater than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
-{
- return _mm_cmplt_ps(a, b);
-}
-
-// Compares for not greater than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
-{
- return _mm_cmplt_ss(a, b);
-}
-
-// Compares for not greater than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
-{
- return _mm_cmple_ps(a, b);
-}
-
-// Compares for not greater than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
-{
- return _mm_cmple_ss(a, b);
-}
-
-// Compares for not less than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
-{
- return _mm_cmpgt_ps(a, b);
-}
-
-// Compares for not less than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
-{
- return _mm_cmpgt_ss(a, b);
-}
-
-// Compares for not less than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
-{
- return _mm_cmpge_ps(a, b);
-}
-
-// Compares for not less than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
-{
- return _mm_cmpge_ss(a, b);
-}
-
-// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
-// unsigned 8-bit integers in b for equality.
-// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
-// unsigned 16-bit integers in b for equality.
-// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed 32-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u32(
- vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed 64-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_u64(
- vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
-#else
- // ARMv7 lacks vceqq_u64
- // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
- uint32x4_t cmp =
- vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
- uint32x4_t swapped = vrev64q_u32(cmp);
- return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
-// in b for less than.
-// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
-// in b for greater than.
-//
-// r0 := (a0 > b0) ? 0xff : 0x0
-// r1 := (a1 > b1) ? 0xff : 0x0
-// ...
-// r15 := (a15 > b15) ? 0xff : 0x0
-//
-// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
-// in b for less than.
-//
-// r0 := (a0 < b0) ? 0xffff : 0x0
-// r1 := (a1 < b1) ? 0xffff : 0x0
-// ...
-// r7 := (a7 < b7) ? 0xffff : 0x0
-//
-// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
-// in b for greater than.
-//
-// r0 := (a0 > b0) ? 0xffff : 0x0
-// r1 := (a1 > b1) ? 0xffff : 0x0
-// ...
-// r7 := (a7 > b7) ? 0xffff : 0x0
-//
-// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
-// in b for less than.
-// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u32(
- vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
-// in b for greater than.
-// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u32(
- vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
-// in b for greater than.
-FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_u64(
- vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
- // ARMv7 lacks vcgtq_s64.
- // This is based off of Clang's SSE2 polyfill:
- // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
-
- // Mask the sign bit out since we need a signed AND an unsigned comparison
- // and it is ugly to try and split them.
- int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
- int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
- int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
- // Check if a > b
- int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
- // Copy upper mask to lower mask
- // a_hi > b_hi
- int64x2_t gt_hi = vshrq_n_s64(greater, 63);
- // Copy lower mask to upper mask
- // a_lo > b_lo
- int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
- // Compare for equality
- int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
- // Copy upper mask to lower mask
- // a_hi == b_hi
- int64x2_t eq_hi = vshrq_n_s64(equal, 63);
- // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
- int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
- return vreinterpretq_m128i_s64(ret);
-#endif
-}
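-
-// Why the sign-bit flip above works (illustrative, ours): XORing the low
-// 32-bit word of each lane with 0x80000000 maps the unsigned ordering that
-// word needs onto the signed ordering vcgtq_s32 provides, while equality is
-// unaffected. E.g. low words 0xFFFFFFFF vs 0x00000001 become 0x7FFFFFFF vs
-// 0x80000001, which compare correctly (greater) as signed values.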
-
-// Compares the four 32-bit floats in a and b to check if any values are NaN.
-// Ordered compare between each value returns true for "orderable" and false for
-// "not orderable" (NaN).
-// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
-// also:
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-{
- // Note: NEON does not have ordered compare builtin
- // Need to compare a eq a and b eq b to check for NaN
- // Do AND of results to get final
- uint32x4_t ceqaa =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t ceqbb =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
-}
-
-// Compares for ordered.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpord_ps(a, b));
-}
-
-// Compares for unordered.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
-{
- uint32x4_t f32a =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t f32b =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
-}
-
-// Compares for unordered.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a less than operation.
-// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
-// Important note: the MSDN documentation is incorrect. If either value is a
-// NaN, the docs say the result is one, but it is in fact zero.
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
-{
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
- uint32x4_t a_lt_b =
- vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
- return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a greater than operation.
-// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
-FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
-{
- // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
- // vreinterpretq_f32_m128(b)), 0);
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
- uint32x4_t a_gt_b =
- vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
- return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a less than or equal operation.
-// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
-FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
-{
- // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
- // vreinterpretq_f32_m128(b)), 0);
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
- uint32x4_t a_le_b =
- vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
- return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a greater than or equal operation.
-// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
-FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
-{
- // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
- // vreinterpretq_f32_m128(b)), 0);
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
- uint32x4_t a_ge_b =
- vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
- return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using an equality operation.
-// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
-FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
-{
- // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
- // vreinterpretq_f32_m128(b)), 0);
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
- uint32x4_t a_eq_b =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
- return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using an inequality operation.
-// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
-FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
-{
- // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
- // vreinterpretq_f32_m128(b)), 0);
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
- uint32x4_t a_neq_b = vmvnq_u32(
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
-}
-
-// According to the documentation, these intrinsics behave the same as the
-// non-'u' versions, so we simply alias them here.
-#define _mm_ucomilt_ss _mm_comilt_ss
-#define _mm_ucomile_ss _mm_comile_ss
-#define _mm_ucomigt_ss _mm_comigt_ss
-#define _mm_ucomige_ss _mm_comige_ss
-#define _mm_ucomieq_ss _mm_comieq_ss
-#define _mm_ucomineq_ss _mm_comineq_ss
-
-/* Conversions */
-
-// Convert packed signed 32-bit integers in b to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, and copy the upper 2 packed elements from a to the upper elements of
-// dst.
-//
-// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
-// dst[95:64] := a[95:64]
-// dst[127:96] := a[127:96]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
-FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
-{
- return vreinterpretq_m128_f32(
- vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
- vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-//
-// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
-FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
-{
- return vreinterpretq_m128_f32(
- vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-//
-// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
-#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
-
-// Convert the signed 64-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-//
-// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
-FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
-{
- return vreinterpretq_m128_f32(
- vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
-FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
-{
-#if defined(__aarch64__)
- return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-#else
- float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- float32_t diff = data - floor(data);
- if (diff > 0.5)
- return (int32_t) ceil(data);
- if (diff == 0.5) {
- int32_t f = (int32_t) floor(data);
- int32_t c = (int32_t) ceil(data);
- return c & 1 ? f : c;
- }
- return (int32_t) floor(data);
-#endif
-}
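-
-// Worked example (illustrative, ours): the fallback implements the SSE
-// default round-half-to-even. For 2.5, diff == 0.5 and ceil == 3 is odd, so
-// floor == 2 is returned; for 3.5, ceil == 4 is even and is returned; for
-// 2.3, diff < 0.5 and floor == 2 is returned.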
-
-// Convert packed 16-bit integers in a to packed single-precision (32-bit)
-// floating-point elements, and store the results in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// m := j*32
-// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
-FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
-{
- return vreinterpretq_m128_f32(
- vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
-}
-
-// Convert packed 32-bit integers in b to packed single-precision (32-bit)
-// floating-point elements, store the results in the lower 2 elements of dst,
-// and copy the upper 2 packed elements from a to the upper elements of dst.
-//
-// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
-// dst[95:64] := a[95:64]
-// dst[127:96] := a[127:96]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
-FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
-{
- return vreinterpretq_m128_f32(
- vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
- vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, then convert the packed signed 32-bit integers in b to
-// single-precision (32-bit) floating-point elements, and store the results in
-// the upper 2 elements of dst.
-//
-// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
-// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
-// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
-// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
-FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
-{
- return vreinterpretq_m128_f32(vcvtq_f32_s32(
- vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
-}
-
-// Convert the lower packed 8-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-//
-// FOR j := 0 to 3
-// i := j*8
-// m := j*32
-// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
-FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
-{
- return vreinterpretq_m128_f32(vcvtq_f32_s32(
- vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
-}
-
-// Convert packed unsigned 16-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// m := j*32
-// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
-FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
-{
- return vreinterpretq_m128_f32(
- vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
-}
-
-// Convert the lower packed unsigned 8-bit integers in a to packed
-// single-precision (32-bit) floating-point elements, and store the results in
-// dst.
-//
-// FOR j := 0 to 3
-// i := j*8
-// m := j*32
-// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
-FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
-{
- return vreinterpretq_m128_f32(vcvtq_f32_u32(
- vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
-}
-
-// Converts the four single-precision, floating-point values of a to signed
-// 32-bit integer values using truncation.
-// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
-{
- return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-//
-// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
-FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
-{
-#if defined(__aarch64__)
- return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
- double ret = *((double *) &a);
- return (int64_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-//
-// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
-#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
-
-// Converts the four signed 32-bit integer values of a to single-precision,
-// floating-point values
-// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
-{
- return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
-// unsigned 16-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
-{
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
- uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
- return vreinterpretq_m128i_u16(u16x8);
-}
-
-// Converts the four unsigned 8-bit integers in the lower 32 bits to four
-// unsigned 32-bit integers.
-// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
-FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
-{
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
- uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
- uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
- return vreinterpretq_m128i_u32(u32x4);
-}
-
-// Converts the two unsigned 8-bit integers in the lower 16 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
-{
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
- uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
- uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
- uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
- return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Converts the eight signed 8-bit integers in the lower 64 bits to eight
-// signed 16-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
-{
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
- int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
- return vreinterpretq_m128i_s16(s16x8);
-}
-
-// Converts the four signed 8-bit integers in the lower 32 bits to four
-// signed 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
-{
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
- int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
- int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
- return vreinterpretq_m128i_s32(s32x4);
-}
-
-// Converts the two signed 8-bit integers in the lower 16 bits to two
-// signed 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
-{
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
- int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
- int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
- int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
- return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Converts the four signed 16-bit integers in the lower 64 bits to four signed
-// 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
-{
- return vreinterpretq_m128i_s32(
- vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
-}
-
-// Converts the two signed 16-bit integers in the lower 32 bits to two signed
-// 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
-{
- int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
- int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
- int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
- return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Converts the four unsigned 16-bit integers in the lower 64 bits to four
-// unsigned 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
-{
- return vreinterpretq_m128i_u32(
- vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
-}
-
-// Converts the two unsigned 16-bit integers in the lower 32 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
-{
- uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
- uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
- uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
- return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Converts the two unsigned 32-bit integers in the lower 64 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
-{
- return vreinterpretq_m128i_u64(
- vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
-}
-
-// Converts the two signed 32-bit integers in the lower 64 bits to two signed
-// 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
-{
- return vreinterpretq_m128i_s64(
- vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
-}
-
-// Converts the four single-precision, floating-point values of a to signed
-// 32-bit integer values.
-//
-// r0 := (int) a0
-// r1 := (int) a1
-// r2 := (int) a2
-// r3 := (int) a3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
-// does not support! It is supported on ARMv8-A however.
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
-#else
- uint32x4_t signmask = vdupq_n_u32(0x80000000);
- float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
- vdupq_n_f32(0.5f)); /* +/- 0.5 */
- int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
- vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
- int32x4_t r_trunc =
- vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
- int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
- vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
- int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
- vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
- float32x4_t delta = vsubq_f32(
- vreinterpretq_f32_m128(a),
- vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
- uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
- return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
-#endif
-}
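-
-// Worked trace (illustrative, ours) of the ARMv7 branch for a lane of 2.5:
-// r_normal = [2.5 + 0.5] = 3, r_trunc = 2, plusone = 1, so
-// r_even = (2 + 1) & ~1 = 2; delta == 0.5 selects r_even, giving the
-// round-to-even result 2. A lane of 2.3 has delta != 0.5 and keeps
-// r_normal = [2.3 + 0.5] = 2.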
-
-// Copy the lower 32-bit integer in a to dst.
-//
-// dst[31:0] := a[31:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
-FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
-{
- return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-//
-// dst[63:0] := a[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
-FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
-{
- return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-//
-// dst[63:0] := a[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
-// Moves 32-bit integer a to the least significant 32 bits of an __m128i
-// object, zero extending the upper bits.
-//
-// r0 := a
-// r1 := 0x0
-// r2 := 0x0
-// r3 := 0x0
-//
-// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
-{
- return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
-}
-
-// Moves 64-bit integer a to the least significant 64 bits of an __m128i
-// object, zero extending the upper bits.
-//
-// r0 := a
-// r1 := 0x0
-FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
-{
- return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
-}
-
-// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
-FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
-{
- return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
-}
-
-// Applies a type cast to reinterpret four 32-bit floating point values passed
-// in as a 128-bit parameter as packed 32-bit integers.
-// https://msdn.microsoft.com/en-us/library/bb514099.aspx
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
-{
- return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
-}
-
-// Applies a type cast to reinterpret four 32-bit integers passed in as a
-// 128-bit parameter as packed 32-bit floating point values.
-// https://msdn.microsoft.com/en-us/library/bb514029.aspx
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
-{
- return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
-}
-
-// Loads a 128-bit value.
-// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
-{
- return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
-FORCE_INLINE __m128d _mm_load1_pd(const double *p)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
-#else
- return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// upper element of dst, and copy the lower element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-//
-// dst[63:0] := a[63:0]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
-FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(
- vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
-#else
- return vreinterpretq_m128d_f32(vcombine_f32(
- vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
-#define _mm_load_pd1 _mm_load1_pd
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
-#define _mm_loaddup_pd _mm_load1_pd
-
-// Loads 128-bit value.
-// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
-{
- return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load unaligned 32-bit integer from memory into the first element of dst.
-//
-// dst[31:0] := MEM[mem_addr+31:mem_addr]
-// dst[MAX:32] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
-FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
-{
- return vreinterpretq_m128i_s32(
- vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed single-precision (32-bit) floating-point elements, and store the
-// results in dst.
-//
-// FOR j := 0 to 1
-// i := 32*j
-// k := 64*j
-//   dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
-// ENDFOR
-// dst[127:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
-FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
-{
-#if defined(__aarch64__)
- float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
- return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
-#else
- float a0 = (float) ((double *) &a)[0];
- float a1 = (float) ((double *) &a)[1];
- return _mm_set_ps(0, 0, a1, a0);
-#endif
-}
-
-// Copy the lower double-precision (64-bit) floating-point element of a to dst.
-//
-// dst[63:0] := a[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
-FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
-{
-#if defined(__aarch64__)
- return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
-#else
- return ((double *) &a)[0];
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed double-precision (64-bit) floating-point elements, and store the
-// results in dst.
-//
-// FOR j := 0 to 1
-// i := 64*j
-// k := 32*j
-// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
-FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(
- vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
-#else
- double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
- return _mm_set_pd(a1, a0);
-#endif
-}
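
A quick usage sketch of the conversion trio above (hypothetical helper; the
intrinsics come from this header on ARM, or <emmintrin.h> on x86):

static void cvt_demo(void)
{
    __m128 s = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes 0..3 = 1,2,3,4 */
    __m128d d = _mm_cvtps_pd(s);    /* widens the two low lanes: {1.0, 2.0} */
    __m128 back = _mm_cvtpd_ps(d);  /* narrows again: {1, 2, 0, 0} */
    double lo = _mm_cvtsd_f64(d);   /* lo == 1.0 */
    (void) back; (void) lo;
}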
-
-// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
-FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
-{
- return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Cast vector of type __m128d to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
-FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
-{
- return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
-FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
-{
- return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask),
- vreinterpretq_f32_m128(b),
- vreinterpretq_f32_m128(a)));
-}
-
-// Blend packed double-precision (64-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
-FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
-{
- uint64x2_t mask =
- vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
-#if defined(__aarch64__)
- float64x2_t a = vreinterpretq_f64_m128d(_a);
- float64x2_t b = vreinterpretq_f64_m128d(_b);
- return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
-#else
- uint64x2_t a = vreinterpretq_u64_m128d(_a);
- uint64x2_t b = vreinterpretq_u64_m128d(_b);
- return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
-#endif
-}
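
Note that x86 blendv keys off only the sign bit of each mask lane, whereas the
plain bit-select used for _mm_blendv_ps above assumes all bits in a mask lane
agree -- which holds for masks produced by compares. A usage sketch
(hypothetical helper):

static void blendv_demo(void)
{
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    __m128 b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f);
    __m128 mask = _mm_cmpgt_ps(a, _mm_set1_ps(2.0f)); /* all-ones where a > 2 */
    __m128 r = _mm_blendv_ps(a, b, mask);             /* {1, 2, 30, 40} */
    (void) r;
}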
-
-// Round the packed single-precision (32-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
-FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
-{
-#if defined(__aarch64__)
- switch (rounding) {
- case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
- return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
- case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
- return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
- case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
- return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
- case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
- return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
- default: //_MM_FROUND_CUR_DIRECTION
- return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
- }
-#else
- float *v_float = (float *) &a;
- __m128 zero, neg_inf, pos_inf;
-
- switch (rounding) {
- case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
- return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
- case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
- return (__m128){floorf(v_float[0]), floorf(v_float[1]),
- floorf(v_float[2]), floorf(v_float[3])};
- case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
- return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
- ceilf(v_float[3])};
- case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
- zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
- neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
- floorf(v_float[2]), floorf(v_float[3]));
- pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
- ceilf(v_float[2]), ceilf(v_float[3]));
- return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
- default: //_MM_FROUND_CUR_DIRECTION
- return (__m128){roundf(v_float[0]), roundf(v_float[1]),
- roundf(v_float[2]), roundf(v_float[3])};
- }
-#endif
-}
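
A sketch of the explicit rounding modes (hypothetical helper); note that the
nearest-int mode rounds ties to even:

static void round_demo(void)
{
    __m128 v = _mm_set_ps(-1.5f, 2.5f, -0.5f, 0.5f);
    __m128 r = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    /* Ties to even: 0.5 -> 0, -0.5 -> -0, 2.5 -> 2, -1.5 -> -2 */
    (void) r;
}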
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := 32*j
-// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
-FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
-{
-#if defined(__aarch64__)
- return vreinterpret_m64_s32(
- vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
-#else
- return vreinterpret_m64_s32(
- vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
- _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a up to
-// an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
-FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
-{
- return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a down
-// to an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
-FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
-{
- return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
-}
-
-
-// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
-// may perform better than _mm_loadu_si128 when the data crosses a cache line
-// boundary.
-//
-// dst[127:0] := MEM[mem_addr+127:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
-#define _mm_lddqu_si128 _mm_loadu_si128
-
-/* Miscellaneous Operations */
-
-// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-// r0 := a0 >> count
-// r1 := a1 >> count
-// ...
-// r7 := a7 >> count
-//
-// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
-{
- int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
- if (c > 15)
- return _mm_cmplt_epi16(a, _mm_setzero_si128());
- return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
-}
-
-// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-// r0 := a0 >> count
-// r1 := a1 >> count
-// r2 := a2 >> count
-// r3 := a3 >> count
-//
-// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
-FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
-{
- int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
- if (c > 31)
- return _mm_cmplt_epi32(a, _mm_setzero_si128());
- return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
-}
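
Shift-count semantics in miniature (hypothetical helper):

static void sra_demo(void)
{
    __m128i v = _mm_set_epi32(-8, 8, -1, 1);
    __m128i r = _mm_sra_epi32(v, _mm_cvtsi32_si128(2));
    /* Per lane: 1 >> 2 == 0, -1 >> 2 == -1, 8 >> 2 == 2, -8 >> 2 == -2.
     * Counts above 31 flood each lane with its sign bit, as on x86. */
    (void) r;
}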
-
-// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
-// saturates.
-// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
- vqmovn_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
-// integers and saturates.
-//
-// r0 := UnsignedSaturate(a0)
-// r1 := UnsignedSaturate(a1)
-// ...
-// r7 := UnsignedSaturate(a7)
-// r8 := UnsignedSaturate(b0)
-// r9 := UnsignedSaturate(b1)
-// ...
-// r15 := UnsignedSaturate(b7)
-//
-// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
- vqmovun_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
-// and saturates.
-//
-// r0 := SignedSaturate(a0)
-// r1 := SignedSaturate(a1)
-// r2 := SignedSaturate(a2)
-// r3 := SignedSaturate(a3)
-// r4 := SignedSaturate(b0)
-// r5 := SignedSaturate(b1)
-// r6 := SignedSaturate(b2)
-// r7 := SignedSaturate(b3)
-//
-// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
- vqmovn_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
-// integers and saturates.
-//
-// r0 := UnsignedSaturate(a0)
-// r1 := UnsignedSaturate(a1)
-// r2 := UnsignedSaturate(a2)
-// r3 := UnsignedSaturate(a3)
-// r4 := UnsignedSaturate(b0)
-// r5 := UnsignedSaturate(b1)
-// r6 := UnsignedSaturate(b2)
-// r7 := UnsignedSaturate(b3)
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
- vqmovun_s32(vreinterpretq_s32_m128i(b))));
-}
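
Both pack flavours clamp each 32-bit lane before narrowing; a saturation
sketch (hypothetical helper):

static void pack_demo(void)
{
    __m128i v = _mm_set_epi32(70000, -5, 40000, 123);
    __m128i u = _mm_packus_epi32(v, v); /* u16: 123, 40000, 0, 65535, ... */
    __m128i s = _mm_packs_epi32(v, v);  /* s16: 123, 32767, -5, 32767, ... */
    (void) u; (void) s;
}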
-
-// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
-// 8 signed or unsigned 8-bit integers in b.
-//
-// r0 := a0
-// r1 := b0
-// r2 := a1
-// r3 := b1
-// ...
-// r14 := a7
-// r15 := b7
-//
-// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s8(
- vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
- int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
- int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
- int8x8x2_t result = vzip_s8(a1, b1);
- return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
-// lower 4 signed or unsigned 16-bit integers in b.
-//
-// r0 := a0
-// r1 := b0
-// r2 := a1
-// r3 := b1
-// r4 := a2
-// r5 := b2
-// r6 := a3
-// r7 := b3
-//
-// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s16(
- vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
- int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
- int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
- int16x4x2_t result = vzip_s16(a1, b1);
- return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
-// lower 2 signed or unsigned 32-bit integers in b.
-//
-// r0 := a0
-// r1 := b0
-// r2 := a1
-// r3 := b1
-//
-// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s32(
- vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
- int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
- int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
- int32x2x2_t result = vzip_s32(a1, b1);
- return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
-{
- int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
- int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
- return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
-}
-
-// Selects and interleaves the lower two single-precision, floating-point values
-// from a and b.
-//
-// r0 := a0
-// r1 := b0
-// r2 := a1
-// r3 := b1
-//
-// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
- float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
- float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
- float32x2x2_t result = vzip_f32(a1, b1);
- return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Selects and interleaves the upper two single-precision, floating-point values
-// from a and b.
-//
-// r0 := a2
-// r1 := b2
-// r2 := a3
-// r3 := b3
-//
-// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
- float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
- float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
- float32x2x2_t result = vzip_f32(a1, b1);
- return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
-// 8 signed or unsigned 8-bit integers in b.
-//
-// r0 := a8
-// r1 := b8
-// r2 := a9
-// r3 := b9
-// ...
-// r14 := a15
-// r15 := b15
-//
-// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s8(
- vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
- int8x8_t a1 =
- vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
- int8x8_t b1 =
- vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
- int8x8x2_t result = vzip_s8(a1, b1);
- return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
-// upper 4 signed or unsigned 16-bit integers in b.
-//
-// r0 := a4
-// r1 := b4
-// r2 := a5
-// r3 := b5
-// r4 := a6
-// r5 := b6
-// r6 := a7
-// r7 := b7
-//
-// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s16(
- vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
- int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
- int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
- int16x4x2_t result = vzip_s16(a1, b1);
- return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
-// upper 2 signed or unsigned 32-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s32(
- vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
- int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
- int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
- int32x2x2_t result = vzip_s32(a1, b1);
- return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper signed or unsigned 64-bit integer in a with the
-// upper signed or unsigned 64-bit integer in b.
-//
-// r0 := a1
-// r1 := b1
-FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
-{
- int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
- int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
- return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
-}
-
-// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
-// in a, store the minimum and index in dst, and zero the remaining bits in dst.
-//
-// index[2:0] := 0
-// min[15:0] := a[15:0]
-// FOR j := 0 to 7
-// i := j*16
-// IF a[i+15:i] < min[15:0]
-// index[2:0] := j
-// min[15:0] := a[i+15:i]
-// FI
-// ENDFOR
-// dst[15:0] := min[15:0]
-// dst[18:16] := index[2:0]
-// dst[127:19] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
-FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
-{
- __m128i dst;
- uint16_t min, idx = 0;
- // Find the minimum value
-#if defined(__aarch64__)
- min = vminvq_u16(vreinterpretq_u16_m128i(a));
-#else
- __m64 tmp;
- tmp = vreinterpret_m64_u16(
- vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
- vget_high_u16(vreinterpretq_u16_m128i(a))));
- tmp = vreinterpret_m64_u16(
- vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
- tmp = vreinterpret_m64_u16(
- vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
- min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
-#endif
- // Get the index of the minimum value
- int i;
- for (i = 0; i < 8; i++) {
- if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
- idx = (uint16_t) i;
- break;
- }
- a = _mm_srli_si128(a, 2);
- }
- // Generate result
- dst = _mm_setzero_si128();
- dst = vreinterpretq_m128i_u16(
- vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
- dst = vreinterpretq_m128i_u16(
- vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
- return dst;
-}
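
Usage sketch (hypothetical helper) -- the result packs the minimum into lane
0 and its index into lane 1:

static void minpos_demo(void)
{
    __m128i v = _mm_set_epi16(9, 8, 7, 3, 11, 12, 13, 14);
    __m128i r = _mm_minpos_epu16(v);
    int min = _mm_extract_epi16(r, 0); /* 3 */
    int idx = _mm_extract_epi16(r, 1); /* 4: the lane holding the minimum */
    (void) min; (void) idx;
}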
-
-// Concatenate a and b, then shift the 256-bit composite right by c bytes.
-// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
-// http://blog.csdn.net/hemmingway/article/details/44828303
-// Clang requires a macro here, as it is extremely picky about c being a
-// literal.
-#define _mm_alignr_epi8(a, b, c) \
- ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c)))
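
alignr treats a:b as one 256-bit value and shifts it right by c bytes; a
sketch (hypothetical helper):

static void alignr_demo(void)
{
    __m128i lo = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                              7, 6, 5, 4, 3, 2, 1, 0);         /* bytes 0..15 */
    __m128i hi = _mm_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
                              23, 22, 21, 20, 19, 18, 17, 16); /* bytes 16..31 */
    __m128i r = _mm_alignr_epi8(hi, lo, 4); /* bytes 4..19 of the pair */
    (void) r;
}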
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the CF value.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
-FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
-{
- int64x2_t s64 =
- vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
- vreinterpretq_s64_m128i(b));
- return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the ZF value.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
-FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
-{
- int64x2_t s64 =
- vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
- return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
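
testz applied to an XOR'd pair gives the usual branchable 128-bit equality
check (hypothetical helper):

static int equal128(__m128i x, __m128i y)
{
    __m128i diff = _mm_xor_si128(x, y);
    return _mm_testz_si128(diff, diff); /* 1 iff every bit of x equals y */
}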
-
-// Extracts the selected signed or unsigned 8-bit integer from a and zero
-// extends.
-// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
-#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
-
-// Inserts the least significant 8 bits of b into the selected 8-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
-// __constrange(0,16) int imm)
-#define _mm_insert_epi8(a, b, imm) \
- __extension__({ \
- vreinterpretq_m128i_s8( \
- vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
- })
-
-// Extracts the selected signed or unsigned 16-bit integer from a and zero
-// extends.
-// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
- vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
-
-// Inserts the least significant 16 bits of b into the selected 16-bit integer
-// of a.
-// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
-// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
-// __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm) \
- __extension__({ \
- vreinterpretq_m128i_s16( \
- vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
- })
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
-#define _mm_insert_pi16(a, b, imm) \
- __extension__({ \
- vreinterpret_m64_s16( \
- vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
- })
-
-// Extracts the selected signed or unsigned 32-bit integer from a and zero
-// extends.
-// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
-#define _mm_extract_epi32(a, imm) \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
-
-// Extracts the selected single-precision (32-bit) floating-point element
-// from a, as an integer bit pattern.
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
-#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
-
-// Inserts the least significant 32 bits of b into the selected 32-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
-// __constrange(0,4) int imm)
-#define _mm_insert_epi32(a, b, imm) \
- __extension__({ \
- vreinterpretq_m128i_s32( \
- vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
- })
-
-// Extracts the selected signed or unsigned 64-bit integer from a and zero
-// extends.
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
-#define _mm_extract_epi64(a, imm) \
- vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
-
-// Inserts the least significant 64 bits of b into the selected 64-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
-// __constrange(0,2) int imm)
-#define _mm_insert_epi64(a, b, imm) \
- __extension__({ \
- vreinterpretq_m128i_s64( \
- vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
- })
-
-// Count the number of bits set to 1 in unsigned 32-bit integer a, and
-// return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
-FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcount)
- return __builtin_popcount(a);
-#else
- return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
-#endif
-#else
- uint32_t count = 0;
- uint8x8_t input_val, count8x8_val;
- uint16x4_t count16x4_val;
- uint32x2_t count32x2_val;
-
- input_val = vld1_u8((uint8_t *) &a);
- count8x8_val = vcnt_u8(input_val);
- count16x4_val = vpaddl_u8(count8x8_val);
- count32x2_val = vpaddl_u16(count16x4_val);
-
- vst1_u32(&count, count32x2_val);
- return count;
-#endif
-}
-
-// Count the number of bits set to 1 in unsigned 64-bit integer a, and
-// return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
-FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcountll)
- return __builtin_popcountll(a);
-#else
- return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
-#endif
-#else
- uint64_t count = 0;
- uint8x8_t input_val, count8x8_val;
- uint16x4_t count16x4_val;
- uint32x2_t count32x2_val;
- uint64x1_t count64x1_val;
-
- input_val = vld1_u8((uint8_t *) &a);
- count8x8_val = vcnt_u8(input_val);
- count16x4_val = vpaddl_u8(count8x8_val);
- count32x2_val = vpaddl_u16(count16x4_val);
- count64x1_val = vpaddl_u32(count32x2_val);
- vst1_u64(&count, count64x1_val);
- return count;
-#endif
-}
-
-// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
-// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
-// transposed matrix in these vectors (row0 now contains column 0, etc.).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
- do { \
- float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
- float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
- row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
- vget_low_f32(ROW23.val[0])); \
- row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
- vget_low_f32(ROW23.val[1])); \
- row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
- vget_high_f32(ROW23.val[0])); \
- row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
- vget_high_f32(ROW23.val[1])); \
- } while (0)
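
A transpose check (hypothetical helper, assuming the SSE1 loads defined
earlier in this header):

static void transpose_demo(void)
{
    float m[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    __m128 r0 = _mm_loadu_ps(m + 0), r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8), r3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    /* r0 now holds column 0: {0, 4, 8, 12}; r1 holds {1, 5, 9, 13}; etc. */
    (void) r0; (void) r1; (void) r2; (void) r3;
}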
-
-/* Crypto Extensions */
-
-#if defined(__ARM_FEATURE_CRYPTO)
-// Wraps vmull_p64
-FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
- poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
- poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
- return vreinterpretq_u64_p128(vmull_p64(a, b));
-}
-#else // ARMv7 polyfill
-// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
-//
-// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
-// 64-bit->128-bit polynomial multiply.
-//
-// It needs some work and is somewhat slow, but it is still faster than all
-// known scalar methods.
-//
-// Algorithm adapted to C from
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
-// from "Fast Software Polynomial Multiplication on ARM Processors Using the
-// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
-// (https://hal.inria.fr/hal-01506572)
-static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
- poly8x8_t a = vreinterpret_p8_u64(_a);
- poly8x8_t b = vreinterpret_p8_u64(_b);
-
- // Masks
- uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
- vcreate_u8(0x00000000ffffffff));
- uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
- vcreate_u8(0x0000000000000000));
-
- // Do the multiplies, rotating with vext to get all combinations
- uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
- uint8x16_t e =
- vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
- uint8x16_t f =
- vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
- uint8x16_t g =
- vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
- uint8x16_t h =
- vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
- uint8x16_t i =
- vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
- uint8x16_t j =
- vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
- uint8x16_t k =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
-
- // Add cross products
- uint8x16_t l = veorq_u8(e, f); // L = E + F
- uint8x16_t m = veorq_u8(g, h); // M = G + H
- uint8x16_t n = veorq_u8(i, j); // N = I + J
-
- // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
- // instructions.
-#if defined(__aarch64__)
- uint8x16_t lm_p0 = vreinterpretq_u8_u64(
- vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
- uint8x16_t lm_p1 = vreinterpretq_u8_u64(
- vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
- uint8x16_t nk_p0 = vreinterpretq_u8_u64(
- vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
- uint8x16_t nk_p1 = vreinterpretq_u8_u64(
- vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-#else
- uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
- uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
- uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
- uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
-#endif
- // t0 = (L) (P0 + P1) << 8
- // t1 = (M) (P2 + P3) << 16
- uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
- uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
- uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
-
- // t2 = (N) (P4 + P5) << 24
- // t3 = (K) (P6 + P7) << 32
- uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
- uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
- uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
-
- // De-interleave
-#if defined(__aarch64__)
- uint8x16_t t0 = vreinterpretq_u8_u64(
- vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
- uint8x16_t t1 = vreinterpretq_u8_u64(
- vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
- uint8x16_t t2 = vreinterpretq_u8_u64(
- vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
- uint8x16_t t3 = vreinterpretq_u8_u64(
- vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-#else
- uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
- uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
- uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
- uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
-#endif
- // Shift the cross products
- uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
- uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
- uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
- uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
-
- // Accumulate the products
- uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
- uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
- uint8x16_t mix = veorq_u8(d, cross1);
- uint8x16_t r = veorq_u8(mix, cross2);
- return vreinterpretq_u64_u8(r);
-}
-#endif // ARMv7 polyfill
-
-FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
-{
- uint64x2_t a = vreinterpretq_u64_m128i(_a);
- uint64x2_t b = vreinterpretq_u64_m128i(_b);
- switch (imm & 0x11) {
- case 0x00:
- return vreinterpretq_m128i_u64(
- _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
- case 0x01:
- return vreinterpretq_m128i_u64(
- _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
- case 0x10:
- return vreinterpretq_m128i_u64(
- _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
- case 0x11:
- return vreinterpretq_m128i_u64(
- _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
- default:
- abort();
- }
-}
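
A carry-less multiply in miniature (hypothetical helper): the cross terms XOR
away, so (x+1)*(x+1) = x^2 + 1 rather than x^2 + 2x + 1.

static void clmul_demo(void)
{
    __m128i a = _mm_set_epi64x(0, 3); /* the polynomial x + 1 */
    __m128i b = _mm_set_epi64x(0, 3);
    __m128i r = _mm_clmulepi64_si128(a, b, 0x00); /* low word: 5 == x^2 + 1 */
    (void) r;
}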
-
-#if !defined(__ARM_FEATURE_CRYPTO)
-/* clang-format off */
-#define SSE2NEON_AES_DATA(w) \
- { \
- w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
- w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
- w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
- w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
- w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
- w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
- w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
- w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
- w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
- w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
- w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
- w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
- w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
- w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
- w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
- w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
- w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
- w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
- w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
- w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
- w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
- w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
- w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
- w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
- w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
- w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
- w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
- w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
- w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
- w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
- w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
- w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
- w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
- w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
- w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
- w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
- w(0xb0), w(0x54), w(0xbb), w(0x16) \
- }
-/* clang-format on */
-
-/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
-#define SSE2NEON_AES_H0(x) (x)
-static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
-#undef SSE2NEON_AES_H0
-
-// In the absence of crypto extensions, implement aesenc using regular neon
-// intrinsics instead. See:
-// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
-// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
-// for more information. Reproduced with permission of the author.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
-{
-#if defined(__aarch64__)
- static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
- 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
- 0xc, 0x1, 0x6, 0xb};
- static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
- 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
-
- uint8x16_t v;
- uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
-
- // shift rows
- w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
- // sub bytes
- v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w);
- v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
- v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
- v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
-
- // mix columns
- w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
- w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
- w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
- // add round key
- return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
-#else /* ARMv7-A NEON implementation */
-#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
- (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
- (b0))
-#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
-#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
-#define SSE2NEON_AES_U0(p) \
- SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
-#define SSE2NEON_AES_U1(p) \
- SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
-#define SSE2NEON_AES_U2(p) \
- SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
-#define SSE2NEON_AES_U3(p) \
- SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
- static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
- SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
- SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
- SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
- SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
- };
-#undef SSE2NEON_AES_B2W
-#undef SSE2NEON_AES_F2
-#undef SSE2NEON_AES_F3
-#undef SSE2NEON_AES_U0
-#undef SSE2NEON_AES_U1
-#undef SSE2NEON_AES_U2
-#undef SSE2NEON_AES_U3
-
- uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
- uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
- uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
- uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
-
- __m128i out = _mm_set_epi32(
- (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
- aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
- (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
- aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
- (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
- aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
- (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
- aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
-
- return _mm_xor_si128(out, RoundKey);
-#endif
-}
-
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
-    /* FIXME: not yet optimized for NEON */
- uint8_t v[4][4] = {
- {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
- {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
- {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
- {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
- };
- for (int i = 0; i < 16; i++)
- vreinterpretq_nth_u8_m128i(a, i) =
- v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
- return a;
-}
-
-// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
-// This instruction generates a round key for AES encryption. See
-// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
-// for details.
-//
-// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
-{
- uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
- uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
- for (int i = 0; i < 4; ++i) {
- ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
- ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
- }
- return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
- ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
-}
-#undef SSE2NEON_AES_DATA
-
-#else /* __ARM_FEATURE_CRYPTO */
-// Implements the equivalent of 'aesenc' by combining AESE (with an empty key)
-// and AESMC, then manually applying the real key as an XOR operation. This
-// unfortunately costs an extra XOR; the compiler should be able to optimize
-// it away for repeated calls. See
-// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
-// for more details.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
- vreinterpretq_u8_m128i(b));
-}
-
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
- return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
- vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
- RoundKey);
-}
-
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-{
- // AESE does ShiftRows and SubBytes on A
- uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
-
- uint8x16_t dest = {
- // Undo ShiftRows step from AESE and extract X1 and X3
- u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
- u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
- u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
- u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
- };
- uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
- return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
-}
-#endif
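
The standard AES-128 key-expansion idiom built on aeskeygenassist (a usage
sketch with a hypothetical helper; works with either implementation above):

static __m128i aes128_expand_step(__m128i key, __m128i keygened)
{
    keygened = _mm_shuffle_epi32(keygened, 0xFF); /* broadcast the rcon word */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, keygened);
}
/* e.g. round key 1 = aes128_expand_step(k0, _mm_aeskeygenassist_si128(k0, 0x01)) */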
-
-/* Streaming Extensions */
-
-// Guarantees that every preceding store is globally visible before any
-// subsequent store.
-// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_sfence(void)
-{
- __sync_synchronize();
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
-// point elements) from a into memory using a non-temporal memory hint.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
-FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
- __builtin_nontemporal_store(a, (float32x4_t *) p);
-#else
- vst1q_f32(p, vreinterpretq_f32_m128(a));
-#endif
-}
-
-// Stores the data in a to the address p without polluting the caches. If the
-// cache line containing address p is already in the cache, the cache will be
-// updated.
-// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
- __builtin_nontemporal_store(a, p);
-#else
- vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
-#endif
-}
-
-// Load 128-bits of integer data from memory into dst using a non-temporal
-// memory hint. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-//
-// dst[127:0] := MEM[mem_addr+127:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
-FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
-{
-#if __has_builtin(__builtin_nontemporal_load)
- return __builtin_nontemporal_load(p);
-#else
- return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
-#endif
-}
-
-// Cache line containing p is flushed and invalidated from all caches in the
-// coherency domain.
-// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
-FORCE_INLINE void _mm_clflush(void const *p)
-{
- (void) p;
-    // NEON has no direct equivalent; treated as a no-op.
-}
-
-// Allocate aligned blocks of memory.
-// https://software.intel.com/en-us/
-// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
-FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
-{
- void *ptr;
- if (align == 1)
- return malloc(size);
- if (align == 2 || (sizeof(void *) == 8 && align == 4))
- align = sizeof(void *);
- if (!posix_memalign(&ptr, align, size))
- return ptr;
- return NULL;
-}
-
-FORCE_INLINE void _mm_free(void *addr)
-{
- free(addr);
-}
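
Pairing sketch (hypothetical helper) -- portable code must release memory
from _mm_malloc with _mm_free, not plain free():

static void malloc_demo(void)
{
    float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
    if (buf) {
        _mm_store_ps(buf, _mm_set1_ps(0.0f)); /* 16-byte aligned, so safe */
        _mm_free(buf);
    }
}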
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 8-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
- __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
- : [c] "+r"(crc)
- : [v] "r"(v));
-#else
- crc ^= v;
- for (int bit = 0; bit < 8; bit++) {
- if (crc & 1)
- crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
- else
- crc = (crc >> 1);
- }
-#endif
- return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 16-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
- __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
- : [c] "+r"(crc)
- : [v] "r"(v));
-#else
- crc = _mm_crc32_u8(crc, v & 0xff);
- crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
-#endif
- return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 32-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
- __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
- : [c] "+r"(crc)
- : [v] "r"(v));
-#else
- crc = _mm_crc32_u16(crc, v & 0xffff);
- crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
-#endif
- return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 64-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
-FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
- __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
- : [c] "+r"(crc)
- : [v] "r"(v));
-#else
- crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
- crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
-#endif
- return crc;
-}
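
Byte-at-a-time CRC-32C over a buffer, framed with the conventional initial
value and final inversion (hypothetical helper):

#include <stddef.h>
#include <stdint.h>

static uint32_t crc32c_buf(const uint8_t *p, size_t n)
{
    uint32_t crc = 0xFFFFFFFFu;
    while (n--)
        crc = _mm_crc32_u8(crc, *p++);
    return crc ^ 0xFFFFFFFFu;
}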
-
-#if defined(__GNUC__) || defined(__clang__)
-#pragma pop_macro("ALIGN_STRUCT")
-#pragma pop_macro("FORCE_INLINE")
-#endif
-
-#if defined(__GNUC__)
-#pragma GCC pop_options
-#endif
-
-#endif
diff --git a/soxr/src/std-types.h b/soxr/src/std-types.h
new file mode 100644
index 0000000..c5e8636
--- /dev/null
+++ b/soxr/src/std-types.h
@@ -0,0 +1,48 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_std_types_included
+#define soxr_std_types_included
+
+#include "soxr-config.h"
+
+#include <limits.h>
+
+#if HAVE_STDBOOL_H
+  #include <stdbool.h>
+#else
+ #undef bool
+ #undef false
+ #undef true
+ #define bool int
+ #define false 0
+ #define true 1
+#endif
+
+#if HAVE_STDINT_H
+  #include <stdint.h>
+#else
+ #undef int16_t
+ #undef int32_t
+ #undef int64_t
+ #undef uint32_t
+ #undef uint64_t
+ #define int16_t short
+ #if LONG_MAX > 2147483647L
+ #define int32_t int
+ #define int64_t long
+ #elif LONG_MAX < 2147483647L
+ #error this library requires that 'long int' has at least 32-bits
+ #else
+ #define int32_t long
+ #if defined _MSC_VER
+ #define int64_t __int64
+ #else
+ #define int64_t long long
+ #endif
+ #endif
+ #define uint32_t unsigned int32_t
+ #define uint64_t unsigned int64_t
+#endif
+
+#endif
diff --git a/soxr/src/simd.c b/soxr/src/util-simd.c
similarity index 69%
rename from soxr/src/simd.c
rename to soxr/src/util-simd.c
index 7659ab9..ec548fd 100644
--- a/soxr/src/simd.c
+++ b/soxr/src/util-simd.c
@@ -1,15 +1,15 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "simd.h"
-#include "simd-dev.h"
-#define SIMD_ALIGNMENT (sizeof(float) * 4)
+#include "soxr-config.h"
-void * _soxr_simd_aligned_malloc(size_t size)
+#define SIMD_ALIGNMENT (sizeof(float) * (1 + (PFFFT_DOUBLE|AVCODEC_FOUND)) * 4)
+
+void * SIMD_ALIGNED_MALLOC(size_t size)
{
char * p1 = 0, * p = malloc(size + SIMD_ALIGNMENT);
if (p) {
@@ -21,9 +21,9 @@ void * _soxr_simd_aligned_malloc(size_t size)
-void * _soxr_simd_aligned_calloc(size_t nmemb, size_t size)
+void * SIMD_ALIGNED_CALLOC(size_t nmemb, size_t size)
{
- void * p = _soxr_simd_aligned_malloc(nmemb * size);
+ void * p = SIMD_ALIGNED_MALLOC(nmemb * size);
if (p)
memset(p, 0, nmemb * size);
return p;
@@ -31,7 +31,7 @@ void * _soxr_simd_aligned_calloc(size_t nmemb, size_t size)
-void _soxr_simd_aligned_free(void * p1)
+void SIMD_ALIGNED_FREE(void * p1)
{
if (p1)
free(*((void * *)p1 - 1));
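
The scheme above over-allocates by SIMD_ALIGNMENT, rounds the pointer up, and
stashes the original malloc() result in the slot just below the aligned block
so the free routine can recover it. A minimal standalone version (hypothetical
names, 16-byte alignment assumed):

#include <stdlib.h>

#define ALIGN 16 /* assumption: 4 floats, as in a single-precision build */

static void *aligned_malloc(size_t size)
{
    char *p = malloc(size + ALIGN), *p1;
    if (!p)
        return 0;
    /* Round up; malloc's own >= sizeof(void *) alignment keeps the slot
     * below the result inside this allocation. */
    p1 = (char *) (((size_t) p + ALIGN) & ~(size_t) (ALIGN - 1));
    ((void **) p1)[-1] = p; /* remember the real start */
    return p1;
}

static void aligned_free(void *p1)
{
    if (p1)
        free(((void **) p1)[-1]);
}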
@@ -39,11 +39,16 @@ void _soxr_simd_aligned_free(void * p1)
-void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float * b)
+#define PFFT_MACROS_ONLY
+#include "pffft.c"
+
+
+
+void ORDERED_CONVOLVE_SIMD(int n, void * not_used, float * a, float const * b)
{
int i;
float ab0, ab1;
- v4sf * /*RESTRICT*/ va = (v4sf *)a;
+ v4sf * RESTRICT va = (v4sf *)a;
v4sf const * RESTRICT vb = (v4sf const *)b;
assert(VALIGNED(a) && VALIGNED(b));
ab0 = a[0] * b[0], ab1 = a[1] * b[1];
@@ -62,11 +67,11 @@ void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float
-void _soxr_ordered_partial_convolve_simd(int n, float * a, const float * b)
+void ORDERED_PARTIAL_CONVOLVE_SIMD(int n, float * a, float const * b)
{
int i;
float ab0;
- v4sf * /*RESTRICT*/ va = (v4sf *)a;
+ v4sf * RESTRICT va = (v4sf *)a;
v4sf const * RESTRICT vb = (v4sf const *)b;
assert(VALIGNED(a) && VALIGNED(b));
ab0 = a[0] * b[0];
diff --git a/soxr/src/util32s.c b/soxr/src/util32s.c
new file mode 100644
index 0000000..b9c9e08
--- /dev/null
+++ b/soxr/src/util32s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define PFFFT_DOUBLE 0
+
+#include "util32s.h"
+
+#include "util-simd.c"
diff --git a/soxr/src/util32s.h b/soxr/src/util32s.h
new file mode 100644
index 0000000..12226e5
--- /dev/null
+++ b/soxr/src/util32s.h
@@ -0,0 +1,23 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_util32s_included
+#define soxr_util32s_included
+
+#include <stddef.h>
+
+void * _soxr_simd32_aligned_malloc(size_t);
+void * _soxr_simd32_aligned_calloc(size_t, size_t);
+void _soxr_simd32_aligned_free(void *);
+
+#define SIMD_ALIGNED_MALLOC _soxr_simd32_aligned_malloc
+#define SIMD_ALIGNED_CALLOC _soxr_simd32_aligned_calloc
+#define SIMD_ALIGNED_FREE _soxr_simd32_aligned_free
+
+void _soxr_ordered_convolve_simd32(int n, void * not_used, float * a, float const * b);
+void _soxr_ordered_partial_convolve_simd32(int n, float * a, float const * b);
+
+#define ORDERED_CONVOLVE_SIMD _soxr_ordered_convolve_simd32
+#define ORDERED_PARTIAL_CONVOLVE_SIMD _soxr_ordered_partial_convolve_simd32
+
+#endif
diff --git a/soxr/src/util64s.c b/soxr/src/util64s.c
new file mode 100644
index 0000000..0faa9e9
--- /dev/null
+++ b/soxr/src/util64s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define PFFFT_DOUBLE 1
+
+#include "util64s.h"
+
+#include "util-simd.c"
diff --git a/soxr/src/util64s.h b/soxr/src/util64s.h
new file mode 100644
index 0000000..7beeb89
--- /dev/null
+++ b/soxr/src/util64s.h
@@ -0,0 +1,23 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_util64s_included
+#define soxr_util64s_included
+
+#include <stddef.h>
+
+void * _soxr_simd64_aligned_malloc(size_t);
+void * _soxr_simd64_aligned_calloc(size_t, size_t);
+void _soxr_simd64_aligned_free(void *);
+
+#define SIMD_ALIGNED_MALLOC _soxr_simd64_aligned_malloc
+#define SIMD_ALIGNED_CALLOC _soxr_simd64_aligned_calloc
+#define SIMD_ALIGNED_FREE _soxr_simd64_aligned_free
+
+void _soxr_ordered_convolve_simd64(int n, void * not_used, double * a, double const * b);
+void _soxr_ordered_partial_convolve_simd64(int n, double * a, double const * b);
+
+#define ORDERED_CONVOLVE_SIMD _soxr_ordered_convolve_simd64
+#define ORDERED_PARTIAL_CONVOLVE_SIMD _soxr_ordered_partial_convolve_simd64
+
+#endif
diff --git a/soxr/src/vr-coefs.c b/soxr/src/vr-coefs.c
index 14886df..a57bec8 100644
--- a/soxr/src/vr-coefs.c
+++ b/soxr/src/vr-coefs.c
@@ -103,6 +103,9 @@ static void iir(int N, double Fp, char const * name)
int main(int argc, char **argv)
{
+ puts("/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net");
+ puts(" * Licence for this file: LGPL v2.1 See LICENCE for details. */\n");
+
fir(241, 1, .45, .5, 160, 32, "half_fir_coefs");
fir( 24, .5, .25, .5, 1, 31, "fast_half_fir_coefs");
fir( 20, 12, .9 , 1.5, 160, 58, "coefs0_d");
diff --git a/soxr/src/vr-coefs.h b/soxr/src/vr-coefs.h
index 9790ec0..e44138e 100644
--- a/soxr/src/vr-coefs.h
+++ b/soxr/src/vr-coefs.h
@@ -1,3 +1,6 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
static float const half_fir_coefs[] = {
0.471112154f, 0.316907549f, 0.0286963396f, -0.101927032f,
-0.0281272982f, 0.0568029535f, 0.027196876f, -0.0360795942f,
diff --git a/soxr/src/vr32.c b/soxr/src/vr32.c
index 65eed3f..5159603 100644
--- a/soxr/src/vr32.c
+++ b/soxr/src/vr32.c
@@ -1,16 +1,10 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
/* Variable-rate resampling. */
 #include <assert.h>
-#include <math.h>
-#if !defined M_PI
-#define M_PI 3.14159265358979323846
-#endif
-#if !defined M_LN2
-#define M_LN2 0.69314718055994530942
-#endif
+#include "math-wrap.h"
 #include <stdlib.h>
 #include <string.h>
#include "internal.h"
@@ -197,7 +191,7 @@ static float poly_fir1_u(float const * input, uint32_t frac)
typedef struct {
union {
int64_t all;
-#if WORDS_BIGENDIAN
+#if HAVE_BIGENDIAN
struct {int32_t integer; uint32_t frac;} part;
#else
struct {uint32_t frac; int32_t integer;} part;
@@ -316,7 +310,7 @@ static void vr_init(rate_t * p, double default_io_ratio, int num_stages, double
}
fifo_create(&p->output_fifo, sizeof(float));
p->default_io_ratio = default_io_ratio;
- if (!fade_coefs[0]) {
+ if (fade_coefs[0]==0) {
for (i = 0; i < iAL(fade_coefs); ++i)
fade_coefs[i] = (float)(.5 * (1 + cos(M_PI * i / (AL(fade_coefs) - 1))));
prepare_coefs(poly_fir_coefs_u, POLY_FIR_LEN_U, PHASES0_U, PHASES_U, coefs0_u, mult);
@@ -354,8 +348,9 @@ static bool set_step_step(stream_t * p, double io_ratio, int slew_len)
return p->step_step.all != 0;
}
-static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len)
+static void vr_set_io_ratio(void * P, double io_ratio, size_t slew_len)
{
+ rate_t *p = P;
assert(io_ratio > 0);
if (slew_len) {
if (!set_step_step(&p->current, io_ratio, p->slew_len = (int)slew_len))
@@ -367,7 +362,7 @@ static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len)
}
}
else {
- if (p->default_io_ratio) { /* Then this is the first call to this fn. */
+ if (p->default_io_ratio!=0) { /* Then this is the first call to this fn. */
int octave = (int)floor(log(io_ratio) / M_LN2);
p->current.stage_num = octave < 0? -1 : min(octave, p->num_stages0-1);
enter_new_stage(p, 0);
@@ -375,7 +370,7 @@ static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len)
else if (p->fade_len)
set_step(&p->fadeout, io_ratio);
set_step(&p->current, io_ratio);
- if (p->default_io_ratio) FRAC(p->current.at) = FRAC(p->current.step) >> 1;
+ if (p->default_io_ratio!=0) FRAC(p->current.at) = FRAC(p->current.step) >> 1;
p->default_io_ratio = 0;
}
}
@@ -427,10 +422,11 @@ static bool do_input_stage(rate_t * p, int stage_num, int sign, int min_stage_nu
return true;
}
-static int vr_process(rate_t * p, int olen0)
+static void vr_process(void * P, size_t olen0)
{
+ rate_t *p = P;
assert(p->num_stages > 0);
- if (p->default_io_ratio)
+ if (p->default_io_ratio!=0)
vr_set_io_ratio(p, p->default_io_ratio, 0);
{
float * output = fifo_reserve(&p->output_fifo, olen0);
@@ -462,7 +458,7 @@ static int vr_process(rate_t * p, int olen0)
olen = min(olen, (int)(AL(buf) >> 1));
if (p->slew_len)
olen = min(olen, p->slew_len);
- else if (p->new_io_ratio) {
+ else if (p->new_io_ratio!=0) {
set_step(&p->current, p->new_io_ratio);
set_step(&p->fadeout, p->new_io_ratio);
p->fadeout.step_step.all = p->current.step_step.all = 0;
@@ -568,17 +564,18 @@ static int vr_process(rate_t * p, int olen0)
fifo_read(&p->stages[i].fifo, idone, NULL);
}
fifo_trim_by(&p->output_fifo, olen0 - odone0);
- return odone0;
}
}
-static float * vr_input(rate_t * p, float const * input, size_t n)
+static void * vr_input(void * p, void * input, size_t n)
{
- return fifo_write(&p->stages[0].fifo, (int)n, input);
+ return fifo_write(&((rate_t *)p)->stages[0].fifo, (int)n, input);
}
-static float const * vr_output(rate_t * p, float * output, size_t * n)
+static void const * vr_output(void * P, void * O, size_t * n)
{
+ rate_t *p = P;
+ float *output = O;
fifo_t * fifo = &p->output_fifo;
if (1 || !p->num_stages0)
return fifo_read(fifo, (int)(*n = min(*n, (size_t)fifo_occupancy(fifo))), output);
@@ -594,17 +591,19 @@ static float const * vr_output(rate_t * p, float * output, size_t * n)
}
}
-static void vr_flush(rate_t * p)
+static void vr_flush(void * P)
{
+ rate_t *p = P;
if (!p->flushing) {
stage_preload(&p->stages[0]);
++p->flushing;
}
}
-static void vr_close(rate_t * p)
+static void vr_close(void * P)
{
int i;
+ rate_t *p = P;
fifo_delete(&p->output_fifo);
for (i = -1; i < p->num_stages; ++i) {
@@ -614,7 +613,7 @@ static void vr_close(rate_t * p)
free(p->stages - 1);
}
-static double vr_delay(rate_t * p)
+static double vr_delay(void * p)
{
return 100; /* TODO */
(void)p;
@@ -639,19 +638,20 @@ static char const * vr_create(void * channel, void * shared,double max_io_ratio,
static char const * vr_id(void)
{
- return "single-precision variable-rate";
+ return "vr32";
}
-typedef void (* fn_t)(void);
-fn_t _soxr_vr32_cb[] = {
- (fn_t)vr_input,
- (fn_t)vr_process,
- (fn_t)vr_output,
- (fn_t)vr_flush,
- (fn_t)vr_close,
- (fn_t)vr_delay,
- (fn_t)vr_sizes,
- (fn_t)vr_create,
- (fn_t)vr_set_io_ratio,
- (fn_t)vr_id,
+#include "cb_t.h"
+
+control_block_t _soxr_vr32_cb = {
+ vr_input,
+ vr_process,
+ vr_output,
+ vr_flush,
+ vr_close,
+ vr_delay,
+ vr_sizes,
+ vr_create,
+ vr_set_io_ratio,
+ vr_id,
};
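
Note what this last hunk buys: the old table cast every callback through a generic void (*)(void) and back at the call site, which is undefined behaviour in C; the new control_block_t keeps each pointer at its real type, which is also why the vr_* functions above gained void * first parameters. cb_t.h itself is not part of this hunk; inferred from the initializer and the new signatures, it plausibly looks like the following sketch (hypothetical reconstruction; the real header may differ):

    /* Sketch of cb_t.h; field types inferred from the vr_* definitions. */
    #include <stddef.h>

    typedef struct {
      void *       (* input)(void * p, void * samples, size_t n);
      void         (* process)(void * p, size_t olen);
      void const * (* output)(void * p, void * samples, size_t * n);
      void         (* flush)(void * p);
      void         (* close)(void * p);
      double       (* delay)(void * p);
      void         (* sizes)(size_t * shared, size_t * channel);
      char const * (* create)(void * channel, void * shared, double max_io_ratio,
                              void * q_spec, void * r_spec, double scale);
      void         (* set_io_ratio)(void * p, double io_ratio, size_t slew_len);
      char const * (* id)(void);
    } control_block_t;
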
diff --git a/soxr/src/vr32s.c b/soxr/src/vr32s.c
deleted file mode 100644
index cf0fdaa..0000000
--- a/soxr/src/vr32s.c
+++ /dev/null
@@ -1,665 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-/* Variable-rate resampling. */
-
-#include <assert.h>
-#include <math.h>
-#if !defined M_PI
-#define M_PI 3.14159265358979323846
-#endif
-#if !defined M_LN2
-#define M_LN2 0.69314718055994530942
-#endif
-#include <stdlib.h>
-#include <string.h>
-#if defined(__x86_64__) || defined(_M_X64)
-#include <emmintrin.h>
-#elif defined(__ARM_NEON)
-#include "sse2neon.h"
-#endif
-#include "internal.h"
-#define FIFO_SIZE_T int
-#define FIFO_MIN 0x8000
-#include "fifo.h"
-#include "vr-coefs.h"
-
-#define FADE_LEN_BITS 9
-#define PHASE_BITS_D 10
-#define PHASE_BITS_U 9
-
-#define PHASES0_D 12
-#define POLY_FIR_LEN_D 20
-#define POLY_FIR_LEN_D_VEC (POLY_FIR_LEN_D / 4)
-#define PHASES0_U 6
-#define POLY_FIR_LEN_U 12
-#define POLY_FIR_LEN_U_VEC (POLY_FIR_LEN_U / 4)
-
-#define MULT32 (65536. * 65536.)
-#define PHASES_D (1 << PHASE_BITS_D)
-#define PHASES_U (1 << PHASE_BITS_U)
-
-#define CONVOLVE \
- _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \
- _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \
- _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
-
-#define HALF_FIR_LEN_2 (iAL(half_fir_coefs) - 1)
-#define HALF_FIR_LEN_4 (HALF_FIR_LEN_2 / 2)
-
-#define _ sum += (input[-i] + input[i]) * half_fir_coefs[i], ++i;
-static float half_fir(float const * input)
-{
- long i = 1;
- float sum = input[0] * half_fir_coefs[0];
- CONVOLVE CONVOLVE
- assert(i == HALF_FIR_LEN_2 + 1);
- return (float)sum;
-}
-#undef _
-
-#define _ sum += (input[-i] + input[i]) * half_fir_coefs[2*i], ++i;
-static float double_fir0(float const * input)
-{
- int i = 1;
- float sum = input[0] * half_fir_coefs[0];
- CONVOLVE
- assert(i == HALF_FIR_LEN_4 + 1);
- return (float)(sum * 2);
-}
-#undef _
-
-#define _ sum += (input[-i] + input[1+i]) * half_fir_coefs[2*i+1], ++i;
-static float double_fir1(float const * input)
-{
- int i = 0;
- float sum = 0;
- CONVOLVE
- assert(i == HALF_FIR_LEN_4 + 0);
- return (float)(sum * 2);
-}
-#undef _
-
-static float fast_half_fir(float const * input)
-{
- int i = 0;
- float sum = input[0] * .5f;
-#define _ sum += (input[-(2*i+1)] + input[2*i+1]) * fast_half_fir_coefs[i], ++i;
- _ _ _ _ _ _
-#undef _
- return (float)sum;
-}
-
-#define IIR_FILTER _ _ _ _ _ _ _
-#define _ in1=(in1-p->y[i])*iir_coefs[i]+tmp1;tmp1=p->y[i],p->y[i]=in1;++i;\
- in0=(in0-p->y[i])*iir_coefs[i]+tmp0;tmp0=p->y[i],p->y[i]=in0;++i;
-
-typedef struct {float x[2], y[AL(iir_coefs)];} half_iir_t;
-
-static float half_iir1(half_iir_t * p, float in0, float in1)
-{
- int i = 0;
- float tmp0, tmp1;
- tmp0 = p->x[0], p->x[0] = in0;
- tmp1 = p->x[1], p->x[1] = in1;
- IIR_FILTER
- p->y[i] = in1 = (in1 - p->y[i]) * iir_coefs[i] + tmp1;
- return in1 + in0;
-}
-#undef _
-
-static void half_iir(half_iir_t * p, float * obuf, float const * ibuf, int olen)
-{
- int i;
- for (i=0; i < olen; obuf[i] = (float)half_iir1(p, ibuf[i*2], ibuf[i*2+1]),++i);
-}
-
-static void half_phase(half_iir_t * p, float * buf, int len)
-{
- float const small_normal = 1/MULT32/MULT32; /* To quash denormals on path 0.*/
- int i;
- for (i = 0; i < len; buf[i] = (float)half_iir1(p, buf[i], 0), ++i);
-#define _ p->y[i] += small_normal, i += 2;
- i = 0, _ IIR_FILTER
-#undef _
-#define _ p->y[i] -= small_normal, i += 2;
- i = 0, _ IIR_FILTER
-#undef _
-}
-
-#define coefs(coef_p, fir_len, phase_num, coef_vec_num) \
- coef_p[(fir_len) * (phase_num) + (coef_vec_num)]
-
-#define COEF(h,l,i) ((i)<0||(i)>=(l)?0:(h)[(i)>(l)/2?(l)-(i):(i)])
-static void prepare_coefs(__m128 * coefs_a, __m128 * coefs_b,
- int n, int phases0, int phases, float const * coefs0, double multiplier)
-{
- double k[6];
- int length0 = n * phases0, length = n * phases, K0 = iAL(k)/2 - 1, i, j, pos;
- float * coefs1 = malloc(((size_t)length / 2 + 1) * sizeof(*coefs1));
- float * p = coefs1, f0, f1 = 0;
-
- for (j = 0; j < iAL(k); k[j] = COEF(coefs0, length0, j - K0), ++j);
- for (pos = i = 0; i < length0 / 2; ++i) {
- double b=(1/24.)*(k[0]+k[4]+6*k[2]-4*(k[1]+k[3])),d=.5*(k[1]+k[3])-k[2]-b;
- double a=(1/120.)*(k[5]-k[2]-9*(9*b+d)+2.5*(k[3]-k[1])-2*(k[4]-k[0]));
- double c=(1/12.)*(k[4]-k[0]-2*(k[3]-k[1])-60*a),e=.5*(k[3]-k[1])-a-c;
- for (; pos / phases == i; pos += phases0) {
- double x = (double)(pos % phases) / phases;
- *p++ = (float)(k[K0] + ((((a*x + b)*x + c)*x + d)*x + e)*x);
- }
- for (j = 0; j < iAL(k) - 1; k[j] = k[j + 1], ++j);
- k[j] = COEF(coefs0, length0, i + iAL(k) / 2 + 1);
- }
- if (!(length & 1))
- *p++ = (float)k[K0];
- assert(p - coefs1 == length / 2 + 1);
-
- for (i = 0; i < n; ++i) for (j = phases - 1; j >= 0; --j, f1 = f0) {
- pos = (n - 1 - i) * phases + j;
- f0 = COEF(coefs1, length, pos) * (float)multiplier;
- ((float*)&coefs(coefs_a, n / 4, j, i / 4))[i % 4] = (float)f0;
- ((float*)&coefs(coefs_b, n / 4, j, i / 4))[i % 4] = (float)(f1 - f0);
- }
- free(coefs1);
-}
-
-#define _ sum = _mm_add_ps(sum, _mm_mul_ps(_mm_add_ps(_mm_mul_ps(b, x), a), _mm_loadu_ps(&input[i*4]))), ++i;
-#define a (coefs(poly_fir_coefs_d_a, POLY_FIR_LEN_D_VEC, phase, i))
-#define b (coefs(poly_fir_coefs_d_b, POLY_FIR_LEN_D_VEC, phase, i))
-static __m128 poly_fir_coefs_d_a[POLY_FIR_LEN_D_VEC * PHASES_D];
-static __m128 poly_fir_coefs_d_b[POLY_FIR_LEN_D_VEC * PHASES_D];
-
-static float poly_fir1_d(float const * input, uint32_t frac)
-{
- int i = 0, phase = (int)(frac >> (32 - PHASE_BITS_D));
- __m128 sum = _mm_set1_ps(0.f), x = _mm_set1_ps((float)(frac << PHASE_BITS_D) * (float)(1 / MULT32));
- _ _ _ _ _
- assert(i == POLY_FIR_LEN_D_VEC);
- return ((float*)&sum)[0] + ((float*)&sum)[1] + ((float*)&sum)[2] + ((float*)&sum)[3];
-}
-#undef a
-#undef b
-#define a (coefs(poly_fir_coefs_u_a, POLY_FIR_LEN_U_VEC, phase, i))
-#define b (coefs(poly_fir_coefs_u_b, POLY_FIR_LEN_U_VEC, phase, i))
-static __m128 poly_fir_coefs_u_a[POLY_FIR_LEN_U_VEC * PHASES_U];
-static __m128 poly_fir_coefs_u_b[POLY_FIR_LEN_U_VEC * PHASES_U];
-
-static float poly_fir1_u(float const * input, uint32_t frac)
-{
- int i = 0, phase = (int)(frac >> (32 - PHASE_BITS_U));
- __m128 sum = _mm_set1_ps(0.f), x = _mm_set1_ps((float)(frac << PHASE_BITS_U) * (float)(1 / MULT32));
- _ _ _
- assert(i == POLY_FIR_LEN_U_VEC);
- return ((float*)&sum)[0] + ((float*)&sum)[1] + ((float*)&sum)[2] + ((float*)&sum)[3];
-}
-#undef a
-#undef b
-#undef _
-
-#define ADD_TO(x,y) x.all += y.all
-#define SUBTRACT_FROM(x,y) x.all -= y.all
-#define FRAC(x) x.part.frac
-#define INT(x) x.part.integer
-
-typedef struct {
- union {
- int64_t all;
-#if WORDS_BIGENDIAN
- struct {int32_t integer; uint32_t frac;} part;
-#else
- struct {uint32_t frac; int32_t integer;} part;
-#endif
- } at, step, step_step;
- float const * input;
- int len, stage_num;
- bool is_d; /* true: downsampling at x2 rate; false: upsampling at 1x rate. */
- double step_mult;
-} stream_t;
-
-static int poly_fir_d(stream_t * s, float * output, int olen)
-{
- int i;
- float const * input = s->input - POLY_FIR_LEN_D / 2 + 1;
- for (i = 0; i < olen && INT(s->at) < s->len; ++i) {
- output[i] = poly_fir1_d(input + INT(s->at), FRAC(s->at));
- ADD_TO(s->at, s->step);
- if (!(INT(s->at) < s->len)) {
- SUBTRACT_FROM(s->at, s->step);
- break;
- }
- output[++i] = poly_fir1_d(input + INT(s->at), FRAC(s->at));
- ADD_TO(s->at, s->step);
- ADD_TO(s->step, s->step_step);
- }
- return i;
-}
-
-static int poly_fir_fade_d(
- stream_t * s, float const * vol, int step, float * output, int olen)
-{
- int i;
- float const * input = s->input - POLY_FIR_LEN_D / 2 + 1;
- for (i = 0; i < olen && INT(s->at) < s->len; ++i, vol += step) {
- output[i] += *vol * poly_fir1_d(input + INT(s->at), FRAC(s->at));
- ADD_TO(s->at, s->step);
- if (!(INT(s->at) < s->len)) {
- SUBTRACT_FROM(s->at, s->step);
- break;
- }
- output[++i] += *(vol += step) * poly_fir1_d(input + INT(s->at),FRAC(s->at));
- ADD_TO(s->at, s->step);
- ADD_TO(s->step, s->step_step);
- }
- return i;
-}
-
-static int poly_fir_u(stream_t * s, float * output, int olen)
-{
- int i;
- float const * input = s->input - POLY_FIR_LEN_U / 2 + 1;
- for (i = 0; i < olen && INT(s->at) < s->len; ++i) {
- output[i] = poly_fir1_u(input + INT(s->at), FRAC(s->at));
- ADD_TO(s->at, s->step);
- ADD_TO(s->step, s->step_step);
- }
- return i;
-}
-
-static int poly_fir_fade_u(
- stream_t * s, float const * vol, int step, float * output, int olen)
-{
- int i;
- float const * input = s->input - POLY_FIR_LEN_U / 2 + 1;
- for (i = 0; i < olen && INT(s->at) < s->len; i += 2, vol += step) {
- output[i] += *vol * poly_fir1_u(input + INT(s->at), FRAC(s->at));
- ADD_TO(s->at, s->step);
- ADD_TO(s->step, s->step_step);
- }
- return i;
-}
-
-#define shiftr(x,by) ((by) < 0? (x) << (-(by)) : (x) >> (by))
-#define shiftl(x,by) shiftr(x,-(by))
-#define stage_occupancy(s) (fifo_occupancy(&(s)->fifo) - 4*HALF_FIR_LEN_2)
-#define stage_read_p(s) ((float *)fifo_read_ptr(&(s)->fifo) + 2*HALF_FIR_LEN_2)
-#define stage_preload(s) memset(fifo_reserve(&(s)->fifo, (s)->preload), \
- 0, sizeof(float) * (size_t)(s)->preload);
-
-typedef struct {
- fifo_t fifo;
- double step_mult;
- int is_fast, x_fade_len, preload;
-} stage_t;
-
-typedef struct {
- int num_stages0, num_stages, flushing;
- int fade_len, slew_len, xfade, stage_inc, switch_stage_num;
- double new_io_ratio, default_io_ratio;
- stage_t * stages;
- fifo_t output_fifo;
- half_iir_t halfer;
- stream_t current, fadeout; /* Current/fade-in, fadeout streams. */
-} rate_t;
-
-static float fade_coefs[(2 << FADE_LEN_BITS) + 1];
-
-static void vr_init(rate_t * p, double default_io_ratio, int num_stages, double mult)
-{
- int i;
- assert(num_stages >= 0);
- memset(p, 0, sizeof(*p));
-
- p->num_stages0 = num_stages;
- p->num_stages = num_stages = max(num_stages, 1);
- p->stages = (stage_t *)calloc((unsigned)num_stages + 1, sizeof(*p->stages)) + 1;
- for (i = -1; i < p->num_stages; ++i) {
- stage_t * s = &p->stages[i];
- fifo_create(&s->fifo, sizeof(float));
- s->step_mult = 2 * MULT32 / shiftl(2, i);
- s->preload = i < 0? 0 : i == 0? 2 * HALF_FIR_LEN_2 : 3 * HALF_FIR_LEN_2 / 2;
- stage_preload(s);
- s->is_fast = true;
- lsx_debug("%-3i preload=%i", i, s->preload);
- }
- fifo_create(&p->output_fifo, sizeof(float));
- p->default_io_ratio = default_io_ratio;
- if (!fade_coefs[0]) {
- for (i = 0; i < iAL(fade_coefs); ++i)
- fade_coefs[i] = (float)(.5 * (1 + cos(M_PI * i / (AL(fade_coefs) - 1))));
- prepare_coefs(poly_fir_coefs_u_a, poly_fir_coefs_u_b, POLY_FIR_LEN_U, PHASES0_U, PHASES_U, coefs0_u, mult);
- prepare_coefs(poly_fir_coefs_d_a, poly_fir_coefs_d_b, POLY_FIR_LEN_D, PHASES0_D, PHASES_D, coefs0_d, mult *.5);
- }
- assert(fade_coefs[0]);
-}
-
-static void enter_new_stage(rate_t * p, int occupancy0)
-{
- p->current.len = shiftr(occupancy0, p->current.stage_num);
- p->current.input = stage_read_p(&p->stages[p->current.stage_num]);
-
- p->current.step_mult = p->stages[p->current.stage_num].step_mult;
- p->current.is_d = p->current.stage_num >= 0;
- if (p->current.is_d)
- p->current.step_mult *= .5;
-}
-
-static void set_step(stream_t * p, double io_ratio)
-{
- p->step.all = (int64_t)(io_ratio * p->step_mult + .5);
-}
-
-static bool set_step_step(stream_t * p, double io_ratio, int slew_len)
-{
- int64_t dif;
- int difi;
- stream_t tmp = *p;
- set_step(&tmp, io_ratio);
- dif = tmp.step.all - p->step.all;
- dif = dif < 0? dif - (slew_len >> 1) : dif + (slew_len >> 1);
- difi = (int)dif; /* Try to avoid int64_t div. */
- p->step_step.all = difi == dif? difi / slew_len : dif / slew_len;
- return p->step_step.all != 0;
-}
-
-static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len)
-{
- assert(io_ratio > 0);
- if (slew_len) {
- if (!set_step_step(&p->current, io_ratio, p->slew_len = (int)slew_len))
- p->slew_len = 0, p->new_io_ratio = 0, p->fadeout.step_step.all = 0;
- else {
- p->new_io_ratio = io_ratio;
- if (p->fade_len)
- set_step_step(&p->fadeout, io_ratio, p->slew_len);
- }
- }
- else {
- if (p->default_io_ratio) { /* Then this is the first call to this fn. */
- int octave = (int)floor(log(io_ratio) / M_LN2);
- p->current.stage_num = octave < 0? -1 : min(octave, p->num_stages0-1);
- enter_new_stage(p, 0);
- }
- else if (p->fade_len)
- set_step(&p->fadeout, io_ratio);
- set_step(&p->current, io_ratio);
- if (p->default_io_ratio) FRAC(p->current.at) = FRAC(p->current.step) >> 1;
- p->default_io_ratio = 0;
- }
-}
-
-static bool do_input_stage(rate_t * p, int stage_num, int sign, int min_stage_num)
-{
- int i = 0;
- float * dest;
- stage_t * s = &p->stages[stage_num];
- stage_t * s1 = &p->stages[stage_num - sign];
- float const * src = (float *)fifo_read_ptr(&s1->fifo) + HALF_FIR_LEN_2;
- int len = shiftr(fifo_occupancy(&s1->fifo) - HALF_FIR_LEN_2 * 2, sign);
- int already_done = fifo_occupancy(&s->fifo) - s->preload;
- if ((len -= already_done) <= 0)
- return false;
- src += shiftl(already_done, sign);
-
- dest = fifo_reserve(&s->fifo, len);
- if (stage_num < 0) for (; i < len; ++src)
- dest[i++] = double_fir0(src), dest[i++] = double_fir1(src);
- else {
- bool should_be_fast = p->stage_inc;
- if (!s->x_fade_len && stage_num == p->switch_stage_num) {
- p->switch_stage_num = 0;
- if (s->is_fast != should_be_fast) {
- s->x_fade_len = 1 << FADE_LEN_BITS, s->is_fast = should_be_fast, ++p->xfade;
- lsx_debug("xfade level %i, inc?=%i", stage_num, p->stage_inc);
- }
- }
- if (s->x_fade_len) {
- float const * vol1 = fade_coefs + (s->x_fade_len << 1);
- float const * vol2 = fade_coefs + (((1 << FADE_LEN_BITS) - s->x_fade_len) << 1);
- int n = min(len, s->x_fade_len);
- /*lsx_debug("xfade level %i, inc?=%i len=%i n=%i", stage_num, p->stage_inc, s->x_fade_len, n);*/
- if (should_be_fast)
- for (; i < n; vol2 += 2, vol1 -= 2, src += 2)
- dest[i++] = *vol1 * fast_half_fir(src) + *vol2 * half_fir(src);
- else for (; i < n; vol2 += 2, vol1 -= 2, src += 2)
- dest[i++] = *vol2 * fast_half_fir(src) + *vol1 * half_fir(src);
- s->x_fade_len -= n;
- p->xfade -= !s->x_fade_len;
- }
- if (stage_num < min_stage_num)
- for (; i < len; dest[i++] = fast_half_fir(src), src += 2);
- else for (; i < len; dest[i++] = half_fir(src), src += 2);
- }
- if (p->flushing > 0)
- stage_preload(s);
- return true;
-}
-
-static int vr_process(rate_t * p, int olen0)
-{
- assert(p->num_stages > 0);
- if (p->default_io_ratio)
- vr_set_io_ratio(p, p->default_io_ratio, 0);
- {
- float * output = fifo_reserve(&p->output_fifo, olen0);
- int j, odone0 = 0, min_stage_num = p->current.stage_num;
- int occupancy0, max_stage_num = min_stage_num;
- if (p->fade_len) {
- min_stage_num = min(min_stage_num, p->fadeout.stage_num);
- max_stage_num = max(max_stage_num, p->fadeout.stage_num);
- }
-
- for (j = min(min_stage_num, 0); j <= max_stage_num; ++j)
- if (j && !do_input_stage(p, j, j < 0? -1 : 1, min_stage_num))
- break;
- if (p->flushing > 0)
- p->flushing = -1;
-
- occupancy0 = shiftl(max(0,stage_occupancy(&p->stages[max_stage_num])), max_stage_num);
- p->current.len = shiftr(occupancy0, p->current.stage_num);
- p->current.input = stage_read_p(&p->stages[p->current.stage_num]);
- if (p->fade_len) {
- p->fadeout.len = shiftr(occupancy0, p->fadeout.stage_num);
- p->fadeout.input = stage_read_p(&p->stages[p->fadeout.stage_num]);
- }
-
- while (odone0 < olen0) {
- int odone, odone2, olen = olen0 - odone0, stage_dif = 0, shift;
- float buf[64 << 1];
-
- olen = min(olen, (int)(AL(buf) >> 1));
- if (p->slew_len)
- olen = min(olen, p->slew_len);
- else if (p->new_io_ratio) {
- set_step(&p->current, p->new_io_ratio);
- set_step(&p->fadeout, p->new_io_ratio);
- p->fadeout.step_step.all = p->current.step_step.all = 0;
- p->new_io_ratio = 0;
- }
- if (!p->flushing && !p->fade_len && !p->xfade) {
- if (p->current.is_d) {
- if (INT(p->current.step) && FRAC(p->current.step))
- stage_dif = 1, ++max_stage_num;
- else if (!INT(p->current.step) && FRAC(p->current.step) < (1u << 31))
- stage_dif = -1, --min_stage_num;
- } else if (INT(p->current.step) > 1 && FRAC(p->current.step))
- stage_dif = 1, ++max_stage_num;
- }
- if (stage_dif) {
- int n = p->current.stage_num + stage_dif;
- if (n >= p->num_stages)
- --max_stage_num;
- else {
- p->stage_inc = stage_dif > 0;
- p->fadeout = p->current;
- p->current.stage_num += stage_dif;
- if (!p->stage_inc)
- p->switch_stage_num = p->current.stage_num;
- if ((p->current.stage_num < 0 && stage_dif < 0) ||
- (p->current.stage_num > 0 && stage_dif > 0)) {
- stage_t * s = &p->stages[p->current.stage_num];
- fifo_clear(&s->fifo);
- stage_preload(s);
- s->is_fast = false;
- do_input_stage(p, p->current.stage_num, stage_dif, p->current.stage_num);
- }
- if (p->current.stage_num > 0 && stage_dif < 0) {
- int idone = INT(p->current.at);
- stage_t * s = &p->stages[p->current.stage_num];
- fifo_trim_to(&s->fifo, 2 * HALF_FIR_LEN_2 + idone + (POLY_FIR_LEN_D >> 1));
- do_input_stage(p, p->current.stage_num, 1, p->current.stage_num);
- }
- enter_new_stage(p, occupancy0);
- shift = -stage_dif;
-#define lshift(x,by) (x)=(by)>0?(x)<<(by):(x)>>-(by)
- lshift(p->current.at.all, shift);
- shift += p->fadeout.is_d - p->current.is_d;
- lshift(p->current.step.all, shift);
- lshift(p->current.step_step.all, shift);
- p->fade_len = AL(fade_coefs) - 1;
- lsx_debug("switch from stage %i to %i, x2 from %i to %i", p->fadeout.stage_num, p->current.stage_num, p->fadeout.is_d, p->current.is_d);
- }
- }
-
- if (p->fade_len) {
- float const * vol1 = fade_coefs + p->fade_len;
- float const * vol2 = fade_coefs + (iAL(fade_coefs) - 1 - p->fade_len);
- int olen2 = (olen = min(olen, p->fade_len >> 1)) << 1;
-
- /* x2 is more fine-grained so may fail to produce a pair of samples
- * where x1 would not (the x1 second sample is a zero so is always
- * available). So do x2 first, then feed odone to the second one. */
- memset(buf, 0, sizeof(*buf) * (size_t)olen2);
- if (p->current.is_d && p->fadeout.is_d) {
- odone = poly_fir_fade_d(&p->current, vol1,-1, buf, olen2);
- odone2 = poly_fir_fade_d(&p->fadeout, vol2, 1, buf, odone);
- } else if (p->current.is_d) {
- odone = poly_fir_fade_d(&p->current, vol1,-1, buf, olen2);
- odone2 = poly_fir_fade_u(&p->fadeout, vol2, 2, buf, odone);
- } else {
- assert(p->fadeout.is_d);
- odone = poly_fir_fade_d(&p->fadeout, vol2, 1, buf, olen2);
- odone2 = poly_fir_fade_u(&p->current, vol1,-2, buf, odone);
- }
- assert(odone == odone2);
- (void)odone2;
- p->fade_len -= odone;
- if (!p->fade_len) {
- if (p->stage_inc)
- p->switch_stage_num = min_stage_num++;
- else
- --max_stage_num;
- }
- half_iir(&p->halfer, &output[odone0], buf, odone >>= 1);
- }
- else if (p->current.is_d) {
- odone = poly_fir_d(&p->current, buf, olen << 1) >> 1;
- half_iir(&p->halfer, &output[odone0], buf, odone);
- }
- else {
- odone = poly_fir_u(&p->current, &output[odone0], olen);
- if (p->num_stages0)
- half_phase(&p->halfer, &output[odone0], odone);
- }
- odone0 += odone;
- if (p->slew_len)
- p->slew_len -= odone;
- if (odone != olen)
- break; /* Need more input. */
- } {
- int from = max(0, max_stage_num), to = min(0, min_stage_num);
- int i, idone = shiftr(INT(p->current.at), from - p->current.stage_num);
- INT(p->current.at) -= shiftl(idone, from - p->current.stage_num);
- if (p->fade_len)
- INT(p->fadeout.at) -= shiftl(idone, from - p->fadeout.stage_num);
- for (i = from; i >= to; --i, idone <<= 1)
- fifo_read(&p->stages[i].fifo, idone, NULL);
- }
- fifo_trim_by(&p->output_fifo, olen0 - odone0);
- return odone0;
- }
-}
-
-static float * vr_input(rate_t * p, float const * input, size_t n)
-{
- return fifo_write(&p->stages[0].fifo, (int)n, input);
-}
-
-static float const * vr_output(rate_t * p, float * output, size_t * n)
-{
- fifo_t * fifo = &p->output_fifo;
- if (1 || !p->num_stages0)
- return fifo_read(fifo, (int)(*n = min(*n, (size_t)fifo_occupancy(fifo))), output);
- else { /* Ignore this complication for now. */
- int const IIR_DELAY = 2;
- float * ptr = fifo_read_ptr(fifo);
- int olen = min((int)*n, max(0, fifo_occupancy(fifo) - IIR_DELAY));
- *n = (size_t)olen;
- if (output)
- memcpy(output, ptr + IIR_DELAY, *n * sizeof(*output));
- fifo_read(fifo, olen, NULL);
- return ptr + IIR_DELAY;
- }
-}
-
-static void vr_flush(rate_t * p)
-{
- if (!p->flushing) {
- stage_preload(&p->stages[0]);
- ++p->flushing;
- }
-}
-
-static void vr_close(rate_t * p)
-{
- int i;
-
- fifo_delete(&p->output_fifo);
- for (i = -1; i < p->num_stages; ++i) {
- stage_t * s = &p->stages[i];
- fifo_delete(&s->fifo);
- }
- free(p->stages - 1);
-}
-
-static double vr_delay(rate_t * p)
-{
- return 100; /* TODO */
- (void)p;
-}
-
-static void vr_sizes(size_t * shared, size_t * channel)
-{
- *shared = 0;
- *channel = sizeof(rate_t);
-}
-
-static char const * vr_create(void * channel, void * shared,double max_io_ratio,
- void * q_spec, void * r_spec, double scale)
-{
- double x = max_io_ratio;
- int n;
- for (n = 0; x > 1; x *= .5, ++n);
- vr_init(channel, max_io_ratio, n, scale);
- return 0;
- (void)shared, (void)q_spec, (void)r_spec;
-}
-
-static char const * vr_id(void)
-{
- return "single-precision variable-rate";
-}
-
-typedef void (* fn_t)(void);
-fn_t _soxr_vr32_cb[] = {
- (fn_t)vr_input,
- (fn_t)vr_process,
- (fn_t)vr_output,
- (fn_t)vr_flush,
- (fn_t)vr_close,
- (fn_t)vr_delay,
- (fn_t)vr_sizes,
- (fn_t)vr_create,
- (fn_t)vr_set_io_ratio,
- (fn_t)vr_id,
-};
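
Both the deleted vr32s.c and the surviving vr32.c track each stream's read position as 32.32 fixed point: the top 32 bits index the input FIFO, the low 32 bits are the fractional phase that selects the polyphase-filter coefficients (the INT()/FRAC() macros above). A minimal standalone demo of that accumulator, assuming the little-endian layout from the union's #else branch:

    #include <stdint.h>
    #include <stdio.h>

    /* 32.32 fixed-point position, as in stream_t's at/step/step_step fields. */
    typedef union {
      int64_t all;
      struct { uint32_t frac; int32_t integer; } part; /* little-endian */
    } fixed64;

    int main(void)
    {
      double io_ratio = 0.9375;        /* input samples consumed per output */
      fixed64 at, step;
      int i;
      at.all = 0;
      step.all = (int64_t)(io_ratio * 65536. * 65536. + .5);   /* set_step() */
      for (i = 0; i < 4; ++i) {
        printf("out %d: input index %d, phase %.6f\n",
            i, at.part.integer, at.part.frac / (65536. * 65536.));
        at.all += step.all;            /* ADD_TO(s->at, s->step) */
      }
      return 0;
    }

A single 64-bit add advances both the integer index and the interpolation phase, with the carry out of the fractional half landing in the integer half for free; this is also why slewing the ratio costs just one more add (step_step) per output sample.
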
diff --git a/soxr/tests/CMakeLists.txt b/soxr/tests/CMakeLists.txt
index fc350de..ee8dd0b 100644
--- a/soxr/tests/CMakeLists.txt
+++ b/soxr/tests/CMakeLists.txt
@@ -1,8 +1,8 @@
# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
-add_definitions (${PROJECT_C_FLAGS})
-link_libraries (soxr)
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}")
+link_libraries (${PROJECT_NAME} ${LIBM_LIBRARIES})
file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.c)
foreach (fe ${SOURCES})
@@ -10,7 +10,10 @@ foreach (fe ${SOURCES})
add_executable (${f} ${fe})
endforeach ()
-enable_testing ()
+# Can't use c89 for this file:
+if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
+ set_property (SOURCE throughput APPEND_STRING PROPERTY COMPILE_FLAGS "-std=gnu89")
+endif ()
set (sweep_to_freq 22050)
set (leader 1)
@@ -20,33 +23,40 @@ math (EXPR base_rate "${sweep_to_freq} + ${sweep_to_freq}")
macro (add_vector r)
set (output ${CMAKE_CURRENT_BINARY_DIR}/ref-${r}.s32)
add_custom_command (OUTPUT ${output} DEPENDS vector-gen ${CMAKE_CURRENT_LIST_FILE}
- COMMAND vector-gen ${r} ${leader} ${len} ${sweep_to_freq} 1 ${output})
+ COMMAND vector-gen ${r} ${leader} ${len} 0 ${sweep_to_freq} 1 ${output})
set (vectors ${output} ${vectors})
endmacro ()
-macro (add_cmp_test from to bits)
- set (name ${bits}-bit-perfect-${from}-${to})
- add_test (NAME ${name} COMMAND ${CMAKE_COMMAND} -Dbits=${bits} -DBIN=${BIN} -DEXAMPLES_BIN=${EXAMPLES_BIN} -Dleader=${leader} -Dto=${to}
- -Dfrom=${from} -Dlen=${len} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmp-test.cmake)
- add_vector (${from})
- add_vector (${to})
+macro (add_cmp_test irate orate bits)
+ set (name ${bits}-bit-perfect-${irate}-${orate})
+ add_test (NAME ${name} COMMAND ${CMAKE_COMMAND} -Dbits=${bits} -DBIN=${BIN}
+ -DEXAMPLES_BIN=${EXAMPLES_BIN} -DlenToSkip=${leader} -Dorate=${orate}
+ -Dirate=${irate} -Dlen=${len} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmp-test.cmake)
+ add_vector (${irate})
+ add_vector (${orate})
endmacro ()
unset (test_bits)
-if (WITH_SINGLE_PRECISION)
+if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S)
set (test_bits 20)
endif ()
-if (WITH_DOUBLE_PRECISION)
- set (test_bits ${test_bits} 24)
+if (WITH_CR64 OR WITH_CR64S)
+ set (test_bits ${test_bits} 28)
endif ()
+set (rates 192000)
+if (WITH_HI_PREC_CLOCK)
+ set (rates ${rates} 65537)
+endif ()
foreach (b ${test_bits})
- foreach (r 96000 65537)
+ foreach (r ${rates})
add_cmp_test (${base_rate} ${r} ${b})
add_cmp_test (${r} ${base_rate} ${b})
endforeach ()
endforeach ()
-add_custom_target (test-vectors ALL DEPENDS ${vectors})
+if (NOT CMAKE_CROSSCOMPILING)
+ add_custom_target (test-vectors ALL DEPENDS ${vectors})
+endif ()
add_test (1-delay-clear ${BIN}1-delay-clear)
diff --git a/soxr/tests/bandwidth-test b/soxr/tests/bandwidth-test
index 47c2303..4efdcc9 100755
--- a/soxr/tests/bandwidth-test
+++ b/soxr/tests/bandwidth-test
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
@@ -9,8 +9,9 @@ set -e
tool=./3-options-input-fn
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
-spec="spectrogram -z120 -Z-20 -wd -ho"
+spec="spectrogram -z120 -Z-20 -w$w -ho"
ext=f32; e=0
rate1=48000
rate2=44100
@@ -23,12 +24,12 @@ rate1n=`expr $rate1 / 2`
sox -r $rate1 -n 0.$ext synth 8 sin 0:$rate1n gain -1
for pass in `seq 79 5 99`; do
- f=bw1-$rate2-p`printf %02u $pass`
+ f=bw1-$rate2-p`printf %02u $pass`-$w
$tool $rate1 $rate2 1 $e $e 4 0 $pass < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "bw-test pass:$pass stop:100"
done
for pass in `seq 79 5 99`; do
- f=bw2-$rate2-p`printf %02u $pass`
+ f=bw2-$rate2-p`printf %02u $pass`-$w
stop=`expr 200 - $pass`
$tool $rate1 $rate2 1 $e $e 4 0 $pass $stop < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "bw-test pass:$pass stop:$stop"
done
diff --git a/soxr/tests/cmp-test.cmake b/soxr/tests/cmp-test.cmake
index 8db76c5..a836322 100644
--- a/soxr/tests/cmp-test.cmake
+++ b/soxr/tests/cmp-test.cmake
@@ -1,17 +1,13 @@
# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
-if (${bits} STREQUAL 24)
- set (quality 45)
-else ()
- set (quality 44)
-endif ()
+math (EXPR quality "43 + (${bits} - 13) / 4")
+set (ofile ${irate}-${orate}-${quality}.s32)
+#message (STATUS "Output file = [${ofile}]")
-set (output ${from}-${to}-${quality}.s32)
-
-execute_process(COMMAND ${EXAMPLES_BIN}3-options-input-fn ${from} ${to} 1 2 2 ${quality} a
- INPUT_FILE ref-${from}.s32
- OUTPUT_FILE ${output}
+execute_process(COMMAND ${EXAMPLES_BIN}3-options-input-fn ${irate} ${orate} 1 2 2 ${quality} a
+ INPUT_FILE ref-${irate}.s32
+ OUTPUT_FILE ${ofile}
ERROR_VARIABLE test_error
RESULT_VARIABLE test_result)
@@ -19,7 +15,11 @@ if (test_result)
message (FATAL_ERROR "Resampling failure: ${test_error}")
endif ()
-execute_process(COMMAND ${BIN}vector-cmp ref-${to}.s32 ${output} ${to} ${leader} ${len} ${bits} 98
+set (percentageToCheck 98)
+math (EXPR lenToCheck "${len} * ${percentageToCheck}")
+string (REGEX REPLACE "(..)$" ".\\1" lenToCheck "${lenToCheck}") # Divide by 100
+
+execute_process(COMMAND ${BIN}vector-cmp ref-${orate}.s32 ${ofile} ${orate} ${lenToSkip} ${lenToCheck} ${bits}
OUTPUT_VARIABLE test_output
RESULT_VARIABLE test_result)
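
Two details in the rewritten cmp-test.cmake deserve a note. CMake's math(EXPR) works on integers only, so to take 98% of ${len} the script multiplies first and then re-inserts a decimal point with the REGEX REPLACE (len=8 gives 784, rewritten to "7.84"). And the hard-coded 44/45 quality pair is replaced by a formula that, using integer division, maps the bit-depths the tests now use as

    \mathrm{quality}(b) = 43 + \left\lfloor\frac{b-13}{4}\right\rfloor, \qquad
    \mathrm{quality}(20) = 44, \qquad \mathrm{quality}(28) = 46.
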
diff --git a/soxr/tests/eg-test b/soxr/tests/eg-test
index 58d085c..ccf4ce3 100755
--- a/soxr/tests/eg-test
+++ b/soxr/tests/eg-test
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
@@ -9,6 +9,7 @@ set -e
len=8
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
#vg="valgrind --leak-check=full --show-reachable=yes"
@@ -42,6 +43,6 @@ signals=(sine-wave saw-tooth-wave)
for n in 0 1 2 3; do
signal=${signals[`expr $n % 2 || true`]}
variation=${variations[`expr $n / 2 || true`]}
- $vg ./5-variable-rate $n | sox -tf32 -r44100 -c1 - -n spectrogram -z130 -hwd -o v$n.png -X 50 -c "variation:$variation signal:$signal"
+ $vg ./5-variable-rate $n | sox -tf32 -r44100 -c1 - -n spectrogram -z130 -hw$w -o v$n-$w.png -X 50 -c "variation:$variation signal:$signal"
vg=""
done
diff --git a/soxr/tests/io-test b/soxr/tests/io-test
index a291c78..608bc9a 100755
--- a/soxr/tests/io-test
+++ b/soxr/tests/io-test
@@ -1,7 +1,7 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
-# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
# Tests IO
@@ -14,22 +14,28 @@ len=16
f=1/32768
g=32768:0
tool=./3-options-input-fn
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
types=(f32 f64 s32 s16)
zs=(180 180 180 180 180 120 120 120 120)
do_one() {
- $tool $ir $or $c $1 $2 $3 < $c.${types[$1]} |
- sox -t ${types[`expr $2 % 4`]} -r $or -c $c - -n spectrogram -X50 -hwk -z${zs[$n]} -o io$c$n.png -c "io-test i:${types[$1]} o:${types[`expr $2 % 4`]} ($2) q:$3"
+ it=${types[$1]}; ot=${types[`expr $2 % 4 || true`]}
+ $tool $ir $or $c $1 $2 $3 < $c.$it > a.$ot
+ sox -r $or -c $c a.$ot -n spectrogram -X50 -hw$w -z${zs[$n]} -o io$c$n-$w.png -c "io-test i:$it o:$ot ($2) q:$3"
+ ./4-split-channels $ir $or $c $1 $2 $3 < $c.$it > b.$ot
+ [ $2 != 3 ] && cmp a.$ot b.$ot ||
+ test $(sox -mv-1 -r$or -c$c a.$ot -r$or -c$c b.$ot -n stats 2>&1 |grep Pk\ l|tr ' ' '\n'|grep '[0-9]'|uniq) = -84.29
+ rm [ab].$ot
n=`expr $n + 1`
}
-j=3; test z$1 != z && j=$1
+test z$1 != z && j=$1 || j=1
for c in `seq 1 $j`; do
for n in `seq 0 3`; do
- sox -r $ir -n $c.${types[$n]} synth $len sin $f gain -.1
+ sox -R -r $ir -n $c.${types[$n]} synth $len sin $f gain -.1
done
n=0
diff --git a/soxr/tests/large-ratio-test b/soxr/tests/large-ratio-test
index 64f1789..540c5df 100755
--- a/soxr/tests/large-ratio-test
+++ b/soxr/tests/large-ratio-test
@@ -1,23 +1,22 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
-# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
-# Tests interpolating then decimating be the same, large ratio.
+# Tests interpolating then decimating by the same, large ratio.
tool=../examples/3-options-input-fn
-q=6
-ratio=2e4
-srate=8000
-nrate=`expr $srate / 2`
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
+q=4
+test x$1 = x && ratio=1e5 || ratio=$1
+test x$2 = x && rate=8000 || rate=$2
-rm -f lr.png
+sox -r$rate -n 1.s32 synth 10 sin 0:`expr $rate / 2` vol .9375
+sync
-../tests/vector-gen $srate 0 8 $nrate .9375 1.s32
+time { $tool 1 $ratio 1 2 1 $q a < 1.s32 | $tool $ratio 1 1 1 2 $q a > 2.s32;}
-$tool 1 $ratio 1 2 1 $q < 1.s32 | $tool $ratio 1 1 1 2 $q > 2.s32
-
-sox -M -r $srate -c1 1.s32 -r $srate -c1 2.s32 -n spectrogram -hwd -Z-10 -z180 -o lr.png -c "large-ratio-test q:$q ratio:$ratio"
+sox -mv-1 -r$rate -c1 1.s32 -r$rate -c1 2.s32 -n spectrogram -hw$w -z150 -o lr-$w.png -c "large-ratio-test q:$q ratio:$ratio"
rm [12].s32
diff --git a/soxr/tests/phase-test b/soxr/tests/phase-test
index 4c491d8..3c34268 100755
--- a/soxr/tests/phase-test
+++ b/soxr/tests/phase-test
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
@@ -7,7 +7,8 @@ set -e
# Tests varying phase-response.
tool=./3-options-input-fn
-spec="spectrogram -z160 -Z-20 -X 2000 -wd -ho"
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
+spec="spectrogram -z160 -Z-20 -X 2000 -w$w -ho"
ext=f32; e=0
rate1=48000
rate2=44100
@@ -20,7 +21,7 @@ for n in 1 2; do
filters=(standard-filter steep-filter)
for q in `seq 0 7`; do
- f=ph-$rate2-q$q
+ f=ph-$rate2-q$q-$w
name=${names[`expr $q % 4 || true`]}
filter=${filters[`expr $q / 4 || true`]}
$tool $rate1 $rate2 1 $e $e $q'6' < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "ph-test $filter $name"
@@ -28,7 +29,7 @@ for n in 1 2; do
# Test specific phase-response percentages:
for q in `seq 0 20 100`; do
- f=ph-$rate2-p`printf %03u $q`
+ f=ph-$rate2-p`printf %03u $q`-$w
$tool $rate1 $rate2 1 $e $e 46 0 0 0 $q < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "ph-test phase:${q}%"
done
diff --git a/soxr/tests/q-test b/soxr/tests/q-test
index 7a0f0a2..f274cb5 100755
--- a/soxr/tests/q-test
+++ b/soxr/tests/q-test
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
@@ -9,6 +9,7 @@ set -e
tool=./3-options-input-fn
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
ext=f64; e=1
c=1
q1=0; q2=7
@@ -16,7 +17,7 @@ rates=48000
zs=(50 87 87 87 111 135 159 180 95)
zz() {
- echo "spectrogram -z${zs[$1]} -Z-30 -wd -ho"
+ echo "spectrogram -z${zs[$1]} -Z-30 -w$w -ho"
}
for rate0 in $rates; do
@@ -36,11 +37,11 @@ sox -r $rate1 -n -c $c 0.$ext synth 8 sin 0:$rate1n gain -1
for q in `seq $q1 $q2`; do
f=qa-$rate1-$rate2-$q
- $tool $rate1 $rate2 $c $e $e $q 0 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f.png -c $f
+ $tool $rate1 $rate2 $c $e $e $q 0 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f-$w.png -c $f
done
q=8
f=qa-$rate1-$rate2-v
-$tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f.png -c $f
+$tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f-$w.png -c $f
diff --git a/soxr/tests/scripts b/soxr/tests/scripts
index f245919..8b6023f 100755
--- a/soxr/tests/scripts
+++ b/soxr/tests/scripts
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
@@ -6,8 +6,9 @@ set -e
../../tests/bandwidth-test
../../tests/eg-test
-../../tests/io-test
+../../tests/io-test 3
../../tests/large-ratio-test
../../tests/phase-test
../../tests/q-test
-../../tests/time-test
+../../tests/time-test 1
+../../tests/time-test 2
diff --git a/soxr/tests/throughput-test b/soxr/tests/throughput-test
new file mode 100644
index 0000000..aef36f6
--- /dev/null
+++ b/soxr/tests/throughput-test
@@ -0,0 +1,11 @@
+#!/bin/sh
+set -e
+
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+test -r throughput.exe && wine=wine
+
+test /$1 = / && list="`seq 0 3`" || list="$*"
+
+for n in $list; do $wine ./throughput 44.1 48 1 0 $n 4; done
diff --git a/soxr/tests/throughput-test.bat b/soxr/tests/throughput-test.bat
new file mode 100644
index 0000000..6644d8d
--- /dev/null
+++ b/soxr/tests/throughput-test.bat
@@ -0,0 +1,5 @@
+@echo off
+rem SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+rem Licence for this file: LGPL v2.1 See LICENCE for details.
+
+for /L %%i in (0,1,3) DO throughput 44.1 48 1 0 %%i
diff --git a/soxr/tests/throughput.c b/soxr/tests/throughput.c
new file mode 100644
index 0000000..c52b885
--- /dev/null
+++ b/soxr/tests/throughput.c
@@ -0,0 +1,141 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#include <math.h>
+#include "rint.h"
+#include "../examples/examples-common.h"
+
+#define k 1000
+
+#if defined _WIN32
+ #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+ #define timerStart(msecs) LARGE_INTEGER start, stop, tmp; \
+ QueryPerformanceCounter(&start), QueryPerformanceFrequency(&tmp), \
+ stop.QuadPart = (msecs * tmp.QuadPart + k/2) / k
+ #define timerRunning() (QueryPerformanceCounter(&tmp), \
+ (tmp.QuadPart-start.QuadPart < stop.QuadPart))
+#else
+  #include <sys/time.h>
+ #if defined timeradd
+ #define K k
+ #define tv_frac tv_usec
+ #define timespec timeval
+ #define get_time(x) gettimeofday(x, NULL)
+ #else
+    #include <unistd.h>
+    #include <time.h>
+ #if defined _POSIX_TIMERS && _POSIX_TIMERS > 0
+ #define K (k*k)
+ #define tv_frac tv_nsec
+ #if defined _POSIX_MONOTONIC_CLOCK
+ #define get_time(x) clock_gettime(CLOCK_MONOTONIC, x)
+ #else
+ #define get_time(x) clock_gettime(CLOCK_REALTIME, x)
+ #endif
+ #else
+      #include <sys/timeb.h>
+ #define K 1
+ #define tv_frac millitm
+ #define tv_sec time
+ #define timespec timeb
+ #define get_time(x) ftime(x)
+ #endif
+ #endif
+
+ #define timerStart(msecs) struct timespec stop, tmp; get_time(&stop), \
+ stop.tv_frac += (msecs%k)*K, \
+ stop.tv_sec += msecs/k + stop.tv_frac/(K*k), \
+ stop.tv_frac %= K*k
+ #define timerRunning() (get_time(&tmp), \
+ (tmp.tv_sec < stop.tv_sec || tmp.tv_frac < stop.tv_frac))
+#endif
+
+int main(int n, char const * arg[])
+{
+ char const * const arg0 = n? --n, *arg++ : "", * engine = "";
+ double const irate = n? --n, atof(*arg++) : 96000.;
+ double const orate = n? --n, atof(*arg++) : 44100.;
+ unsigned const chans = n? --n, (unsigned)atoi(*arg++) : 1;
+ soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+ unsigned const ospec = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+ unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ;
+ unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0;
+ double const passband_end = n? --n, atof(*arg++) : 0;
+ double const stopband_begin = n? --n, atof(*arg++) : 0;
+ double const phase_response = n? --n, atof(*arg++) : -1;
+ int const use_threads = n? --n, atoi(*arg++) : 1;
+ soxr_datatype_t const otype = ospec & 3;
+
+ soxr_quality_spec_t q_spec = soxr_quality_spec(q_recipe, q_flags);
+ soxr_io_spec_t io_spec = soxr_io_spec(itype, otype);
+ soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads);
+
+ /* Allocate resampling input and output buffers in proportion to the input
+ * and output rates: */
+ #define buf_total_len 15000 /* In samples per channel. */
+ size_t const osize = soxr_datatype_size(otype) * chans;
+ size_t const isize = soxr_datatype_size(itype) * chans;
+ size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5);
+ size_t const olen = min(max(olen0, 1), buf_total_len - 1);
+ size_t const ilen = buf_total_len - olen;
+ void * const obuf = malloc(osize * olen);
+ void * const ibuf = malloc(isize * ilen);
+
+ size_t odone = 0, clips = 0, omax = 0, i;
+ soxr_error_t error;
+ soxr_t soxr;
+ int32_t seed = 0;
+ char const * e = getenv("SOXR_THROUGHPUT_GAIN");
+ double gain = e? atof(e) : .5;
+
+ /* Overrides (if given): */
+ if (passband_end > 0) q_spec.passband_end = passband_end / 100;
+ if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100;
+ if (phase_response >=0) q_spec.phase_response = phase_response;
+ io_spec.flags = ospec & ~7u;
+
+ /* Create a stream resampler: */
+ soxr = soxr_create(
+ irate, orate, chans, /* Input rate, output rate, # of channels. */
+ &error, /* To report any error during creation. */
+ &io_spec, &q_spec, &runtime_spec);
+
+#define ranqd1(x) ((x) = 1664525 * (x) + 1013904223) /* int32_t x */
+#define dranqd1(x) (ranqd1(x) * (1. / (65536. * 32768.))) /* [-1,1) */
+#define RAND (dranqd1(seed) * gain)
+#define DURATION_MSECS 125
+#define NUM_ATTEMPTS 8
+
+ if (!error) { /* If all is well, run the resampler: */
+ engine = soxr_engine(soxr);
+ switch (itype & 3) {
+      case 0: for (i=0;i<ilen*chans; ((float   *)ibuf)[i]=(float)RAND, ++i); break;
diff --git a/soxr/tests/time-test b/soxr/tests/time-test
--- a/soxr/tests/time-test
+++ b/soxr/tests/time-test
-  echo $rate1 '-->' $rate2 c=$c q=$q
- time $tool $rate1 $rate2 $c $e $e $q < 0.$ext > /dev/null;
+ sox -R -r $rate1 -n -c $c 0.$ext synth $len noise; sync
+ for q in $qs; do
+ test $q = v && Q="4 20" || Q=$q
+ $time -f %e -o $TIME $tool $rate1 $rate2 $c $e $e $Q < 0.$ext > /dev/null 2> $ERR
+ echo $rate1 '-->' $rate2 c=$c q=$q t=`cat $TIME` `cat $ERR | sed 's/.*(/(/'`
done
-
- echo $rate1 '-->' $rate2 c=$c q=v
- time $tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext > /dev/null
-
- rate1=44100
- rate2=$rate0
+ rate1=$rate0
+ rate2=44100
done
done
diff --git a/soxr/tests/vector-cmp.c b/soxr/tests/vector-cmp.c
index 6edd2d5..f90cc7f 100644
--- a/soxr/tests/vector-cmp.c
+++ b/soxr/tests/vector-cmp.c
@@ -1,53 +1,56 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
/* Utility used to help test the library; not for general consumption.
*
- * Compare two swept-sine files. */
+ * Measure the peak bit difference between two files. */
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
#include "../src/rint.h"
+#include "../examples/examples-common.h"
-int main(int bit, char const * arg[])
+#define TYPE 0 /* As vector-gen */
+
+#if TYPE
+ #define sample_t double
+ #define N 50
+ #define DIFF(s1,s2) abs(rint32((s1-s2)*ldexp(1,N-1)))
+#else
+ #define sample_t int32_t
+ #define N 32
+ #define DIFF(s1,s2) abs((int)(s1-s2))
+#endif
+
+int main(int argc, char const * arg[])
{
- FILE * f1 = fopen(arg[1], "rb"),
- * f2 = fopen(arg[2], "rb");
- double rate = atof (arg[3]), /* Rate for this vector */
- leader_len = atof (arg[4]), /* Leader length in seconds */
- len = atof (arg[5]), /* Sweep length (excl. leader_len) */
- expect_bits= atof (arg[6]),
- expect_bw = atof (arg[7]);
+ int two = !!arg[2][0];
+ FILE * f1 = fopen(arg[1], "rb"), * f2 = two? fopen(arg[2], "rb") : 0;
+ double rate = atof (arg[3]), /* Sample-rate */
+ skip_len = atof (arg[4]), /* Skip length in seconds */
+ len = atof (arg[5]), /* Compare length in seconds */ r;
+ int i = 0, count = rint32(rate * len), max = 0, diff;
+ sample_t s1, s2;
- int32_t s1, s2;
- long count = 0;
- static long thresh[32];
- double bw, prev = 0;
-
- for (; fread(&s1, sizeof(s1), 1, f1) == 1 &&
- fread(&s2, sizeof(s2), 1, f2) == 1; ++count) {
- long diff = abs((int)(s1 - s2));
- for (bit = 0; diff && bit < 32; bit++, diff >>= 1)
- if ((diff & 1) && !thresh[bit])
- thresh[bit] = count + 1;
- }
-
- if (count != (long)((leader_len + len) * rate + .5)) {
- printf("incorrect file length\n");
- exit(1);
- }
-
- for (bit = 0; bit < 32; ++bit) {
- bw = ((double)thresh[bit] - 1) / rate - leader_len;
- if (bit && bw >= 0 && (bw - prev) * 100 / len < .08) {
- --bit;
- break;
+ fseek(f1, rint32(rate * skip_len) * (int)sizeof(s1), SEEK_CUR);
+ if (two) {
+ fseek(f2, rint32(rate * skip_len) * (int)sizeof(s2), SEEK_CUR);
+ for (; i < count &&
+ fread(&s1, sizeof(s1), 1, f1) &&
+ fread(&s2, sizeof(s2), 1, f2); ++i) {
+ diff = DIFF(s1, s2);
+ max = max(max, diff);
}
- prev = bw;
}
- bit = 32 - bit;
- bw = bw * 100 / len;
- printf("Bit perfect to %i bits, from DC to %.2f%% nyquist.\n", bit, bw);
- return !(bit >= expect_bits && bw >= expect_bw);
+ else for (; i < count && fread(&s1, sizeof(s1), 1, f1); ++i) {
+ diff = DIFF(s1, 0);
+ max = max(max, diff);
+ }
+
+ if (i != count) {
+ fprintf(stderr, "incorrect file length\n");
+ return 1;
+ }
+ printf("%f\n", r = N-log(max)/log(2));
+  return argc>6? r < atof(arg[6]) : 0;
+}
diff --git a/soxr/tests/vector-gen.c b/soxr/tests/vector-gen.c
--- a/soxr/tests/vector-gen.c
+++ b/soxr/tests/vector-gen.c
-#if QUAD
+#if TYPE > 1
 #include <quadmath.h>
#endif
-#include "../examples/examples-common.h"
+#include "math-wrap.h"
+#include <stdlib.h>
+#include <stdio.h>
-#if QUAD
- #define modf modfq
- #define cos cosq
- #define sin sinq
- #undef M_PI
- #define M_PI M_PIq
- #define real __float128
- #define atof(x) strtoflt128(x, 0)
+#if TYPE
+ #if TYPE > 1
+ #define modf modfq
+ #define cos cosq
+ #define sin sinq
+ #define PI M_PIq
+ #define real __float128
+ #define atof(x) strtoflt128(x, 0)
+ #else
+ #define modf modfl
+ #define cos cosl
+ #define sin sinl
+ #define PI M_PIl
+ #define real long double
+ #endif
+ #define MULT 1
+ #define OUT(d) double output = d
#else
+ #define PI M_PI
#define real double
#include "rint.h"
+ #define MULT (32768. * 65536 - 1/scale)
+ #define OUT(d) int32_t output = rint32(d)
#endif
-int main(int i, char const * argv[])
+int main(int argc, char const * argv[])
{
- real rate = atof(argv[1]), /* Rate for this vector */
- lead_in_len = atof(argv[2]), /* Lead-in length in seconds */
- len = atof(argv[3]), /* Sweep length (excl. lead_in_len) */
- sweep_to_freq = atof(argv[4]), /* Sweep from DC to this freq. */
- multiplier = atof(argv[5]), /* For headroom */
- f1 = -sweep_to_freq / len * lead_in_len, f2 = sweep_to_freq,
- n1 = rate * -lead_in_len, n2 = rate * len,
- m = (f2 - f1) / (n2 - n1) / 2, dummy;
- FILE * file = fopen(argv[6], "wb");
- i = (int)n1;
- if (!file || i != n1)
- exit(1);
- for (; i < (int)(n2 + .5); ++i) {
- double d1 = multiplier * sin(2 * M_PI * modf(i * m * i / rate, &dummy));
- double d = i < 0? d1 * (1 - cos(M_PI * (i + n1) / n1)) * .5 : d1;
-#if QUAD
- size_t actual = fwrite(&d, sizeof(d), 1, file);
-#else
- int32_t out = rint32(d * (32768. * 65536 - 1));
- size_t actual = fwrite(&out, sizeof(out), 1, file);
-#endif
- if (actual != 1)
- return 1;
+ real rate = atof(argv[1]), /* Rate for this vector */
+ lead_in_len = atof(argv[2]), /* Lead-in length in seconds */
+ len = atof(argv[3]), /* Sweep length (excl. lead_in_len) */
+ f1 = atof(argv[4]),
+ f2 = atof(argv[5]),
+ scale = atof(argv[6]), /* For headroom */
+ n1 = rate * -lead_in_len,
+ m = (f2 - f1) / (rate * len * 2), dummy;
+ FILE * file = fopen(argv[7], "wb");
+ int i = (int)n1, err = !file || i != n1;
+ for (; !err && i < (int)(rate*(len+lead_in_len)+.5); ++i) {
+ real d = sin(2 * PI * modf((f1 + i * m) * i / rate, &dummy));
+ OUT((double)(scale * MULT * d));
+ err = fwrite(&output, sizeof(output), 1, file) != 1;
}
- return 0;
+ return err |!argc;
}
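
The rewritten generator sweeps between two arbitrary frequencies f1 and f2 instead of always starting from DC. To see that m = (f2 - f1) / (rate * len * 2) yields a linear sweep, write R for the rate, L for the sweep length, and phi(i) = (f1 + m i) i / R for the phase in cycles at sample i; the instantaneous frequency is then

    f(i) = R\,\frac{d\phi}{di} = f_1 + 2mi, \qquad m = \frac{f_2 - f_1}{2RL},

so f(0) = f1 and f(RL) = f2 at the end of the L-second sweep. The modf call merely folds the accumulated phase into [0,1) before it is scaled by 2*PI and handed to sin, avoiding precision loss for large arguments.
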