diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85eab0a..584a36e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,7 +73,7 @@ endif ()
set(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}")
-add_subdirectory(soxr/src)
+add_subdirectory(soxr)
add_library(boo
lib/audiodev/Common.hpp
diff --git a/soxr/.gitignore b/soxr/.gitignore
new file mode 100644
index 0000000..ac1dff9
--- /dev/null
+++ b/soxr/.gitignore
@@ -0,0 +1,2 @@
+Release*/
+Debug*/
diff --git a/soxr/CMakeLists.txt b/soxr/CMakeLists.txt
index 61bd596..76950ae 100644
--- a/soxr/CMakeLists.txt
+++ b/soxr/CMakeLists.txt
@@ -1,30 +1,36 @@
-# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
-cmake_minimum_required (VERSION 2.8 FATAL_ERROR)
-cmake_policy(SET CMP0075 OLD)
-
-#project (soxr C)
-#set (DESCRIPTION_SUMMARY "High quality, one-dimensional sample-rate conversion library")
+cmake_minimum_required (VERSION 3.1 FATAL_ERROR)
+project (soxr C)
+set (DESCRIPTION_SUMMARY
+ "High quality, one-dimensional sample-rate conversion library")
+cmake_policy(SET CMP0075 NEW)
+cmake_policy(SET CMP0115 OLD)
+cmake_policy(SET CMP0127 OLD)
# Release versioning:
set (PROJECT_VERSION_MAJOR 0)
set (PROJECT_VERSION_MINOR 1)
-set (PROJECT_VERSION_PATCH 2)
+set (PROJECT_VERSION_PATCH 3)
# For shared-object; if, since the last public release:
-# * library code changed at all: ++revision
-# * interfaces changed at all: ++current, revision = 0
-# * interfaces added: ++age
-# * interfaces removed: age = 0
+# 1) library code changed at all: ++revision
+# 2) interfaces changed at all: ++current, revision = 0
+# 3) interfaces added: ++age
+# 4) interfaces removed: age = 0
set (SO_VERSION_CURRENT 1)
-set (SO_VERSION_REVISION 1)
+set (SO_VERSION_REVISION 2)
set (SO_VERSION_AGE 1)
+math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}")
+math (EXPR SO_VERSION_MINOR "${SO_VERSION_AGE}")
+math (EXPR SO_VERSION_PATCH "${SO_VERSION_REVISION}")
+
# Main options:
@@ -32,112 +38,147 @@ set (SO_VERSION_AGE 1)
include (CMakeDependentOption)
if (NOT CMAKE_BUILD_TYPE)
- set (CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
+ set (CMAKE_BUILD_TYPE Release CACHE STRING
+ "Build type, one of: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
endif ()
-#option (BUILD_TESTS "Build sanity-tests." ON)
-#option (BUILD_SHARED_LIBS "Build shared libraries." ON)
-#option (BUILD_EXAMPLES "Build examples." OFF)
-option (WITH_LSR_BINDINGS "Include a `libsamplerate'-like interface." ON)
-cmake_dependent_option (WITH_SINGLE_PRECISION "Build with single precision (for up to 20-bit accuracy)." ON
- "WITH_DOUBLE_PRECISION" ON)
-cmake_dependent_option (WITH_DOUBLE_PRECISION "Build with double precision (for up to 32-bit accuracy)." ON
- "WITH_SINGLE_PRECISION" ON)
-cmake_dependent_option (WITH_SIMD "Use SIMD (for faster single precision)." ON
- "WITH_SINGLE_PRECISION" OFF)
-cmake_dependent_option (WITH_AVFFT "Use libavcodec (LGPL) for SIMD DFT." OFF
- "WITH_SIMD;NOT WITH_PFFFT" OFF)
-cmake_dependent_option (WITH_PFFFT "Use PFFFT (BSD-like licence) for SIMD DFT." ON
- "WITH_SIMD;NOT WITH_AVFFT" OFF)
-option (SOXR_SILENT "Disable debug messages, even in debug mode" OFF)
-if (UNIX)
- if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/lsr-tests)
- cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF
- "WITH_LSR_BINDINGS" OFF)
- endif ()
-endif ()
+option (BUILD_TESTS "Build sanity-tests." OFF)
+option (BUILD_EXAMPLES "Build examples." OFF)
+option (WITH_OPENMP "Include OpenMP threading." OFF)
+option (WITH_LSR_BINDINGS "Include a `libsamplerate'-like interface." OFF)
+
+cmake_dependent_option (BUILD_SHARED_LIBS
+ "Build shared (dynamic) soxr libraries." OFF
+ "NOT WITH_DEV_GPROF" OFF)
+cmake_dependent_option (WITH_VR32
+ "Include HQ variable-rate resampling engine." ON
+ "WITH_CR32 OR WITH_CR64 OR WITH_CR32S OR WITH_CR64S OR NOT DEFINED WITH_VR32" ON)
+cmake_dependent_option (WITH_CR32
+ "Include HQ constant-rate resampling engine." ON
+ "WITH_VR32 OR WITH_CR64 OR WITH_CR32S OR WITH_CR64S" ON)
+cmake_dependent_option (WITH_CR64
+ "Include VHQ constant-rate resampling engine." ON
+ "WITH_VR32 OR WITH_CR32 OR WITH_CR32S OR WITH_CR64S" ON)
+cmake_dependent_option (WITH_CR64S
+ "Include VHQ SIMD constant-rate resampling engine." ON
+ "WITH_VR32 OR WITH_CR32 OR WITH_CR32S OR WITH_CR64" ON)
+cmake_dependent_option (WITH_CR32S
+ "Include HQ SIMD constant-rate resampling engine." ON
+ "WITH_VR32 OR WITH_CR64 OR WITH_CR32 OR WITH_CR64S" ON)
+cmake_dependent_option (WITH_PFFFT
+ "Use PFFFT (BSD-like licence) for HQ SIMD DFT." ON
+ "WITH_CR32S;NOT WITH_AVFFT" OFF)
+cmake_dependent_option (WITH_AVFFT
+ "Use libavcodec (LGPL) for HQ SIMD DFT." OFF
+ "WITH_CR32S;NOT WITH_PFFFT" OFF)
+cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF
+ "UNIX;NOT CMAKE_CROSSCOMPILING;EXISTS ${PROJECT_SOURCE_DIR}/lsr-tests;WITH_LSR_BINDINGS" OFF)
+
+option (WITH_HI_PREC_CLOCK "Enable high-precision time-base." ON)
+option (WITH_FLOAT_STD_PREC_CLOCK
+ "Use floating-point for standard-precision time-base." OFF)
+option (WITH_DEV_TRACE "Enable developer trace capability." ON)
+option (WITH_DEV_GPROF "Enable developer gprof output." OFF)
+mark_as_advanced (WITH_HI_PREC_CLOCK WITH_FLOAT_STD_PREC_CLOCK
+ WITH_DEV_TRACE WITH_DEV_GPROF)
# Introspection:
-list (APPEND CMAKE_MODULE_PATH ${CMAKE_SOURCE_DIR}/cmake/Modules)
+list (APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)
include (CheckFunctionExists)
include (CheckIncludeFiles)
include (CheckLibraryExists)
-include (TestBigEndian)
+include (SetSystemProcessor)
+if (NOT EMSCRIPTEN)
+ include(TestBigEndian)
+endif ()
+
+set_system_processor ()
check_library_exists (m pow "" NEED_LIBM)
if (NEED_LIBM)
set (CMAKE_REQUIRED_LIBRARIES "m;${CMAKE_REQUIRED_LIBRARIES}")
- link_libraries (m)
+ set (LIBM_LIBRARIES m)
endif ()
-#if (WITH_OPENMP)
-# find_package (OpenMP)
-# if (OPENMP_FOUND)
-# set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-# set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
-# set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_SHARED_LINKER_FLAGS}")
-# endif ()
-#endif ()
-
-if (WITH_SIMD)
- find_package (SIMD)
- if (SIMD_FOUND)
- set (HAVE_SIMD 1)
- endif ()
+if (${BUILD_EXAMPLES})
+ project (${PROJECT_NAME}) # Adds c++ compiler
endif ()
-if (WITH_SINGLE_PRECISION)
- set (HAVE_SINGLE_PRECISION 1)
+if (WITH_OPENMP)
+ find_package (OpenMP)
+ if (OPENMP_FOUND)
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ if (MINGW) # Is this still needed?
+ set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+ set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+ endif ()
+ endif()
endif ()
-if (WITH_DOUBLE_PRECISION)
- set (HAVE_DOUBLE_PRECISION 1)
+if (WITH_CR32S)
+ find_package (SIMD32)
+ set (WITH_CR32S ${SIMD32_FOUND})
+endif ()
+
+if (WITH_CR64S)
+ find_package (SIMD64)
+ set (WITH_CR64S ${SIMD64_FOUND})
endif ()
if (WITH_AVFFT)
- find_package (LibAVCodec)
+ find_package (LibAVCodec REQUIRED)
if (AVCODEC_FOUND)
include_directories (${AVCODEC_INCLUDE_DIRS})
- link_libraries (${AVCODEC_LIBRARIES})
- set (HAVE_AVFFT 1)
+ set (LIBS ${LIBS} ${AVCODEC_LIBRARIES})
endif ()
endif ()
-if (SOXR_SILENT)
- add_definitions (-DSOXR_SILENT=1)
+if (WITH_AVFFT OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND SIMD32_FOUND AND WITH_CR32))
+ find_package (LibAVUtil)
+ if (AVUTIL_FOUND)
+ include_directories (${AVUTIL_INCLUDE_DIRS})
+ set (LIBS ${LIBS} ${AVUTIL_LIBRARIES})
+ endif ()
endif ()
check_function_exists (lrint HAVE_LRINT)
check_include_files (fenv.h HAVE_FENV_H)
-test_big_endian (WORDS_BIGENDIAN)
-
-macro (make_exist)
- foreach (x ${ARGN})
- if (NOT ${x})
- set (${x} 0)
- endif ()
- endforeach ()
-endmacro ()
-
-make_exist (HAVE_LRINT HAVE_FENV_H WORDS_BIGENDIAN HAVE_SIMD)
-make_exist (HAVE_SINGLE_PRECISION HAVE_DOUBLE_PRECISION HAVE_AVFFT)
+check_include_files (stdbool.h HAVE_STDBOOL_H)
+check_include_files (stdint.h HAVE_STDINT_H)
+if (EMSCRIPTEN)
+ set(HAVE_BIGENDIAN OFF)
+else()
+ test_big_endian (HAVE_BIGENDIAN)
+endif()
# Compiler configuration:
-if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX)
- set (PROJECT_CXX_FLAGS "-Wconversion -Wall -W -Wundef -Wcast-align -Wpointer-arith -Wno-long-long")
- set (PROJECT_C_FLAGS "${PROJECT_CXX_FLAGS} -Wnested-externs -Wmissing-prototypes -Wstrict-prototypes")
+if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
+ set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -Wconversion -Wall -Wextra \
+ -pedantic -Wundef -Wpointer-arith -Wno-long-long")
+ if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+ set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -Wno-keyword-macro")
+ endif ()
+ if (WITH_DEV_GPROF)
+ set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -pg")
+ endif ()
+ # Can use std=c89, but gnu89 should give faster sinf, cosf, etc.:
+ set (PROJECT_C_FLAGS "${PROJECT_CXX_FLAGS} \
+ -std=gnu89 -Wnested-externs -Wmissing-prototypes -Wstrict-prototypes")
if (CMAKE_BUILD_TYPE STREQUAL "Release")
set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s") # strip
endif ()
- cmake_dependent_option (VISIBILITY_HIDDEN "Build with -fvisibility=hidden." ON
+ cmake_dependent_option (VISIBILITY_HIDDEN
+ "Build shared libraries with -fvisibility=hidden." ON
"BUILD_SHARED_LIBS" OFF)
+ mark_as_advanced (VISIBILITY_HIDDEN)
if (VISIBILITY_HIDDEN)
add_definitions (-fvisibility=hidden -DSOXR_VISIBILITY)
endif ()
@@ -145,9 +186,14 @@ endif ()
if (MSVC)
add_definitions (-D_USE_MATH_DEFINES -D_CRT_SECURE_NO_WARNINGS)
- option (ENABLE_STATIC_RUNTIME "Visual Studio, link with runtime statically." OFF)
- if (ENABLE_STATIC_RUNTIME)
- foreach (flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+ option (BUILD_SHARED_RUNTIME "MSVC, link with runtime dynamically." ON)
+ if (NOT BUILD_SHARED_RUNTIME)
+ foreach (flag_var
+ CMAKE_C_FLAGS CMAKE_CXX_FLAGS
+ CMAKE_C_FLAGS_DEBUG CMAKE_CXX_FLAGS_DEBUG
+ CMAKE_C_FLAGS_RELEASE CMAKE_CXX_FLAGS_RELEASE
+ CMAKE_C_FLAGS_MINSIZEREL CMAKE_CXX_FLAGS_MINSIZEREL
+ CMAKE_C_FLAGS_RELWITHDEBINFO CMAKE_CXX_FLAGS_RELWITHDEBINFO)
string (REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
endforeach ()
endif ()
@@ -161,8 +207,9 @@ endif ()
# Build configuration:
-if (${BUILD_SHARED_LIBS} AND ${CMAKE_SYSTEM_NAME} STREQUAL Windows) # Allow exes to find dlls:
- set (BIN ${CMAKE_CURRENT_BINARY_DIR}/bin/)
+if (${BUILD_SHARED_LIBS} AND ${CMAKE_SYSTEM_NAME} STREQUAL Windows)
+ # Allow exes to find dlls:
+ set (BIN ${PROJECT_BINARY_DIR}/bin/)
set (EXAMPLES_BIN ${BIN})
set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${BIN})
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${BIN})
@@ -179,6 +226,10 @@ if (BUILD_SHARED_LIBS)
endif ()
endif ()
+if (CMAKE_BUILD_TYPE STREQUAL "None") # As used by some distros.
+ add_definitions (-DNDEBUG)
+endif ()
+
# Installation configuration:
@@ -194,7 +245,7 @@ if (NOT DEFINED INCLUDE_INSTALL_DIR)
endif ()
if (NOT DEFINED DOC_INSTALL_DIR)
if (UNIX)
- set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/share/doc/libsoxr")
+ set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/share/doc/lib${PROJECT_NAME}")
else ()
set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/doc")
endif ()
@@ -202,25 +253,24 @@ endif ()
if (APPLE)
option (BUILD_FRAMEWORK "Build an OS X framework." OFF)
- set (FRAMEWORK_INSTALL_DIR "/Library/Frameworks" CACHE STRING "Directory to install frameworks to.")
+ set (FRAMEWORK_INSTALL_DIR
+ "/Library/Frameworks" CACHE STRING "Directory to install frameworks to.")
endif ()
# Top-level:
-set (PROJECT_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH})
-math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}")
-math (EXPR SO_VERSION_MINOR "${SO_VERSION_AGE}")
-math (EXPR SO_VERSION_PATCH "${SO_VERSION_REVISION}")
+set (PROJECT_VERSION
+ ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH})
set (SO_VERSION ${SO_VERSION_MAJOR}.${SO_VERSION_MINOR}.${SO_VERSION_PATCH})
configure_file (
- ${CMAKE_CURRENT_SOURCE_DIR}/soxr-config.h.in
- ${CMAKE_CURRENT_BINARY_DIR}/soxr-config.h)
-include_directories (${CMAKE_CURRENT_BINARY_DIR})
+ ${PROJECT_SOURCE_DIR}/${PROJECT_NAME}-config.h.in
+ ${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config.h)
+include_directories (${PROJECT_BINARY_DIR})
-if (BUILD_TESTS OR BUILD_LSR_TESTS)
+if (NOT CMAKE_CROSSCOMPILING AND (BUILD_TESTS OR BUILD_LSR_TESTS))
enable_testing ()
endif ()
@@ -234,7 +284,7 @@ install (FILES
# Subdirectories:
-include_directories (${CMAKE_CURRENT_SOURCE_DIR}/src)
+include_directories (${PROJECT_SOURCE_DIR}/src)
add_subdirectory (src)
if (BUILD_TESTS)
@@ -249,55 +299,45 @@ endif ()
-# Rough-and-ready distclean for anyone still doing in-tree builds:
+# GNU Autotools compatibility; 'make check':
-#if (UNIX)
-# add_custom_target (distclean
-# COMMAND make clean && rm -rf
-# CMakeCache.txt
-# CMakeFiles
-# cmake_install.cmake
-# CPackConfig.cmake
-# CPackSourceConfig.cmake
-# deinstall.cmake
-# Makefile
-# soxr-config.h
-# src/CMakeFiles
-# src/cmake_install.cmake
-# src/libsoxr-dev.src
-# src/libsoxr-lsr.pc
-# src/libsoxr.pc
-# src/libsoxr.src
-# src/Makefile)
-#endif ()
+add_custom_target (check COMMAND ${CMAKE_CTEST_COMMAND})
+
+
+
+# GNU Autotools compatibility; 'make distclean':
+
+if (UNIX)
+ add_custom_target (distclean COMMAND make clean && find .
+ \\! -path \\*/Modules/\\* \\! -name cmp-test.cmake -a -name \\*.cmake
+ -o -name CMakeFiles -o -name Makefile -o -name CMakeCache.txt -o -name
+ Testing -o -name cmake_install.cmake -o -name install_manifest.txt -o
+ -path ./soxr-config.h -o -name config.h -o -name \\*.pc -o -name \\*.s32
+ | xargs rm -rf)
+endif ()
# Deinstallation:
-#configure_file (
-# "${CMAKE_CURRENT_SOURCE_DIR}/deinstall.cmake.in"
-# "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake"
-# IMMEDIATE @ONLY)
+configure_file (
+ "${CMAKE_CURRENT_SOURCE_DIR}/deinstall.cmake.in"
+ "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake"
+ IMMEDIATE @ONLY)
-#add_custom_target (deinstall
-# COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake")
+add_custom_target (deinstall
+ COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake")
# Packaging:
-#if (UNIX)
-# set (CPACK_PACKAGE_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}")
-# set (CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
-# set (CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
-#
-# set (CPACK_SOURCE_GENERATOR "TGZ")
-# set (CPACK_SOURCE_IGNORE_FILES "dist;/lsr-tests/;/Debug/;/Release/;/cpack/;\\\\.swp$;\\\\.gitignore;/\\\\.git/")
-#
-# include (CPack)
-#
-# if (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/cpack)
-# add_subdirectory (cpack)
-# endif ()
-#endif ()
+if (UNIX)
+ set (CPACK_PACKAGE_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}")
+ set (CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
+ set (CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
+ set (CPACK_SOURCE_GENERATOR "TXZ")
+ set (CPACK_SOURCE_IGNORE_FILES
+ "dist;/lsr-tests/;/Debug.*/;/Release.*/;\\\\.swp$;\\\\.git.*;/\\\\.git/")
+ include (CPack)
+endif ()
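
A note on the shared-object versioning introduced above: the three math(EXPR)
lines map the libtool-style current/revision/age numbers onto a conventional
major.minor.patch file version. A quick check of the arithmetic with the
values set in this file (current = 1, revision = 2, age = 1), as a small C
sketch:

  #include <stdio.h>

  int main(void)
  {
    int const current = 1, revision = 2, age = 1; /* values from above */
    /* major = current - age; minor = age; patch = revision */
    printf("libsoxr.so.%d.%d.%d\n", current - age, age, revision);
    return 0; /* prints: libsoxr.so.0.1.2 */
  }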
diff --git a/soxr/INSTALL b/soxr/INSTALL
index c2c7675..5599870 100644
--- a/soxr/INSTALL
+++ b/soxr/INSTALL
@@ -1,11 +1,12 @@
-SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
INSTALLATION GUIDE CONTENTS
* Standard build
* Build customisation
-* Cross-compiling with mingw (linux host)
+* Cross-compilation
* Integration with other build systems
+* Run-time configuration
@@ -20,7 +21,7 @@ STANDARD BUILD
* A 'make' utility (most compiler installations already have one of these).
- * CMake: http://www.cmake.org/cmake/resources/software.html
+ * CMake v3.0 or newer: https://cmake.org/download/
2. Build:
@@ -30,7 +31,7 @@ STANDARD BUILD
go (on MS-Windows with nmake)
or
- ./go (on unix-like systems)
+ ./go (on Unix-like systems)
This should build the library and run a few sanity tests.
@@ -38,14 +39,14 @@ STANDARD BUILD
3. Installation:
Note that this step may need to be performed by a system
- adminstrator. Enter:
+ administrator. Enter:
nmake install (on MS-Windows)
or
- cd Release; make install (on unix)
+ cd Release; make install (on Unix-like)
-4. Configuration:
+4. Preparation for use:
To use the library you may need to set up appropriate paths to the
library and its header file in your development environment.
@@ -60,38 +61,74 @@ STANDARD BUILD
BUILD CUSTOMISATION
-If it is necessary to customise the build, then steps 2 and 3 above may be
-substituted as follows. Change directory to the one containing this file,
-then enter commands along the lines of:
+If it is necessary to customise the build, then steps 2 and 3 above should be
+substituted as follows: change directory to the one containing this file, then
+enter commands along the lines of:
mkdir build
cd build
- cmake [OPTIONS] ..
+ cmake -Wno-dev -DCMAKE_BUILD_TYPE=Release [OPTIONS] ..
make
make test
sudo make install
+N.B. The CMAKE_BUILD_TYPE to use for library deployment is Release.
+
To list help on the available options, enter:
cmake -LH ..
Options, if given, should be preceded with '-D', e.g.
- cmake -DWITH_SIMD:BOOL=OFF ..
+ -DBUILD_SHARED_LIBS:BOOL=OFF
-CROSS-COMPILING WITH MINGW (LINUX HOST)
+Resampling engines
-For example:
+Depending on what is available on a given system, options for including up
+to five resampling ‘engines’ are provided (per above) as follows:
+
+ WITH_CR32: for constant-rate HQ resampling,
+ WITH_CR32S: SIMD variant of previous,
+ WITH_CR64: for constant-rate VHQ resampling,
+ WITH_CR64S: SIMD variant of previous,
+ WITH_VR32: for variable-rate HQ resampling.
+
+By default, these options are all set to ON.
+
+When both SIMD and non-SIMD engine variants are included, run-time selection
+is automatic (based on CPU capability) for x86 CPUs, and can be automatic for
+ARM CPUs if the 3rd-party library `libavutil' is available at libsoxr
+build-time. Which engine has been selected for a specific configuration and
+invocation of the library can be checked using example #3, which reports it.
+See also Run-time Configuration, below.
+
+
+
+CROSS-COMPILATION
+
+E.g. targeting a Linux ARM system:
+
+ mkdir build
+ cd build
+ cmake -DCMAKE_SYSTEM_NAME=Linux \
+ -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc \
+ ..
+or, also building the examples (one of which uses C++):
+
+ cmake -DCMAKE_SYSTEM_NAME=Linux \
+ -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc \
+ -DCMAKE_CXX_COMPILER=arm-linux-gnueabi-g++ \
+ -DBUILD_EXAMPLES=1 \
+ ..
+
+E.g. with Mingw (Linux host), using a tool-chain file:
mkdir build
cd build
cmake -DCMAKE_TOOLCHAIN_FILE=~/Toolchain-x86_64-mingw-w64-mingw32.cmake \
-DCMAKE_INSTALL_PREFIX=install \
- -DHAVE_WORDS_BIGENDIAN_EXITCODE=1 \
- -DBUILD_TESTS=0 \
- -DBUILD_EXAMPLES=1 \
..
make
@@ -117,7 +154,30 @@ INTEGRATION WITH OTHER BUILD SYSTEMS
Autotools-based systems might find it useful to create a file called
`configure' in the directory containing this file, consisting of the line:
- cmake -DBUILD_SHARED_LIBS=OFF .
+ cmake -DBUILD_SHARED_LIBS=OFF .
(or with other build options as required).
-For MS visual studio, see msvc/README
+For MS Visual Studio, see msvc/README.
+
+
+
+RUN-TIME CONFIGURATION
+
+The libsoxr API structure ‘soxr_runtime_spec_t’ allows application developers
+to optimise some aspects of libsoxr’s operation for a particular application.
+Optimal performance, however, might depend on an individual end-user’s run-
+time system and the end-user’s preferences. Hence environment variables are
+available to set (override) run-time parameters as follows:
+
+ Env. variable Equivalent soxr_runtime_spec_t item (see soxr.h)
+ ------------------ -----------------------------------
+ SOXR_COEFS_SIZE coef_size_kbytes
+ SOXR_COEF_INTERP SOXR_COEF_INTERP_xxx
+ SOXR_LARGE_DFT_SIZE log2_large_dft_size
+ SOXR_MIN_DFT_SIZE log2_min_dft_size
+ SOXR_NUM_THREADS num_threads
+
+Additionally, the SOXR_USE_SIMD32 and SOXR_USE_SIMD64 boolean environment
+variables can be used to override automatic selection (or to provide manual
+selection where automatic selection is not available) between SIMD and
+non-SIMD engine variants.
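
The table above pairs each environment variable with a soxr_runtime_spec_t
item. As a minimal C sketch of the programmatic side — assuming soxr.h's
NULL-means-default convention for the io and quality specs, with arbitrary
rates and channel count — setting, say, SOXR_NUM_THREADS in the environment
would override the equivalent item below at run time:

  #include <soxr.h>

  int main(void)
  {
    soxr_error_t error;
    /* Programmatic equivalent of SOXR_NUM_THREADS=1: */
    soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(1);
    soxr_t soxr = soxr_create(48000, 44100, 1, /* irate, orate, channels */
        &error, NULL, NULL, &runtime_spec);    /* NULL => default specs */
    if (!error)
      soxr_delete(soxr);
    return !!error;
  }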
diff --git a/soxr/LICENCE b/soxr/LICENCE
index 1c61878..43e5a71 100644
--- a/soxr/LICENCE
+++ b/soxr/LICENCE
@@ -1,4 +1,4 @@
-SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
This library is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
@@ -11,8 +11,7 @@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser
General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
-along with this library; if not, write to the Free Software Foundation,
-Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+along with this library; if not, see <https://www.gnu.org/licenses/>.
Notes
diff --git a/soxr/NEWS b/soxr/NEWS
index f388974..9e7c298 100644
--- a/soxr/NEWS
+++ b/soxr/NEWS
@@ -1,3 +1,12 @@
+Version 0.1.3 (2018-02-24)
+ * SIMD enhancements: SSE, AVX, Neon.
+ * Improve support for clang, ARM, and cross-compilation.
+ * Provide env. var. override of runtime parameters.
+ * Build fix re cmake variables AVCODEC_INCLUDE_DIRS & AVUTIL_INCLUDE_DIRS.
+ * Build options WITH_SINGLE_PRECISION, WITH_DOUBLE_PRECISION & WITH_SIMD have
+ been removed; replacement options are detailed in INSTALL, `Resampling
+ engines'.
+
Version 0.1.2 (2015-09-05)
* Fix conversion failure when I/O types differ but I/O rates don't.
* Fix #defines for interpolation order selection.
diff --git a/soxr/README b/soxr/README
index 06f11e6..7f9a7af 100644
--- a/soxr/README
+++ b/soxr/README
@@ -1,4 +1,4 @@
-SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
The SoX Resampler library `libsoxr' performs one-dimensional sample-rate
conversion -- it may be used, for example, to resample PCM-encoded audio.
@@ -46,7 +46,7 @@ size configuration parameters may be used to reduce this figure).
For build and installation instructions, see the file `INSTALL'; for
copyright and licensing information, see the file `LICENCE'.
-For support and new versions, see http://soxr.sourceforge.net
+For support and new versions, see https://soxr.sourceforge.net
________
¹ For example, multi-channel resampling can utilise multiple CPU-cores.
² Bit-perfect within practical occupied-bandwidth limits.
diff --git a/soxr/TODO b/soxr/TODO
index 1c4a31b..2d1bc19 100644
--- a/soxr/TODO
+++ b/soxr/TODO
@@ -1,3 +1,3 @@
-* SOXR_ALLOW_ALIASING
-* Explicit flush API fn, perhaps.
-* More SIMD.
+* vr32s
+* vr32 with 1-delay-clear
+* fir_to_phase with RDFT32
diff --git a/soxr/cmake/Modules/FindCFlags.cmake b/soxr/cmake/Modules/FindCFlags.cmake
new file mode 100644
index 0000000..f118727
--- /dev/null
+++ b/soxr/cmake/Modules/FindCFlags.cmake
@@ -0,0 +1,35 @@
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Function to find C compiler feature flags
+
+include (CheckCSourceCompiles)
+include (FindPackageHandleStandardArgs)
+
+function (FindCFlags PKG_NAME PKG_DESC TRIAL_C_FLAGS TEST_C_SOURCE)
+
+foreach (TRIAL_C_FLAG ${TRIAL_C_FLAGS})
+ message (STATUS "Trying ${PKG_NAME} C flags: ${TRIAL_C_FLAG}")
+ unset (DETECT_${PKG_NAME}_C_FLAGS CACHE) #displayed by check_c_source_compiles
+
+ set (TMP "${CMAKE_REQUIRED_FLAGS}")
+ set (CMAKE_REQUIRED_FLAGS "${TRIAL_C_FLAG}")
+ check_c_source_compiles ("${TEST_C_SOURCE}" DETECT_${PKG_NAME}_C_FLAGS)
+ set (CMAKE_REQUIRED_FLAGS "${TMP}")
+
+ if (DETECT_${PKG_NAME}_C_FLAGS)
+ set (DETECTED_C_FLAGS "${TRIAL_C_FLAG}")
+ break ()
+ endif ()
+endforeach ()
+
+# N.B. Will not overwrite existing cache variable:
+set (${PKG_NAME}_C_FLAGS "${DETECTED_C_FLAGS}"
+ CACHE STRING "C compiler flags for ${PKG_DESC}")
+
+find_package_handle_standard_args (
+ ${PKG_NAME} DEFAULT_MSG ${PKG_NAME}_C_FLAGS ${PKG_NAME}_C_FLAGS)
+mark_as_advanced (${PKG_NAME}_C_FLAGS)
+set (${PKG_NAME}_FOUND ${${PKG_NAME}_FOUND} PARENT_SCOPE)
+
+endfunction ()
diff --git a/soxr/cmake/Modules/FindLibAVCodec.cmake b/soxr/cmake/Modules/FindLibAVCodec.cmake
index add33c3..f1bbf89 100644
--- a/soxr/cmake/Modules/FindLibAVCodec.cmake
+++ b/soxr/cmake/Modules/FindLibAVCodec.cmake
@@ -1,23 +1,23 @@
-# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
# - Find AVCODEC
-# Find the native installation of this package: includes and libraries.
+# Find the installation of this package: include-dirs and libraries.
#
-# AVCODEC_INCLUDES - where to find headers for this package.
-# AVCODEC_LIBRARIES - List of libraries when using this package.
-# AVCODEC_FOUND - True if this package can be found.
+# AVCODEC_INCLUDE_DIRS - where to find headers for this package.
+# AVCODEC_LIBRARIES - libraries to link to when using this package.
+# AVCODEC_FOUND - true iff this package can be found.
-if (AVCODEC_INCLUDES)
+if (AVCODEC_INCLUDE_DIRS)
set (AVCODEC_FIND_QUIETLY TRUE)
-endif (AVCODEC_INCLUDES)
+endif ()
-find_path (AVCODEC_INCLUDES libavcodec/avcodec.h)
+find_path (AVCODEC_INCLUDE_DIRS libavcodec/avcodec.h)
find_library (AVCODEC_LIBRARIES NAMES avcodec)
include (FindPackageHandleStandardArgs)
find_package_handle_standard_args (
- AVCODEC DEFAULT_MSG AVCODEC_LIBRARIES AVCODEC_INCLUDES)
+ AVCODEC DEFAULT_MSG AVCODEC_LIBRARIES AVCODEC_INCLUDE_DIRS)
-mark_as_advanced (AVCODEC_LIBRARIES AVCODEC_INCLUDES)
+mark_as_advanced (AVCODEC_LIBRARIES AVCODEC_INCLUDE_DIRS)
diff --git a/soxr/cmake/Modules/FindLibAVUtil.cmake b/soxr/cmake/Modules/FindLibAVUtil.cmake
new file mode 100644
index 0000000..464e6cf
--- /dev/null
+++ b/soxr/cmake/Modules/FindLibAVUtil.cmake
@@ -0,0 +1,23 @@
+# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Find AVUTIL
+# Find the installation of this package: includes and libraries.
+#
+# AVUTIL_INCLUDE_DIRS - where to find headers for this package.
+# AVUTIL_LIBRARIES - libraries to link to when using this package.
+# AVUTIL_FOUND - true iff this package can be found.
+
+if (AVUTIL_INCLUDE_DIRS)
+ set (AVUTIL_FIND_QUIETLY TRUE)
+endif ()
+
+find_path (AVUTIL_INCLUDE_DIRS libavutil/cpu.h)
+
+find_library (AVUTIL_LIBRARIES NAMES avutil)
+
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (
+ AVUTIL DEFAULT_MSG AVUTIL_LIBRARIES AVUTIL_INCLUDE_DIRS)
+
+mark_as_advanced (AVUTIL_LIBRARIES AVUTIL_INCLUDE_DIRS)
diff --git a/soxr/cmake/Modules/FindOpenMP.cmake b/soxr/cmake/Modules/FindOpenMP.cmake
deleted file mode 100644
index eef8422..0000000
--- a/soxr/cmake/Modules/FindOpenMP.cmake
+++ /dev/null
@@ -1,115 +0,0 @@
-# - Finds OpenMP support
-# This module can be used to detect OpenMP support in a compiler.
-# If the compiler supports OpenMP, the flags required to compile with
-# openmp support are set.
-#
-# The following variables are set:
-# OpenMP_C_FLAGS - flags to add to the C compiler for OpenMP support
-# OPENMP_FOUND - true if openmp is detected
-#
-# Supported compilers can be found at http://openmp.org/wp/openmp-compilers/
-#
-# Modifications for soxr:
-# * don't rely on presence of C++ compiler
-# * support MINGW
-#
-#=============================================================================
-# Copyright 2009 Kitware, Inc.
-# Copyright 2008-2009 André Rigland Brodtkorb
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# * The names of Kitware, Inc., the Insight Consortium, or the names of
-# any consortium members, or of any contributors, may not be used to
-# endorse or promote products derived from this software without
-# specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-include (CheckCSourceCompiles)
-include (FindPackageHandleStandardArgs)
-
-set (OpenMP_C_FLAG_CANDIDATES
- #Gnu
- "-fopenmp"
- #Microsoft Visual Studio
- "/openmp"
- #Intel windows
- "-Qopenmp"
- #Intel
- "-openmp"
- #Empty, if compiler automatically accepts openmp
- " "
- #Sun
- "-xopenmp"
- #HP
- "+Oopenmp"
- #IBM XL C/c++
- "-qsmp"
- #Portland Group
- "-mp"
-)
-
-# sample openmp source code to test
-set (OpenMP_C_TEST_SOURCE
-"
-#include <omp.h>
-int main() {
-#ifdef _OPENMP
- return 0;
-#else
- breaks_on_purpose
-#endif
-}
-")
-# if these are set then do not try to find them again,
-# by avoiding any try_compiles for the flags
-if (DEFINED OpenMP_C_FLAGS)
- set (OpenMP_C_FLAG_CANDIDATES)
-endif (DEFINED OpenMP_C_FLAGS)
-
-# check c compiler
-foreach (FLAG ${OpenMP_C_FLAG_CANDIDATES})
- set (SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
- set (CMAKE_REQUIRED_FLAGS "${FLAG}")
- unset (OpenMP_FLAG_DETECTED CACHE)
- message (STATUS "Try OpenMP C flag = [${FLAG}]")
- check_c_source_compiles ("${OpenMP_C_TEST_SOURCE}" OpenMP_FLAG_DETECTED)
- set (CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}")
- if (OpenMP_FLAG_DETECTED)
- set (OpenMP_C_FLAGS_INTERNAL "${FLAG}")
- break ()
- endif (OpenMP_FLAG_DETECTED)
-endforeach (FLAG ${OpenMP_C_FLAG_CANDIDATES})
-
-set (OpenMP_C_FLAGS "${OpenMP_C_FLAGS_INTERNAL}"
- CACHE STRING "C compiler flags for OpenMP parallization")
-
-# handle the standard arguments for find_package
-find_package_handle_standard_args (OpenMP DEFAULT_MSG
- OpenMP_C_FLAGS)
-
-if (MINGW)
- set (OpenMP_SHARED_LINKER_FLAGS "${OpenMP_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
- set (OpenMP_EXE_LINKER_FLAGS "${OpenMP_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
-endif ()
-
-mark_as_advanced (OpenMP_C_FLAGS OpenMP_SHARED_LINKER_FLAGS OpenMP_EXE_LINKER_FLAGS)
diff --git a/soxr/cmake/Modules/FindSIMD.cmake b/soxr/cmake/Modules/FindSIMD.cmake
deleted file mode 100644
index 6ac51cb..0000000
--- a/soxr/cmake/Modules/FindSIMD.cmake
+++ /dev/null
@@ -1,94 +0,0 @@
-# - Finds SIMD support
-#
-# The following variables are set:
-# SIMD_C_FLAGS - flags to add to the C compiler for this package.
-# SIMD_FOUND - true if support for this package is found.
-#
-#=============================================================================
-# Based on FindOpenMP.cmake, which is:
-#
-# Copyright 2009 Kitware, Inc.
-# Copyright 2008-2009 André Rigland Brodtkorb
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are
-# met:
-#
-# * Redistributions of source code must retain the above copyright notice,
-# this list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-# this list of conditions and the following disclaimer in the documentation
-# and/or other materials provided with the distribution.
-#
-# * The names of Kitware, Inc., the Insight Consortium, or the names of
-# any consortium members, or of any contributors, may not be used to
-# endorse or promote products derived from this software without
-# specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS ``AS IS''
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE FOR
-# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-include (CheckCSourceCompiles)
-include (FindPackageHandleStandardArgs)
-
-if (WIN32) # Safety for when mixed lib/app compilers (but performance hit)
- set (GCC_WIN32_SIMD_OPTS "-mincoming-stack-boundary=2")
-endif ()
-
-set (SIMD_C_FLAG_CANDIDATES
- # x64
- " "
- # Microsoft Visual Studio x86
- "/arch:SSE /fp:fast -D__SSE__"
- # Gcc x86
- "-msse -mfpmath=sse ${GCC_WIN32_SIMD_OPTS}"
- # Gcc x86 (old versions)
- "-msse -mfpmath=sse"
-)
-
-set (SIMD_C_TEST_SOURCE
-"
-#include <xmmintrin.h>
-int main()
-{
- __m128 a, b;
- float vals[4] = {0};
- a = _mm_loadu_ps (vals);
- b = a;
- b = _mm_add_ps (a,b);
- _mm_storeu_ps (vals,b);
- return 0;
-}
-")
-
-if (DEFINED SIMD_C_FLAGS)
- set (SIMD_C_FLAG_CANDIDATES)
-endif ()
-
-foreach (FLAG ${SIMD_C_FLAG_CANDIDATES})
- set (SAFE_CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS}")
- set (CMAKE_REQUIRED_FLAGS "${FLAG}")
- unset (SIMD_FLAG_DETECTED CACHE)
- message (STATUS "Try SIMD C flag = [${FLAG}]")
- check_c_source_compiles ("${SIMD_C_TEST_SOURCE}" SIMD_FLAG_DETECTED)
- set (CMAKE_REQUIRED_FLAGS "${SAFE_CMAKE_REQUIRED_FLAGS}")
- if (SIMD_FLAG_DETECTED)
- set (SIMD_C_FLAGS_INTERNAL "${FLAG}")
- break ()
- endif ()
-endforeach ()
-
-set (SIMD_C_FLAGS "${SIMD_C_FLAGS_INTERNAL}"
- CACHE STRING "C compiler flags for SIMD vectorization")
-
-find_package_handle_standard_args (SIMD DEFAULT_MSG SIMD_C_FLAGS SIMD_C_FLAGS)
-mark_as_advanced (SIMD_C_FLAGS)
diff --git a/soxr/cmake/Modules/FindSIMD32.cmake b/soxr/cmake/Modules/FindSIMD32.cmake
new file mode 100644
index 0000000..9e42373
--- /dev/null
+++ b/soxr/cmake/Modules/FindSIMD32.cmake
@@ -0,0 +1,54 @@
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Finds SIMD32 support
+#
+# The following variables are set:
+# SIMD32_C_FLAGS - flags to add to the C compiler for this package.
+# SIMD32_FOUND - true if support for this package is found.
+
+if (DEFINED SIMD32_C_FLAGS)
+ set (TRIAL_C_FLAGS)
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
+ set (TRIAL_C_FLAGS
+ # Gcc
+ "-mfpu=neon-vfpv4 -mcpu=cortex-a7"
+ "-mfpu=neon -mfloat-abi=hard"
+ "-mfpu=neon -mfloat-abi=softfp"
+ "-mfpu=neon -mfloat-abi=soft"
+ )
+ set (TEST_C_SOURCE "
+ #include <arm_neon.h>
+ int main(int c, char * * v) {
+ float32x4_t a = vdupq_n_f32((float)c), b = vdupq_n_f32((float)!!v);
+ return !vgetq_lane_u32(vceqq_f32(a,b),0);
+ }
+ ")
+else ()
+ if (WIN32) # Safety for when mixed lib/app compilers (but performance hit)
+ set (GCC_WIN32_SIMD32_OPTS "-mincoming-stack-boundary=2")
+ endif ()
+
+ set (TRIAL_C_FLAGS
+ # x64
+ " "
+ # MSVC x86
+ "/arch:SSE /fp:fast -D__SSE__"
+ # Gcc x86
+ "-msse -mfpmath=sse ${GCC_WIN32_SIMD32_OPTS}"
+ # Gcc x86 (old versions)
+ "-msse -mfpmath=sse"
+ )
+ set (TEST_C_SOURCE "
+ #include <xmmintrin.h>
+ int main(int c, char * * v) {
+ __m128 a = _mm_set_ss((float)c), b = _mm_set_ss((float)!!v);
+ return _mm_comineq_ss(a,b);
+ }
+ ")
+endif ()
+
+include (FindCFlags)
+
+FindCFlags ("SIMD32" "FLOAT-32 (single-precision) SIMD vectorization"
+ "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}")
diff --git a/soxr/cmake/Modules/FindSIMD64.cmake b/soxr/cmake/Modules/FindSIMD64.cmake
new file mode 100644
index 0000000..d412644
--- /dev/null
+++ b/soxr/cmake/Modules/FindSIMD64.cmake
@@ -0,0 +1,29 @@
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Finds SIMD64 support
+#
+# The following variables are set:
+# SIMD64_C_FLAGS - flags to add to the C compiler for this package.
+# SIMD64_FOUND - true if support for this package is found.
+
+if (DEFINED SIMD64_C_FLAGS OR CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
+ set (TRIAL_C_FLAGS)
+else ()
+ set (TRIAL_C_FLAGS
+ "-mavx" # Gcc
+ "/arch:AVX" # MSVC
+ )
+ set (TEST_C_SOURCE "
+ #ifndef __AVX__
+ #error
+ #endif
+ #include <immintrin.h>
+ int main() {return 0;}
+ ")
+endif ()
+
+include (FindCFlags)
+
+FindCFlags ("SIMD64" "FLOAT-64 (double-precision) SIMD vectorization"
+ "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}")
diff --git a/soxr/cmake/Modules/SetSystemProcessor.cmake b/soxr/cmake/Modules/SetSystemProcessor.cmake
new file mode 100644
index 0000000..8e2c292
--- /dev/null
+++ b/soxr/cmake/Modules/SetSystemProcessor.cmake
@@ -0,0 +1,37 @@
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# Sets CMAKE_SYSTEM_PROCESSOR for cross-compiling.
+
+macro (set_system_processor)
+ if (CMAKE_CROSSCOMPILING)
+ if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "unknown")
+ unset(CMAKE_SYSTEM_PROCESSOR)
+ endif ()
+ if (NOT DEFINED CMAKE_SYSTEM_PROCESSOR)
+ include (CheckCSourceCompiles)
+ set (CPU_LINES
+ "#if defined __x86_64__ || defined _M_X64 /*\;x86_64\;*/"
+ "#if defined __i386__ || defined _M_IX86 /*\;x86_32\;*/"
+ "#if defined __arm__ || defined _M_ARM /*\;arm\;*/"
+ )
+ foreach (CPU_LINE ${CPU_LINES})
+ string (CONCAT CPU_SOURCE "${CPU_LINE}" "
+ int main() {return 0;}
+ #endif
+ ")
+ unset (SYSTEM_PROCESSOR_DETECTED CACHE)
+ check_c_source_compiles ("${CPU_SOURCE}" SYSTEM_PROCESSOR_DETECTED)
+ if (SYSTEM_PROCESSOR_DETECTED)
+ list (GET CPU_LINE 1 CMAKE_SYSTEM_PROCESSOR)
+ message (STATUS "CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
+ break ()
+ endif ()
+ endforeach ()
+ endif ()
+
+ # N.B. Will not overwrite existing cache variable:
+ set (CMAKE_SYSTEM_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}"
+ CACHE STRING "Target system processor")
+ endif ()
+endmacro ()
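
The CPU_LINES idiom above merits a note: each entry hides the processor name
inside a C comment between escaped semicolons, so the matching CPU_LINE is
itself a three-element CMake list and list (GET CPU_LINE 1 ...) extracts the
name. For the first entry, the probe source handed to check_c_source_compiles
is simply:

  #if defined __x86_64__ || defined _M_X64 /*;x86_64;*/
  int main() {return 0;}
  #endif

This compiles only when the (cross-)compiler targets x86_64, so that is the
name recorded in CMAKE_SYSTEM_PROCESSOR.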
diff --git a/soxr/cmake/Modules/TestBigEndian.cmake b/soxr/cmake/Modules/TestBigEndian.cmake
deleted file mode 100644
index d80df20..0000000
--- a/soxr/cmake/Modules/TestBigEndian.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
-# Licence for this file: LGPL v2.1 See LICENCE for details.
-
-# - Macro to determine endian type
-# test_big_endian (VARIABLE)
-# VARIABLE - variable to store the result to
-
-macro (test_big_endian VARIABLE)
- if ("${HAVE_${VARIABLE}}" MATCHES "^${HAVE_${VARIABLE}}$")
- include (CheckCSourceRuns)
- check_c_source_runs ("int main() {union {long i; char c[sizeof(long)];}
- const u = {1}; return !!u.c[0];}" HAVE_${VARIABLE})
- set (${VARIABLE} "${HAVE_${VARIABLE}}" CACHE INTERNAL "1 if system is big endian" FORCE)
- endif ()
-endmacro ()
diff --git a/soxr/dist b/soxr/dist
new file mode 100644
index 0000000..ee68b30
--- /dev/null
+++ b/soxr/dist
@@ -0,0 +1,12 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# Makes the distribution tarball
+
+test $# = 1 -o `git status -s|wc -c` = 0
+rm -rf Release
+./go -j4
+cd Release
+make package_source
diff --git a/soxr/examples/1-single-block.c b/soxr/examples/1-single-block.c
index 3fb9201..3b919b4 100644
--- a/soxr/examples/1-single-block.c
+++ b/soxr/examples/1-single-block.c
@@ -25,7 +25,7 @@ const float in[] = { /* Input: 12 cycles of a sine wave with freq. = irate/4 */
int main(int argc, char const * arg[])
{
- double irate = argc > 1? atof(arg[1]) : 1; /* Default to upsampling */
+ double irate = argc > 1? atof(arg[1]) : 1; /* Default to interpolation */
double orate = argc > 2? atof(arg[2]) : 2; /* by a factor of 2. */
size_t olen = (size_t)(AL(in) * orate / irate + .5); /* Assay output len. */
diff --git a/soxr/examples/1a-lsr.c b/soxr/examples/1a-lsr.c
index e42e530..6b50a8f 100644
--- a/soxr/examples/1a-lsr.c
+++ b/soxr/examples/1a-lsr.c
@@ -12,7 +12,7 @@ float in[] = { /* Input: 12 cycles of a sine wave with freq. = irate/4 */
int main(int argc, char const * arg[])
{
- double irate = argc > 1? atof(arg[1]) : 1; /* Default to upsampling */
+ double irate = argc > 1? atof(arg[1]) : 1; /* Default to interpolation */
double orate = argc > 2? atof(arg[2]) : 2; /* by a factor of 2. */
size_t olen = (size_t)(AL(in) * orate / irate + .5); /* Assay output len. */
diff --git a/soxr/examples/3-options-input-fn.c b/soxr/examples/3-options-input-fn.c
index 38fbb0d..afd43b9 100644
--- a/soxr/examples/3-options-input-fn.c
+++ b/soxr/examples/3-options-input-fn.c
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
/* Example 3: extends example 2 with multiple channels, multiple datatypes,
@@ -14,7 +14,7 @@
* OUTPUT-RATE Ditto
* NUM-CHANNELS Number of interleaved channels
* IN-DATATYPE# 0:float32 1:float64 2:int32 3:int16
- * OUT-DATATYPE# Ditto
+ * OUT-DATATYPE# Ditto; or 11 for un-dithered int16
* Q-RECIPE Quality recipe (in hex) See soxr.h
* Q-FLAGS Quality flags (in hex) See soxr.h
* PASSBAND-END %
@@ -42,7 +42,7 @@ static size_t input_fn(input_context_t * p, soxr_cbuf_t * buf, size_t len)
int main(int n, char const * arg[])
{
- char const * const arg0 = n? --n, *arg++ : "";
+ char const * const arg0 = n? --n, *arg++ : "", * engine = "";
double const irate = n? --n, atof(*arg++) : 96000.;
double const orate = n? --n, atof(*arg++) : 44100.;
unsigned const chans = n? --n, (unsigned)atoi(*arg++) : 1;
@@ -94,6 +94,7 @@ int main(int n, char const * arg[])
}
if (!error) { /* If all is well, run the resampler: */
+ engine = soxr_engine(soxr);
USE_STD_STDIO;
/* Resample in blocks: */
do odone = soxr_output(soxr, obuf, olen);
@@ -106,8 +107,8 @@ int main(int n, char const * arg[])
soxr_delete(soxr);
free(obuf), free(ibuf);
/* Diagnostics: */
- fprintf(stderr, "%-26s %s; %lu clips; I/O: %s\n",
+ fprintf(stderr, "%-26s %s; %lu clips; I/O: %s (%s)\n",
arg0, soxr_strerror(error), (long unsigned)clips,
- ferror(stdin) || ferror(stdout)? strerror(errno) : "no error");
+ ferror(stdin) || ferror(stdout)? strerror(errno) : "no error", engine);
return !!error;
}
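
INSTALL notes that the engine selected for a given configuration can be
checked with example #3; the hunk above is where that report is wired in,
via soxr_engine(). A standalone C sketch of the same query — the rates and
channel count are arbitrary, and the printed name depends on the build and
on run-time CPU detection:

  #include <stdio.h>
  #include <soxr.h>

  int main(void)
  {
    soxr_error_t error;
    soxr_t soxr = soxr_create(96000, 44100, 2, &error, NULL, NULL, NULL);
    if (!error) {
      printf("engine: %s\n", soxr_engine(soxr)); /* selected engine's name */
      soxr_delete(soxr);
    }
    return !!error;
  }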
diff --git a/soxr/examples/4-split-channels.c b/soxr/examples/4-split-channels.c
index d6448aa..a9022ce 100644
--- a/soxr/examples/4-split-channels.c
+++ b/soxr/examples/4-split-channels.c
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
/* Example 4: variant of examples 2 & 3, demonstrating I/O with split channels.
@@ -13,6 +13,8 @@
*
* Note also (not shown in the examples) that split/interleaved channels may
* be used for input and output independently.
+ *
+ * Arguments are as example 3.
*/
#include
@@ -73,13 +75,17 @@ int main(int n, char const * arg[])
double const orate = n? --n, atof(*arg++) : 44100.;
unsigned const chans = n? --n, (unsigned)atoi(*arg++) : 1;
soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
- soxr_datatype_t const otype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+ unsigned const ospec = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ;
unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0;
+ double const passband_end = n? --n, atof(*arg++) : 0;
+ double const stopband_begin = n? --n, atof(*arg++) : 0;
+ double const phase_response = n? --n, atof(*arg++) : -1;
int const use_threads = n? --n, atoi(*arg++) : 1;
+ soxr_datatype_t const otype = ospec & 3;
- soxr_quality_spec_t const q_spec = soxr_quality_spec(q_recipe, q_flags);
- soxr_io_spec_t const io_spec=soxr_io_spec(itype|SOXR_SPLIT, otype|SOXR_SPLIT);
+ soxr_quality_spec_t q_spec = soxr_quality_spec(q_recipe, q_flags);
+ soxr_io_spec_t io_spec=soxr_io_spec(itype|SOXR_SPLIT, otype|SOXR_SPLIT);
soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads);
/* Allocate resampling input and output buffers in proportion to the input
@@ -102,11 +108,18 @@ int main(int n, char const * arg[])
size_t odone, written, need_input = 1, clips = 0;
soxr_error_t error;
+ soxr_t soxr;
+ unsigned i;
- soxr_t soxr = soxr_create(
+ /* Overrides (if given): */
+ if (passband_end > 0) q_spec.passband_end = passband_end / 100;
+ if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100;
+ if (phase_response >=0) q_spec.phase_response = phase_response;
+ io_spec.flags = ospec & ~7u;
+
+ soxr = soxr_create(
irate, orate, chans, &error, &io_spec, &q_spec, &runtime_spec);
- unsigned i;
for (i = 0; i < chans; ++i) {
ibuf_ptrs[i] = iptr;
obuf_ptrs[i] = optr;
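
The ospec decode added above is what makes the `11 for un-dithered int16'
note in example 3's usage text work: the low two bits select the output
datatype and the remaining bits pass straight through as io-spec flags. A C
sketch of the arithmetic — the presumption that 8 is soxr.h's no-dither flag
fits the `11' note but should be verified against the header:

  #include <stdio.h>

  int main(void)
  {
    unsigned const ospec = 11;          /* OUT-DATATYPE# argument */
    unsigned const otype = ospec & 3;   /* 3: int16, per example 3's table */
    unsigned const flags = ospec & ~7u; /* 8: presumed no-dither flag */
    printf("otype=%u flags=%#x\n", otype, flags);
    return 0;
  }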
diff --git a/soxr/examples/CMakeLists.txt b/soxr/examples/CMakeLists.txt
index 8107a4e..c8c17c9 100644
--- a/soxr/examples/CMakeLists.txt
+++ b/soxr/examples/CMakeLists.txt
@@ -1,25 +1,23 @@
-# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
-if (${BUILD_EXAMPLES})
- project (soxr) # Adds c++ compiler
- file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/[1-9]-*.[cC])
-elseif (${BUILD_TESTS})
- file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/3*.c)
-endif ()
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PROJECT_CXX_FLAGS}")
+link_libraries (${PROJECT_NAME} ${LIBM_LIBRARIES})
if (${BUILD_EXAMPLES} OR ${BUILD_TESTS})
+ set (SOURCES 3-options-input-fn)
if (${WITH_LSR_BINDINGS})
- set (LSR_SOURCES 1a-lsr.c)
+ set (LSR_SOURCES 1a-lsr)
endif ()
endif ()
-if (NOT BUILD_SHARED_LIBS AND OPENMP_FOUND)
- set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_C_FLAGS}")
+if (${BUILD_EXAMPLES})
+ list (APPEND SOURCES 1-single-block 2-stream 4-split-channels)
+ if (${WITH_VR32})
+ list (APPEND SOURCES 5-variable-rate)
+ endif ()
endif ()
-set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}")
-set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PROJECT_CXX_FLAGS}")
-link_libraries (soxr)
foreach (fe ${SOURCES} ${LSR_SOURCES})
get_filename_component (f ${fe} NAME_WE)
@@ -34,4 +32,5 @@ if (${BUILD_TESTS} AND ${WITH_LSR_BINDINGS})
endif ()
file (GLOB INSTALL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.[cCh])
-install (FILES ${INSTALL_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/README DESTINATION ${DOC_INSTALL_DIR}/examples)
+install (FILES ${INSTALL_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/README
+ DESTINATION ${DOC_INSTALL_DIR}/examples)
diff --git a/soxr/examples/examples-common.h b/soxr/examples/examples-common.h
index 585fac3..fc8ed82 100644
--- a/soxr/examples/examples-common.h
+++ b/soxr/examples/examples-common.h
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
/* Common includes etc. for the examples. */
@@ -17,10 +17,7 @@
#include <io.h>
#include <fcntl.h>
#define USE_STD_STDIO _setmode(_fileno(stdout), _O_BINARY), \
- _setmode(_fileno(stdin ), _O_BINARY);
- /* Sometimes missing, so ensure that it is defined: */
- #undef M_PI
- #define M_PI 3.14159265358979323846
+ _setmode(_fileno(stdin ), _O_BINARY)
#else
#define USE_STD_STDIO
#endif
@@ -38,8 +35,13 @@
#endif
#undef min
-#undef max
#define min(x,y) ((x)<(y)?(x):(y))
+
+#undef max
#define max(x,y) ((x)>(y)?(x):(y))
+#undef AL
#define AL(a) (sizeof(a)/sizeof((a)[0])) /* Array Length */
+
+#undef M_PI /* Sometimes missing, so ensure that it is defined: */
+#define M_PI 3.14159265358979323846
diff --git a/soxr/go b/soxr/go
new file mode 100644
index 0000000..7fba810
--- /dev/null
+++ b/soxr/go
@@ -0,0 +1,18 @@
+#!/bin/sh
+set -e
+
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+case "$1" in -j*) j="$1"; shift;; esac # Support -jX for parallel build/test
+
+test x"$1" = x && build=Release || build="$1"
+
+rm -f CMakeCache.txt # Prevent interference from any in-tree build
+
+mkdir -p "$build"
+cd "$build"
+
+cmake -Wno-dev -DCMAKE_BUILD_TYPE="$build" ..
+make $j
+ctest $j || echo "FAILURE details in $build/Testing/Temporary/LastTest.log"
diff --git a/soxr/go.bat b/soxr/go.bat
new file mode 100644
index 0000000..aabff75
--- /dev/null
+++ b/soxr/go.bat
@@ -0,0 +1,27 @@
+@echo off
+rem SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+rem Licence for this file: LGPL v2.1 See LICENCE for details.
+
+set build=%1
+if x%build% == x set build=Release
+
+rem Prevent interference from any in-tree build
+del/f CMakeCache.txt
+
+mkdir %build%
+cd %build%
+
+cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=%build% -Wno-dev ..
+if errorlevel 1 goto end
+
+nmake
+if errorlevel 1 goto end
+
+nmake test
+if errorlevel 1 goto error
+goto end
+
+:error
+echo FAILURE details in Testing\Temporary\LastTest.log
+
+:end
diff --git a/soxr/inst-check b/soxr/inst-check
new file mode 100644
index 0000000..8cf64b7
--- /dev/null
+++ b/soxr/inst-check
@@ -0,0 +1,25 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# Sanity-check of library installed on unix-like system
+
+# This script checks the installation of the entire library (including lsr).
+#
+# Distros using three separate packages can do the following (in order):
+#
+# * Install soxr pkg (i.e. basically, just the shared object)
+# * ./inst-check-soxr
+# * Install soxr-lsr pkg (i.e. basically, just the shared object)
+# * ./inst-check-soxr-lsr
+# * Install the -dev pkg (i.e. examples, headers, & pkg-config)
+# * ./inst-check PATH-OF-INSTALLED-EXAMPLES-DIR (e.g. /usr/share/doc/libsoxr/examples)
+
+# Where are the example source files:
+src=$1
+test x$src = x && src=/usr/local/share/doc/libsoxr/examples
+
+dir="$(dirname $(readlink -f $0))"
+$dir/inst-check-soxr $src
+$dir/inst-check-soxr-lsr $src
diff --git a/soxr/inst-check-soxr b/soxr/inst-check-soxr
new file mode 100644
index 0000000..5f923b8
--- /dev/null
+++ b/soxr/inst-check-soxr
@@ -0,0 +1,52 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# Sanity-check of sub-library installed on unix-like system
+
+arg="$1" # path to installed examples (if dev pkg installed); otherwise omitted
+dir="$(dirname $(readlink -f $0))"
+
+# Find the examples:
+src="$arg"
+test x"$src" = x && src="$dir/examples"
+cd $src
+
+# Somewhere to put the binaries:
+tmp=`mktemp -d`
+
+build_examples() {
+ if [ x"$arg" = x ]; then
+ echo "Examples in `pwd`; using local headers:" # for when dev pkg not installed
+ libs=-l$1
+ cflags=-I$dir/src
+ else
+ echo "Examples in `pwd`; using pkg-config:"
+ libs=$(pkg-config --libs $1)
+ cflags=$(pkg-config --cflags $1)
+ fi
+ for f in ?$2-*.[cC]; do
+ cc=cc; echo $f | grep -q C$ && cc=c++
+ out=$tmp/`echo $f | sed "s/.[cC]$//"`
+ cmd="$cc $cflags -o $out $f $libs -lm"
+ echo $cmd; $cmd
+ done
+}
+
+# Determine library:
+if [ `basename $0` = inst-check-soxr ]; then
+ build_examples soxr
+ gen="dd if=/dev/urandom count=1000"
+ $tmp/1-single-block 1 2 .
+ $gen 2> /dev/null | $tmp/2-stream 2>&1 >$tmp/stdout
+ $gen 2> /dev/null | $tmp/3-options-input-fn 6 7 2 2 0 2>&1 >$tmp/stdout
+ $gen 2> /dev/null | $tmp/4-split-channels 7 6 2 2 3 2>&1 >$tmp/stdout # Clipping expected here
+ $gen 2> /dev/null | $tmp/5-variable-rate 2>&1 >$tmp/stdout
+else
+ build_examples soxr-lsr a # lsr has 'a' suffix on example number.
+ $tmp/1a-lsr 1 2 .
+fi
+
+# Tidy up:
+rm -rf $tmp
diff --git a/soxr/inst-check-soxr-lsr b/soxr/inst-check-soxr-lsr
new file mode 100644
index 0000000..5f923b8
--- /dev/null
+++ b/soxr/inst-check-soxr-lsr
@@ -0,0 +1,52 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# Sanity-check of sub-library installed on unix-like system
+
+arg="$1" # path to installed examples (if dev pkg installed); otherwise omitted
+dir="$(dirname $(readlink -f $0))"
+
+# Find the examples:
+src="$arg"
+test x"$src" = x && src="$dir/examples"
+cd $src
+
+# Somewhere to put the binaries:
+tmp=`mktemp -d`
+
+build_examples() {
+ if [ x"$arg" = x ]; then
+ echo "Examples in `pwd`; using local headers:" # for when dev pkg not installed
+ libs=-l$1
+ cflags=-I$dir/src
+ else
+ echo "Examples in `pwd`; using pkg-config:"
+ libs=$(pkg-config --libs $1)
+ cflags=$(pkg-config --cflags $1)
+ fi
+ for f in ?$2-*.[cC]; do
+ cc=cc; echo $f | grep -q C$ && cc=c++
+ out=$tmp/`echo $f | sed "s/.[cC]$//"`
+ cmd="$cc $cflags -o $out $f $libs -lm"
+ echo $cmd; $cmd
+ done
+}
+
+# Determine library:
+if [ `basename $0` = inst-check-soxr ]; then
+ build_examples soxr
+ gen="dd if=/dev/urandom count=1000"
+ $tmp/1-single-block 1 2 .
+ $gen 2> /dev/null | $tmp/2-stream 2>&1 >$tmp/stdout
+ $gen 2> /dev/null | $tmp/3-options-input-fn 6 7 2 2 0 2>&1 >$tmp/stdout
+ $gen 2> /dev/null | $tmp/4-split-channels 7 6 2 2 3 2>&1 >$tmp/stdout # Clipping expected here
+ $gen 2> /dev/null | $tmp/5-variable-rate 2>&1 >$tmp/stdout
+else
+ build_examples soxr-lsr a # lsr has 'a' suffix on example number.
+ $tmp/1a-lsr 1 2 .
+fi
+
+# Tidy up:
+rm -rf $tmp
diff --git a/soxr/lsr-tests/CMakeLists.txt b/soxr/lsr-tests/CMakeLists.txt
new file mode 100644
index 0000000..4f718f7
--- /dev/null
+++ b/soxr/lsr-tests/CMakeLists.txt
@@ -0,0 +1,50 @@
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+list (APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules)
+
+find_package (FFTW)
+if (FFTW_FOUND)
+ include_directories (${FFTW_INCLUDE_DIRS})
+ link_libraries (${FFTW_LIBRARIES})
+ set (HAVE_FFTW3 1)
+endif ()
+
+find_package (sndfile)
+if (SNDFILE_FOUND)
+ include_directories (${SNDFILE_INCLUDE_DIRS})
+ link_libraries (${SNDFILE_LIBRARIES})
+ set (HAVE_SNDFILE 1)
+endif ()
+
+check_function_exists (lrintf HAVE_LRINTF)
+check_function_exists (alarm HAVE_ALARM)
+check_function_exists (signal HAVE_SIGNAL)
+check_include_files (sys/times.h HAVE_SYS_TIMES_H)
+
+configure_file (${CMAKE_CURRENT_SOURCE_DIR}/config.h.in ${CMAKE_CURRENT_BINARY_DIR}/config.h)
+include_directories (${CMAKE_CURRENT_BINARY_DIR})
+
+add_library (tests_lib STATIC util calc_snr)
+
+link_libraries (tests_lib ${PROJECT_NAME}-lsr ${LIBM_LIBRARIES})
+
+enable_testing ()
+
+set (tests
+ callback_hang_test callback_test downsample_test
+ float_short_test misc_test multi_channel_test
+ reset_test simple_test termination_test varispeed_test)
+if (WITH_CR64 OR WITH_CR64S)
+ set (tests ${tests} snr_bw_test)
+endif ()
+
+foreach (test ${tests})
+ add_executable (${test} ${test})
+ add_test (lsr-${test} ${BIN}${test})
+ set_property (TEST lsr-${test} PROPERTY ENVIRONMENT "SOXR_LSR_STRICT=1")
+endforeach ()
+
+add_executable (multichan_throughput_test multichan_throughput_test)
+add_executable (throughput_test throughput_test )
+add_executable (sndfile-resample sndfile-resample)
diff --git a/soxr/lsr-tests/COPYING b/soxr/lsr-tests/COPYING
new file mode 100644
index 0000000..d60c31a
--- /dev/null
+++ b/soxr/lsr-tests/COPYING
@@ -0,0 +1,340 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Library General
+Public License instead of this License.
diff --git a/soxr/lsr-tests/README b/soxr/lsr-tests/README
new file mode 100644
index 0000000..f468446
--- /dev/null
+++ b/soxr/lsr-tests/README
@@ -0,0 +1,8 @@
+The C source and header files in this directory have been copied from
+the `libsamplerate' project and are copyrighted by its authors -- see
+the notices within the files and the file `COPYING' for details.
+
+They are used here to test libsoxr's optional libsamplerate-like
+wrapper. The only modifications made are to the file `snr_bw_test.c' to
+remove reliance on certain frequency response troughs that are specific
+to libsamplerate.
diff --git a/soxr/lsr-tests/calc_snr.c b/soxr/lsr-tests/calc_snr.c
new file mode 100644
index 0000000..ddfc04c
--- /dev/null
+++ b/soxr/lsr-tests/calc_snr.c
@@ -0,0 +1,242 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include "util.h"
+
+#if (HAVE_FFTW3 == 1)
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <fftw3.h>
+
+#define MAX_SPEC_LEN (1<<18)
+#define MAX_PEAKS 10
+
+static void log_mag_spectrum (double *input, int len, double *magnitude) ;
+static void smooth_mag_spectrum (double *magnitude, int len) ;
+static double find_snr (const double *magnitude, int len, int expected_peaks) ;
+
+typedef struct
+{ double peak ;
+ int index ;
+} PEAK_DATA ;
+
+double
+calculate_snr (float *data, int len, int expected_peaks)
+{ static double magnitude [MAX_SPEC_LEN] ;
+ static double datacopy [MAX_SPEC_LEN] ;
+
+ double snr = 200.0 ;
+ int k ;
+
+ if (len > MAX_SPEC_LEN)
+ { printf ("%s : line %d : data length too large.\n", __FILE__, __LINE__) ;
+ exit (1) ;
+ } ;
+
+ for (k = 0 ; k < len ; k++)
+ datacopy [k] = data [k] ;
+
+ /* Pad the data just a little to speed up the FFT. */
+ while ((len & 0x1F) && len < MAX_SPEC_LEN)
+ { datacopy [len] = 0.0 ;
+ len ++ ;
+ } ;
+
+ log_mag_spectrum (datacopy, len, magnitude) ;
+ smooth_mag_spectrum (magnitude, len / 2) ;
+
+ snr = find_snr (magnitude, len, expected_peaks) ;
+
+ return snr ;
+} /* calculate_snr */
+
+/*==============================================================================
+** There is a slight problem with trying to measure SNR with the method used
+** here; the side lobes of the windowed FFT can look like a noise/aliasing peak.
+** The solution is to smooth the magnitude spectrum by wiping out troughs
+** between adjacent peaks as done here.
+** This removes side lobe peaks without affecting noise/aliasing peaks.
+*/
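+/* Concretely: once a larger and a smaller adjacent peak have been found,
+** linear_smooth() walks from the smaller peak towards the larger one and
+** replaces every sample that is lower than its predecessor with 0.999
+** times that predecessor, filling in the trough between the two peaks.
+*/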
+
+static void linear_smooth (double *mag, PEAK_DATA *larger, PEAK_DATA *smaller) ;
+
+static void
+smooth_mag_spectrum (double *mag, int len)
+{ PEAK_DATA peaks [2] ;
+
+ int k ;
+
+ memset (peaks, 0, sizeof (peaks)) ;
+
+ /* Find first peak. */
+ for (k = 1 ; k < len - 1 ; k++)
+ { if (mag [k - 1] < mag [k] && mag [k] >= mag [k + 1])
+ { peaks [0].peak = mag [k] ;
+ peaks [0].index = k ;
+ break ;
+ } ;
+ } ;
+
+	/* Find subsequent peaks and smooth between peaks. */
+ for (k = peaks [0].index + 1 ; k < len - 1 ; k++)
+ { if (mag [k - 1] < mag [k] && mag [k] >= mag [k + 1])
+ { peaks [1].peak = mag [k] ;
+ peaks [1].index = k ;
+
+ if (peaks [1].peak > peaks [0].peak)
+ linear_smooth (mag, &peaks [1], &peaks [0]) ;
+ else
+ linear_smooth (mag, &peaks [0], &peaks [1]) ;
+ peaks [0] = peaks [1] ;
+ } ;
+ } ;
+
+} /* smooth_mag_spectrum */
+
+static void
+linear_smooth (double *mag, PEAK_DATA *larger, PEAK_DATA *smaller)
+{ int k ;
+
+ if (smaller->index < larger->index)
+ { for (k = smaller->index + 1 ; k < larger->index ; k++)
+ mag [k] = (mag [k] < mag [k - 1]) ? 0.999 * mag [k - 1] : mag [k] ;
+ }
+ else
+ { for (k = smaller->index - 1 ; k >= larger->index ; k--)
+ mag [k] = (mag [k] < mag [k + 1]) ? 0.999 * mag [k + 1] : mag [k] ;
+ } ;
+
+} /* linear_smooth */
+
+/*==============================================================================
+*/
+
+static int
+peak_compare (const void *vp1, const void *vp2)
+{ const PEAK_DATA *peak1, *peak2 ;
+
+ peak1 = (const PEAK_DATA*) vp1 ;
+ peak2 = (const PEAK_DATA*) vp2 ;
+
+ return (peak1->peak < peak2->peak) ? 1 : -1 ;
+} /* peak_compare */
+
+static double
+find_snr (const double *magnitude, int len, int expected_peaks)
+{ PEAK_DATA peaks [MAX_PEAKS] ;
+
+ int k, peak_count = 0 ;
+ double snr ;
+
+ memset (peaks, 0, sizeof (peaks)) ;
+
+ /* Find the MAX_PEAKS largest peaks. */
+ for (k = 1 ; k < len - 1 ; k++)
+ { if (magnitude [k - 1] < magnitude [k] && magnitude [k] >= magnitude [k + 1])
+ { if (peak_count < MAX_PEAKS)
+ { peaks [peak_count].peak = magnitude [k] ;
+ peaks [peak_count].index = k ;
+ peak_count ++ ;
+ qsort (peaks, peak_count, sizeof (PEAK_DATA), peak_compare) ;
+ }
+ else if (magnitude [k] > peaks [MAX_PEAKS - 1].peak)
+ { peaks [MAX_PEAKS - 1].peak = magnitude [k] ;
+ peaks [MAX_PEAKS - 1].index = k ;
+ qsort (peaks, MAX_PEAKS, sizeof (PEAK_DATA), peak_compare) ;
+ } ;
+ } ;
+ } ;
+
+ if (peak_count < expected_peaks)
+ { printf ("\n%s : line %d : bad peak_count (%d), expected %d.\n\n", __FILE__, __LINE__, peak_count, expected_peaks) ;
+ return -1.0 ;
+ } ;
+
+ /* Sort the peaks. */
+ qsort (peaks, peak_count, sizeof (PEAK_DATA), peak_compare) ;
+
+ snr = peaks [0].peak ;
+ for (k = 1 ; k < peak_count ; k++)
+ if (fabs (snr - peaks [k].peak) > 10.0)
+ return fabs (peaks [k].peak) ;
+
+ return snr ;
+} /* find_snr */
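+/* Note: log_mag_spectrum() below normalises the spectrum so that its
+** largest peak sits at 0 dB; the value returned by find_snr() is therefore
+** the absolute level, in dB, of the strongest peak lying more than 10 dB
+** below the signal peak, i.e. the worst alias/noise component.
+*/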
+
+static void
+log_mag_spectrum (double *input, int len, double *magnitude)
+{ fftw_plan plan = NULL ;
+
+ double maxval ;
+ int k ;
+
+ if (input == NULL || magnitude == NULL)
+ return ;
+
+ plan = fftw_plan_r2r_1d (len, input, magnitude, FFTW_R2HC, FFTW_ESTIMATE | FFTW_PRESERVE_INPUT) ;
+ if (plan == NULL)
+ { printf ("%s : line %d : create plan failed.\n", __FILE__, __LINE__) ;
+ exit (1) ;
+ } ;
+
+ fftw_execute (plan) ;
+
+ fftw_destroy_plan (plan) ;
+
+ /* (k < N/2 rounded up) */
+ maxval = 0.0 ;
+ for (k = 1 ; k < len / 2 ; k++)
+ { magnitude [k] = sqrt (magnitude [k] * magnitude [k] + magnitude [len - k - 1] * magnitude [len - k - 1]) ;
+ maxval = (maxval < magnitude [k]) ? magnitude [k] : maxval ;
+ } ;
+
+ memset (magnitude + len / 2, 0, len / 2 * sizeof (magnitude [0])) ;
+
+ /* Don't care about DC component. Make it zero. */
+ magnitude [0] = 0.0 ;
+
+ /* log magnitude. */
+ for (k = 0 ; k < len ; k++)
+ { magnitude [k] = magnitude [k] / maxval ;
+ magnitude [k] = (magnitude [k] < 1e-15) ? -200.0 : 20.0 * log10 (magnitude [k]) ;
+ } ;
+
+ return ;
+} /* log_mag_spectrum */
+
+#else /* ! (HAVE_FFTW3 == 1) */
+
+double
+calculate_snr (float *data, int len, int expected_peaks)
+{ double snr = 200.0 ;
+
+ data = data ;
+ len = len ;
+ expected_peaks = expected_peaks ;
+
+ return snr ;
+} /* calculate_snr */
+
+#endif
+
diff --git a/soxr/lsr-tests/callback_hang_test.c b/soxr/lsr-tests/callback_hang_test.c
new file mode 100644
index 0000000..be89369
--- /dev/null
+++ b/soxr/lsr-tests/callback_hang_test.c
@@ -0,0 +1,131 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#if HAVE_ALARM && HAVE_SIGNAL && HAVE_SIGALRM
+
+#include <signal.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define SHORT_BUFFER_LEN 512
+#define LONG_BUFFER_LEN (1 << 14)
+
+typedef struct
+{ double ratio ;
+ int count ;
+} SRC_PAIR ;
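+/* Each pair is a conversion ratio plus a number of output frames to
+** request.  Ratio changes combined with tiny or zero read counts are the
+** conditions under which src_callback_read() has been known to hang; the
+** one-second alarm() below turns any such hang into a test failure.
+*/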
+
+static void callback_hang_test (int converter) ;
+
+static void alarm_handler (int number) ;
+static long input_callback (void *cb_data, float **data) ;
+
+
+int
+main (void)
+{
+ /* Set up SIGALRM handler. */
+ signal (SIGALRM, alarm_handler) ;
+
+ puts ("") ;
+ callback_hang_test (SRC_ZERO_ORDER_HOLD) ;
+ callback_hang_test (SRC_LINEAR) ;
+ callback_hang_test (SRC_SINC_FASTEST) ;
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+
+static void
+callback_hang_test (int converter)
+{ static float output [LONG_BUFFER_LEN] ;
+ static SRC_PAIR pairs [] =
+ {
+ { 1.2, 5 }, { 1.1, 1 }, { 1.0, 1 }, { 3.0, 1 }, { 2.0, 1 }, { 0.3, 1 },
+ { 1.2, 0 }, { 1.1, 10 }, { 1.0, 1 }
+ } ;
+
+
+ SRC_STATE *src_state ;
+
+ double src_ratio = 1.0 ;
+ int k, error ;
+
+ printf ("\tcallback_hang_test (%-28s) ....... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ /* Perform sample rate conversion. */
+ src_state = src_callback_new (input_callback, converter, 1, &error, NULL) ;
+ if (src_state == NULL)
+ { printf ("\n\nLine %d : src_callback_new () failed : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ for (k = 0 ; k < ARRAY_LEN (pairs) ; k++)
+ { alarm (1) ;
+ src_ratio = pairs [k].ratio ;
+ src_callback_read (src_state, src_ratio, pairs [k].count, output) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ alarm (0) ;
+ puts ("ok") ;
+
+ return ;
+} /* callback_hang_test */
+
+static void
+alarm_handler (int number)
+{
+ (void) number ;
+ printf ("\n\n Error : Hang inside src_callback_read() detected. Exiting!\n\n") ;
+ exit (1) ;
+} /* alarm_handler */
+
+static long
+input_callback (void *cb_data, float **data)
+{
+ static float buffer [20] ;
+
+ (void) cb_data ;
+ *data = buffer ;
+
+ return ARRAY_LEN (buffer) ;
+} /* input_callback */
+
+#else
+
+int
+main (void)
+{
+ puts ("\tCan't run this test on this platform.") ;
+ return 0 ;
+} /* main */
+
+#endif
diff --git a/soxr/lsr-tests/callback_test.c b/soxr/lsr-tests/callback_test.c
new file mode 100644
index 0000000..0854d64
--- /dev/null
+++ b/soxr/lsr-tests/callback_test.c
@@ -0,0 +1,243 @@
+/*
+** Copyright (C) 2003-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 10000
+#define CB_READ_LEN 256
+
+static void callback_test (int converter, double ratio) ;
+static void end_of_stream_test (int converter) ;
+
+int
+main (void)
+{ static double src_ratios [] =
+ { 1.0, 0.099, 0.1, 0.33333333, 0.789, 1.0001, 1.9, 3.1, 9.9
+ } ;
+
+ int k ;
+
+ puts ("") ;
+
+ puts (" Zero Order Hold interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ callback_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ;
+
+ puts (" Linear interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ callback_test (SRC_LINEAR, src_ratios [k]) ;
+
+ puts (" Sinc interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ callback_test (SRC_SINC_FASTEST, src_ratios [k]) ;
+
+ puts ("") ;
+
+ puts (" End of stream test :") ;
+ end_of_stream_test (SRC_ZERO_ORDER_HOLD) ;
+ end_of_stream_test (SRC_LINEAR) ;
+ end_of_stream_test (SRC_SINC_FASTEST) ;
+
+ puts ("") ;
+ return 0 ;
+} /* main */
+
+/*=====================================================================================
+*/
+
+typedef struct
+{ int channels ;
+ long count, total ;
+ int end_of_data ;
+ float data [BUFFER_LEN] ;
+} TEST_CB_DATA ;
+
+static long
+test_callback_func (void *cb_data, float **data)
+{ TEST_CB_DATA *pcb_data ;
+
+ long frames ;
+
+ if ((pcb_data = cb_data) == NULL)
+ return 0 ;
+
+ if (data == NULL)
+ return 0 ;
+
+ if (pcb_data->total - pcb_data->count > CB_READ_LEN)
+ frames = CB_READ_LEN / pcb_data->channels ;
+ else
+ frames = (pcb_data->total - pcb_data->count) / pcb_data->channels ;
+
+ *data = pcb_data->data + pcb_data->count ;
+ pcb_data->count += frames ;
+
+ return frames ;
+} /* test_callback_func */
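+/* The callback API is pull-based: each src_callback_read() call requests up
+** to a given number of output frames and the library invokes
+** test_callback_func() whenever it needs more input; returning 0 frames
+** signals end of input.
+*/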
+
+
+static void
+callback_test (int converter, double src_ratio)
+{ static TEST_CB_DATA test_callback_data ;
+ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+
+ long read_count, read_total ;
+ int error ;
+
+ printf ("\tcallback_test (SRC ratio = %6.4f) ........... ", src_ratio) ;
+ fflush (stdout) ;
+
+ test_callback_data.channels = 2 ;
+ test_callback_data.count = 0 ;
+ test_callback_data.end_of_data = 0 ;
+ test_callback_data.total = ARRAY_LEN (test_callback_data.data) ;
+
+ if ((src_state = src_callback_new (test_callback_func, converter, test_callback_data.channels, &error, &test_callback_data)) == NULL)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ read_total = 0 ;
+ do
+ { /* We will be throwing away output data, so just grab as much as possible. */
+ read_count = ARRAY_LEN (output) / test_callback_data.channels ;
+ read_count = src_callback_read (src_state, src_ratio, read_count, output) ;
+ read_total += read_count ;
+ }
+ while (read_count > 0) ;
+
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (fabs (read_total / src_ratio - ARRAY_LEN (test_callback_data.data)) > 2.0)
+ { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ;
+ printf (" input len : %d\n", ARRAY_LEN (test_callback_data.data)) ;
+ printf (" output len : %ld (should be %g +/- 2)\n\n", read_total,
+ floor (0.5 + src_ratio * ARRAY_LEN (test_callback_data.data))) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* callback_test */
+
+/*=====================================================================================
+*/
+
+static long
+eos_callback_func (void *cb_data, float **data)
+{
+ TEST_CB_DATA *pcb_data ;
+ long frames ;
+
+ if (data == NULL)
+ return 0 ;
+
+ if ((pcb_data = cb_data) == NULL)
+ return 0 ;
+
+ /*
+ ** Return immediately if there is no more data.
+ ** In this case, the output pointer 'data' will not be set and
+ ** valgrind should not warn about it.
+ */
+ if (pcb_data->end_of_data)
+ return 0 ;
+
+ if (pcb_data->total - pcb_data->count > CB_READ_LEN)
+ frames = CB_READ_LEN / pcb_data->channels ;
+ else
+ frames = (pcb_data->total - pcb_data->count) / pcb_data->channels ;
+
+ *data = pcb_data->data + pcb_data->count ;
+ pcb_data->count += frames ;
+
+ /*
+ ** Set end_of_data so that the next call to the callback function will
+	** return a zero count without setting the 'data' pointer.
+ */
+ if (pcb_data->total < 2 * pcb_data->count)
+ pcb_data->end_of_data = 1 ;
+
+ return frames ;
+} /* eos_callback_func */
+
+
+static void
+end_of_stream_test (int converter)
+{ static TEST_CB_DATA test_callback_data ;
+ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+
+ double src_ratio = 0.3 ;
+ long read_count, read_total ;
+ int error ;
+
+ printf ("\t%-30s ........... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ test_callback_data.channels = 2 ;
+ test_callback_data.count = 0 ;
+ test_callback_data.end_of_data = 0 ;
+ test_callback_data.total = ARRAY_LEN (test_callback_data.data) ;
+
+ if ((src_state = src_callback_new (eos_callback_func, converter, test_callback_data.channels, &error, &test_callback_data)) == NULL)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ read_total = 0 ;
+ do
+ { /* We will be throwing away output data, so just grab as much as possible. */
+ read_count = ARRAY_LEN (output) / test_callback_data.channels ;
+ read_count = src_callback_read (src_state, src_ratio, read_count, output) ;
+ read_total += read_count ;
+ }
+ while (read_count > 0) ;
+
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (test_callback_data.end_of_data == 0)
+ { printf ("\n\nLine %d : test_callback_data.end_of_data should not be 0."
+ " This is a bug in the test.\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+ return ;
+} /* end_of_stream_test */
diff --git a/soxr/lsr-tests/cmake/Modules/FindFFTW.cmake b/soxr/lsr-tests/cmake/Modules/FindFFTW.cmake
new file mode 100644
index 0000000..409268e
--- /dev/null
+++ b/soxr/lsr-tests/cmake/Modules/FindFFTW.cmake
@@ -0,0 +1,23 @@
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Find FFTW
+# Find the native installation of this package: includes and libraries.
+#
+# FFTW_INCLUDES - where to find headers for this package.
+# FFTW_LIBRARIES - List of libraries when using this package.
+# FFTW_FOUND - True if this package can be found.
+
+if (FFTW_INCLUDES)
+ set (FFTW_FIND_QUIETLY TRUE)
+endif (FFTW_INCLUDES)
+
+find_path (FFTW_INCLUDES fftw3.h)
+
+find_library (FFTW_LIBRARIES NAMES fftw3)
+
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (
+ FFTW DEFAULT_MSG FFTW_LIBRARIES FFTW_INCLUDES)
+
+mark_as_advanced (FFTW_LIBRARIES FFTW_INCLUDES)
diff --git a/soxr/lsr-tests/cmake/Modules/Findsndfile.cmake b/soxr/lsr-tests/cmake/Modules/Findsndfile.cmake
new file mode 100644
index 0000000..b2fd725
--- /dev/null
+++ b/soxr/lsr-tests/cmake/Modules/Findsndfile.cmake
@@ -0,0 +1,23 @@
+# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+# - Find SNDFILE
+# Find the native installation of this package: includes and libraries.
+#
+# SNDFILE_INCLUDES - where to find headers for this package.
+# SNDFILE_LIBRARIES - List of libraries when using this package.
+# SNDFILE_FOUND - True if this package can be found.
+
+if (SNDFILE_INCLUDES)
+ set (SNDFILE_FIND_QUIETLY TRUE)
+endif (SNDFILE_INCLUDES)
+
+find_path (SNDFILE_INCLUDES sndfile.h)
+
+find_library (SNDFILE_LIBRARIES NAMES sndfile)
+
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (
+ SNDFILE DEFAULT_MSG SNDFILE_LIBRARIES SNDFILE_INCLUDES)
+
+mark_as_advanced (SNDFILE_LIBRARIES SNDFILE_INCLUDES)
diff --git a/soxr/lsr-tests/config.h.in b/soxr/lsr-tests/config.h.in
new file mode 100644
index 0000000..1095e00
--- /dev/null
+++ b/soxr/lsr-tests/config.h.in
@@ -0,0 +1,24 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxsrc_lsr_tests_config_included
+#define soxsrc_lsr_tests_config_included
+
+#cmakedefine01 HAVE_ALARM
+#cmakedefine01 HAVE_FFTW3
+#cmakedefine01 HAVE_LRINTF
+#cmakedefine01 HAVE_LRINT
+#cmakedefine01 HAVE_SIGNAL
+#cmakedefine01 HAVE_SNDFILE
+#cmakedefine01 HAVE_SYS_TIMES_H
+
+#if HAVE_SIGNAL
+  #include <signal.h>
+ #if defined SIGALRM
+ #define HAVE_SIGALRM 1
+ #else
+ #define HAVE_SIGALRM 0
+ #endif
+#endif
+
+#endif
diff --git a/soxr/lsr-tests/downsample_test.c b/soxr/lsr-tests/downsample_test.c
new file mode 100644
index 0000000..87243e7
--- /dev/null
+++ b/soxr/lsr-tests/downsample_test.c
@@ -0,0 +1,61 @@
+/*
+** Copyright (C) 2008-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <samplerate.h>
+
+#include "util.h"
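+
+/* Extreme downsampling (a ratio of 1/255 into a 10-frame output buffer) is
+** the case exercised here; the test only requires src_simple() to succeed,
+** and the converted values themselves are not checked.
+*/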
+
+static void
+downsample_test (int converter)
+{ static float in [1000], out [10] ;
+ SRC_DATA data ;
+
+ printf (" downsample_test (%-28s) ....... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ data.src_ratio = 1.0 / 255.0 ;
+ data.input_frames = ARRAY_LEN (in) ;
+ data.output_frames = ARRAY_LEN (out) ;
+ data.data_in = in ;
+ data.data_out = out ;
+
+ if (src_simple (&data, converter, 1))
+ { puts ("src_simple failed.") ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+} /* downsample_test */
+
+int
+main (void)
+{
+ puts ("") ;
+
+ downsample_test (SRC_ZERO_ORDER_HOLD) ;
+ downsample_test (SRC_LINEAR) ;
+ downsample_test (SRC_SINC_FASTEST) ;
+ downsample_test (SRC_SINC_MEDIUM_QUALITY) ;
+ downsample_test (SRC_SINC_BEST_QUALITY) ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
diff --git a/soxr/lsr-tests/float_cast.h b/soxr/lsr-tests/float_cast.h
new file mode 100644
index 0000000..77ad5b4
--- /dev/null
+++ b/soxr/lsr-tests/float_cast.h
@@ -0,0 +1,281 @@
+/*
+** Copyright (C) 2001-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU Lesser General Public License as published by
+** the Free Software Foundation; either version 2.1 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU Lesser General Public License for more details.
+**
+** You should have received a copy of the GNU Lesser General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+/* Version 1.5 */
+
+#ifndef FLOAT_CAST_HEADER
+#define FLOAT_CAST_HEADER
+
+/*============================================================================
+** On Intel Pentium processors (especially PIII and probably P4), converting
+** from float to int is very slow. To meet the C specs, the code produced by
+** most C compilers targeting Pentium needs to change the FPU rounding mode
+** before the float to int conversion is performed.
+**
+** Changing the FPU rounding mode causes the FPU pipeline to be flushed. It
+** is this flushing of the pipeline which is so slow.
+**
+** Fortunately the ISO C99 specifications define the functions lrint, lrintf,
+** llrint and llrintf which fix this problem as a side effect.
+**
+** On Unix-like systems, the configure process should have detected the
+** presence of these functions. If they weren't found we have to replace them
+** here with a standard C cast.
+*/
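+
+/* Illustrative sketch only: with lrintf() available (or replaced below), a
+** normalised float sample can be converted with, e.g.,
+**
+**     short s = (short) lrintf (32767.0 * f) ;
+**
+** which rounds in the FPU's current rounding mode rather than truncating,
+** avoiding the costly rounding-mode change that a plain cast can incur.
+*/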
+
+/*
+** The C99 prototypes for lrint and lrintf are as follows:
+**
+** long int lrintf (float x) ;
+** long int lrint (double x) ;
+*/
+
+#include "config.h"
+
+/*
+** The presence of the required functions are detected during the configure
+** process and the values HAVE_LRINT and HAVE_LRINTF are set accordingly in
+** the config.h file.
+*/
+
+#define HAVE_LRINT_REPLACEMENT 0
+
+#if (HAVE_LRINT && HAVE_LRINTF)
+
+ /*
+ ** These defines enable functionality introduced with the 1999 ISO C
+ ** standard. They must be defined before the inclusion of math.h to
+ ** engage them. If optimisation is enabled, these functions will be
+ ** inlined. With optimisation switched off, you have to link in the
+ ** maths library using -lm.
+ */
+
+ #define _ISOC9X_SOURCE 1
+ #define _ISOC99_SOURCE 1
+
+ #define __USE_ISOC9X 1
+ #define __USE_ISOC99 1
+
+	#include <math.h>
+
+#elif (defined (__CYGWIN__))
+
+	#include <math.h>
+
+ #undef HAVE_LRINT_REPLACEMENT
+ #define HAVE_LRINT_REPLACEMENT 1
+
+ #undef lrint
+ #undef lrintf
+
+ #define lrint double2int
+ #define lrintf float2int
+
+ /*
+ ** The native CYGWIN lrint and lrintf functions are buggy:
+ ** http://sourceware.org/ml/cygwin/2005-06/msg00153.html
+ ** http://sourceware.org/ml/cygwin/2005-09/msg00047.html
+ ** and slow.
+ ** These functions (pulled from the Public Domain MinGW math.h header)
+ ** replace the native versions.
+ */
+
+ static inline long double2int (double in)
+ { long retval ;
+
+ __asm__ __volatile__
+ ( "fistpl %0"
+ : "=m" (retval)
+ : "t" (in)
+ : "st"
+ ) ;
+
+ return retval ;
+ } /* double2int */
+
+ static inline long float2int (float in)
+ { long retval ;
+
+ __asm__ __volatile__
+ ( "fistpl %0"
+ : "=m" (retval)
+ : "t" (in)
+ : "st"
+ ) ;
+
+ return retval ;
+ } /* float2int */
+
+#elif (defined (WIN64) || defined(_WIN64))
+
+	/* The Win64 section should be placed before the Win32 one, because
+	** most likely both WIN32 and WIN64 will be defined in the 64-bit case.
+ */
+
+	#include <math.h>
+
+ /* Win64 doesn't seem to have these functions, nor inline assembly.
+ ** Therefore implement inline versions of these functions here.
+ */
+	#include <emmintrin.h>
+	#include <mmintrin.h>
+
+ __inline long int
+ lrint(double flt)
+ {
+ return _mm_cvtsd_si32(_mm_load_sd(&flt));
+ }
+
+ __inline long int
+ lrintf(float flt)
+ {
+ return _mm_cvtss_si32(_mm_load_ss(&flt));
+ }
+
+#elif (defined (WIN32) || defined (_WIN32))
+
+ #undef HAVE_LRINT_REPLACEMENT
+ #define HAVE_LRINT_REPLACEMENT 1
+
+	#include <math.h>
+
+ /*
+ ** Win32 doesn't seem to have these functions.
+ ** Therefore implement inline versions of these functions here.
+ */
+
+ __inline long int
+ lrint (double flt)
+ { int intgr ;
+
+ _asm
+ { fld flt
+ fistp intgr
+ } ;
+
+ return intgr ;
+ }
+
+ __inline long int
+ lrintf (float flt)
+ { int intgr ;
+
+ _asm
+ { fld flt
+ fistp intgr
+ } ;
+
+ return intgr ;
+ }
+
+#elif (defined (__MWERKS__) && defined (macintosh))
+
+ /* This MacOS 9 solution was provided by Stephane Letz */
+
+ #undef HAVE_LRINT_REPLACEMENT
+ #define HAVE_LRINT_REPLACEMENT 1
+	#include <math.h>
+
+ #undef lrint
+ #undef lrintf
+
+ #define lrint double2int
+ #define lrintf float2int
+
+ inline int
+ float2int (register float in)
+ { long res [2] ;
+
+ asm
+ { fctiw in, in
+ stfd in, res
+ }
+ return res [1] ;
+ } /* float2int */
+
+ inline int
+ double2int (register double in)
+ { long res [2] ;
+
+ asm
+ { fctiw in, in
+ stfd in, res
+ }
+ return res [1] ;
+ } /* double2int */
+
+#elif (defined (__MACH__) && defined (__APPLE__))
+
+ /* For Apple MacOSX. */
+
+ #undef HAVE_LRINT_REPLACEMENT
+ #define HAVE_LRINT_REPLACEMENT 1
+	#include <math.h>
+
+ #undef lrint
+ #undef lrintf
+
+ #define lrint double2int
+ #define lrintf float2int
+
+ inline static long
+ float2int (register float in)
+ { int res [2] ;
+
+ __asm__ __volatile__
+ ( "fctiw %1, %1\n\t"
+ "stfd %1, %0"
+ : "=m" (res) /* Output */
+ : "f" (in) /* Input */
+ : "memory"
+ ) ;
+
+ return res [1] ;
+ } /* lrintf */
+
+ inline static long
+ double2int (register double in)
+ { int res [2] ;
+
+ __asm__ __volatile__
+ ( "fctiw %1, %1\n\t"
+ "stfd %1, %0"
+ : "=m" (res) /* Output */
+ : "f" (in) /* Input */
+ : "memory"
+ ) ;
+
+ return res [1] ;
+ } /* lrint */
+
+#else
+ #ifndef __sgi
+ #warning "Don't have the functions lrint() and lrintf()."
+ #warning "Replacing these functions with a standard C cast."
+ #endif
+
+	#include <math.h>
+
+ #define lrint(dbl) ((long) (dbl))
+ #define lrintf(flt) ((long) (flt))
+
+#endif
+
+
+#endif /* FLOAT_CAST_HEADER */
+
diff --git a/soxr/lsr-tests/float_short_test.c b/soxr/lsr-tests/float_short_test.c
new file mode 100644
index 0000000..6664a3b
--- /dev/null
+++ b/soxr/lsr-tests/float_short_test.c
@@ -0,0 +1,192 @@
+/*
+** Copyright (C) 2003-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 10000
+
+static void float_to_short_test (void) ;
+static void short_to_float_test (void) ;
+
+static void float_to_int_test (void) ;
+static void int_to_float_test (void) ;
+
+int
+main (void)
+{
+ puts ("") ;
+
+ float_to_short_test () ;
+ short_to_float_test () ;
+
+ float_to_int_test () ;
+ int_to_float_test () ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+/*=====================================================================================
+*/
+
+static void
+float_to_short_test (void)
+{
+ static float fpos [] =
+ { 0.95, 0.99, 1.0, 1.01, 1.1, 2.0, 11.1, 111.1, 2222.2, 33333.3
+ } ;
+ static float fneg [] =
+ { -0.95, -0.99, -1.0, -1.01, -1.1, -2.0, -11.1, -111.1, -2222.2, -33333.3
+ } ;
+
+ static short out [MAX (ARRAY_LEN (fpos), ARRAY_LEN (fneg))] ;
+
+ int k ;
+
+ printf ("\tfloat_to_short_test ............................. ") ;
+
+ src_float_to_short_array (fpos, out, ARRAY_LEN (fpos)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (fpos) ; k++)
+ if (out [k] < 30000)
+ { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ;
+ exit (1) ;
+ } ;
+
+ src_float_to_short_array (fneg, out, ARRAY_LEN (fneg)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (fneg) ; k++)
+ if (out [k] > -30000)
+ { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* float_to_short_test */
+
+/*-------------------------------------------------------------------------------------
+*/
+
+static void
+short_to_float_test (void)
+{
+ static short input [BUFFER_LEN] ;
+ static short output [BUFFER_LEN] ;
+ static float temp [BUFFER_LEN] ;
+
+ int k ;
+
+ printf ("\tshort_to_float_test ............................. ") ;
+
+ for (k = 0 ; k < ARRAY_LEN (input) ; k++)
+ input [k] = (k * 0x8000) / ARRAY_LEN (input) ;
+
+ src_short_to_float_array (input, temp, ARRAY_LEN (temp)) ;
+ src_float_to_short_array (temp, output, ARRAY_LEN (output)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (input) ; k++)
+ if (ABS (input [k] - output [k]) > 0)
+ { printf ("\n\n\tLine %d : index %d %d -> %d\n", __LINE__, k, input [k], output [k]) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* short_to_float_test */
+
+/*=====================================================================================
+*/
+
+static void
+float_to_int_test (void)
+{
+ static float fpos [] =
+ { 0.95, 0.99, 1.0, 1.01, 1.1, 2.0, 11.1, 111.1, 2222.2, 33333.3
+ } ;
+ static float fneg [] =
+ { -0.95, -0.99, -1.0, -1.01, -1.1, -2.0, -11.1, -111.1, -2222.2, -33333.3
+ } ;
+
+ static int out [MAX (ARRAY_LEN (fpos), ARRAY_LEN (fneg))] ;
+
+ int k ;
+
+ printf ("\tfloat_to_int_test ............................... ") ;
+
+ src_float_to_int_array (fpos, out, ARRAY_LEN (fpos)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (fpos) ; k++)
+ if (out [k] < 30000 * 0x10000)
+ { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ;
+ exit (1) ;
+ } ;
+
+ src_float_to_int_array (fneg, out, ARRAY_LEN (fneg)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (fneg) ; k++)
+ if (out [k] > -30000 * 0x1000)
+ { printf ("\n\n\tLine %d : out [%d] == %d\n", __LINE__, k, out [k]) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* float_to_int_test */
+
+/*-------------------------------------------------------------------------------------
+*/
+
+static void
+int_to_float_test (void)
+{
+ static int input [BUFFER_LEN] ;
+ static int output [BUFFER_LEN] ;
+ static float temp [BUFFER_LEN] ;
+
+ int k ;
+
+ printf ("\tint_to_float_test ............................... ") ;
+
+ for (k = 0 ; k < ARRAY_LEN (input) ; k++)
+ input [k] = (k * 0x80000000) / ARRAY_LEN (input) ;
+
+ src_int_to_float_array (input, temp, ARRAY_LEN (temp)) ;
+ src_float_to_int_array (temp, output, ARRAY_LEN (output)) ;
+
+ for (k = 0 ; k < ARRAY_LEN (input) ; k++)
+ if (ABS (input [k] - output [k]) > 0)
+ { printf ("\n\n\tLine %d : index %d %d -> %d\n", __LINE__, k, input [k], output [k]) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* int_to_float_test */
+
diff --git a/soxr/lsr-tests/misc_test.c b/soxr/lsr-tests/misc_test.c
new file mode 100644
index 0000000..4baa334
--- /dev/null
+++ b/soxr/lsr-tests/misc_test.c
@@ -0,0 +1,175 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+static void name_test (void) ;
+static void error_test (void) ;
+static void src_ratio_test (void) ;
+static void zero_input_test (int converter) ;
+
+int
+main (void)
+{
+ puts ("") ;
+
+ printf (" version : %s\n\n", src_get_version ()) ;
+
+ /* Current max converter is SRC_LINEAR. */
+ name_test () ;
+
+ error_test () ;
+
+ src_ratio_test () ;
+
+ zero_input_test (SRC_ZERO_ORDER_HOLD) ;
+ zero_input_test (SRC_LINEAR) ;
+ zero_input_test (SRC_SINC_FASTEST) ;
+
+ puts ("") ;
+ return 0 ;
+} /* main */
+
+static void
+name_test (void)
+{ const char *name ;
+ int k = 0 ;
+
+ puts (" name_test :") ;
+
+ while (1)
+ { name = src_get_name (k) ;
+ if (name == NULL)
+ break ;
+ printf ("\tName %d : %s\n", k, name) ;
+ printf ("\tDesc %d : %s\n", k, src_get_description (k)) ;
+ k ++ ;
+ } ;
+
+ puts ("") ;
+
+ return ;
+} /* name_test */
+
+/*------------------------------------------------------------------------------
+*/
+
+typedef struct
+{ double ratio ;
+ int should_pass ;
+} RATIO_TEST ;
+
+static RATIO_TEST ratio_test [] =
+{ { 1.0 / 256.1, 0 },
+ { 1.0 / 256.0, 1 },
+ { 1.0, 1 },
+ { 256.0, 1 },
+ { 256.1, 0 },
+ { -1.0, 0 }
+} ;
+
+static void
+src_ratio_test (void)
+{ int k ;
+
+ puts (" src_ratio_test (SRC ratio must be in range [1/256, 256]):" ) ;
+
+
+ for (k = 0 ; k < ARRAY_LEN (ratio_test) ; k++)
+ { if (ratio_test [k].should_pass && src_is_valid_ratio (ratio_test [k].ratio) == 0)
+ { printf ("\n\nLine %d : SRC ratio %f should have passed.\n\n", __LINE__, ratio_test [k].ratio) ;
+ exit (1) ;
+ } ;
+ if (! ratio_test [k].should_pass && src_is_valid_ratio (ratio_test [k].ratio) != 0)
+ { printf ("\n\nLine %d : SRC ratio %f should not have passed.\n\n", __LINE__, ratio_test [k].ratio) ;
+ exit (1) ;
+ } ;
+ printf ("\t SRC ratio (%9.5f) : %s ................... ok\n", ratio_test [k].ratio,
+ (ratio_test [k].should_pass ? "pass" : "fail")) ;
+ } ;
+
+ puts ("") ;
+
+ return ;
+} /* src_ratio_test */
+
+static void
+error_test (void)
+{ const char *errorstr ;
+ int k, errors = 0 ;
+
+ puts (" error_test :") ;
+
+ for (k = 0 ; 1 ; k++)
+ { errorstr = src_strerror (k) ;
+ printf ("\t%-2d : %s\n", k, errorstr) ;
+ if (errorstr == NULL)
+ { errors ++ ;
+ continue ;
+ } ;
+ if (strstr (errorstr, "Placeholder.") == errorstr)
+ break ;
+ } ;
+
+ if (errors != 0)
+ { printf ("\n\nLine %d : Missing error numbers above.\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
+ puts ("") ;
+
+ return ;
+} /* error_test */
+
+static void
+zero_input_test (int converter)
+{ SRC_DATA data ;
+ SRC_STATE *state ;
+ float out [100] ;
+ int error ;
+
+ printf (" %s (%-26s) ........ ", __func__, src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ if ((state = src_new (converter, 1, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new failed : %s.\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ data.data_in = (float *) 0xdeadbeef ;
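+	/* data_in is deliberately an invalid pointer: with input_frames set to
+	** zero below, a correct converter must never dereference it. */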
+ data.input_frames = 0 ;
+ data.data_out = out ;
+ data.output_frames = ARRAY_LEN (out) ;
+ data.end_of_input = 0 ;
+ data.src_ratio = 1.0 ;
+
+ if ((error = src_process (state, &data)))
+	{	printf ("\n\nLine %d : src_process failed : %s.\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ state = src_delete (state) ;
+
+ puts ("ok") ;
+} /* zero_input_test */
diff --git a/soxr/lsr-tests/multi_channel_test.c b/soxr/lsr-tests/multi_channel_test.c
new file mode 100644
index 0000000..1ad9ced
--- /dev/null
+++ b/soxr/lsr-tests/multi_channel_test.c
@@ -0,0 +1,364 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+#define BUFFER_LEN 50000
+#define BLOCK_LEN (12)
+
+#define MAX_CHANNELS 10
+
+static void simple_test (int converter, int channel_count, double target_snr) ;
+static void process_test (int converter, int channel_count, double target_snr) ;
+static void callback_test (int converter, int channel_count, double target_snr) ;
+
+int
+main (void)
+{ double target ;
+ int k ;
+
+ puts ("\n Zero Order Hold interpolator :") ;
+ target = 38.0 ;
+ for (k = 1 ; k <= 3 ; k++)
+ { simple_test (SRC_ZERO_ORDER_HOLD, k, target) ;
+ process_test (SRC_ZERO_ORDER_HOLD, k, target) ;
+ callback_test (SRC_ZERO_ORDER_HOLD, k, target) ;
+ } ;
+
+ puts ("\n Linear interpolator :") ;
+ target = 79.0 ;
+ for (k = 1 ; k <= 3 ; k++)
+ { simple_test (SRC_LINEAR, k, target) ;
+ process_test (SRC_LINEAR, k, target) ;
+ callback_test (SRC_LINEAR, k, target) ;
+ } ;
+
+ puts ("\n Sinc interpolator :") ;
+ target = 100.0 ;
+ for (k = 1 ; k <= MAX_CHANNELS ; k++)
+ { simple_test (SRC_SINC_FASTEST, k, target) ;
+ process_test (SRC_SINC_FASTEST, k, target) ;
+ callback_test (SRC_SINC_FASTEST, k, target) ;
+ } ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+/*==============================================================================
+*/
+
+static float input_serial [BUFFER_LEN * MAX_CHANNELS] ;
+static float input_interleaved [BUFFER_LEN * MAX_CHANNELS] ;
+static float output_interleaved [BUFFER_LEN * MAX_CHANNELS] ;
+static float output_serial [BUFFER_LEN * MAX_CHANNELS] ;
+
+static void
+simple_test (int converter, int channel_count, double target_snr)
+{ SRC_DATA src_data ;
+
+ double freq, snr ;
+ int ch, error, frames ;
+
+ printf ("\t%-22s (%2d channel%c) ............ ", "simple_test", channel_count, channel_count > 1 ? 's' : ' ') ;
+ fflush (stdout) ;
+
+ assert (channel_count <= MAX_CHANNELS) ;
+
+ memset (input_serial, 0, sizeof (input_serial)) ;
+ memset (input_interleaved, 0, sizeof (input_interleaved)) ;
+ memset (output_interleaved, 0, sizeof (output_interleaved)) ;
+ memset (output_serial, 0, sizeof (output_serial)) ;
+
+ frames = BUFFER_LEN ;
+
+ /* Calculate channel_count separate windowed sine waves. */
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { freq = (200.0 + 33.333333333 * ch) / 44100.0 ;
+ gen_windowed_sines (1, &freq, 1.0, input_serial + ch * frames, frames) ;
+ } ;
+
+ /* Interleave the data in preparation for SRC. */
+ interleave_data (input_serial, input_interleaved, frames, channel_count) ;
+
+	/* Choose a conversion ratio <= 1.0. */
+ src_data.src_ratio = 0.95 ;
+
+ src_data.data_in = input_interleaved ;
+ src_data.input_frames = frames ;
+
+ src_data.data_out = output_interleaved ;
+ src_data.output_frames = frames ;
+
+ if ((error = src_simple (&src_data, converter, channel_count)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
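+	/* The converter is allowed to generate up to 2 frames more or fewer than ratio * input. */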
+ if (fabs (src_data.output_frames_gen - src_data.src_ratio * src_data.input_frames) > 2)
+ { printf ("\n\nLine %d : bad output data length %ld should be %d.\n", __LINE__,
+ src_data.output_frames_gen, (int) floor (src_data.src_ratio * src_data.input_frames)) ;
+ printf ("\tsrc_ratio : %.4f\n", src_data.src_ratio) ;
+ printf ("\tinput_len : %ld\n", src_data.input_frames) ;
+ printf ("\toutput_len : %ld\n\n", src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ /* De-interleave data so SNR can be calculated for each channel. */
+ deinterleave_data (output_interleaved, output_serial, frames, channel_count) ;
+
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { snr = calculate_snr (output_serial + ch * frames, frames, 1) ;
+ if (snr < target_snr)
+ { printf ("\n\nLine %d: channel %d snr %f should be %f\n", __LINE__, ch, snr, target_snr) ;
+ save_oct_float ("output.dat", input_serial, channel_count * frames, output_serial, channel_count * frames) ;
+ exit (1) ;
+ } ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* simple_test */
+
+/*==============================================================================
+*/
+
+static void
+process_test (int converter, int channel_count, double target_snr)
+{ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+
+ double freq, snr ;
+ int ch, error, frames, current_in, current_out ;
+
+ printf ("\t%-22s (%2d channel%c) ............ ", "process_test", channel_count, channel_count > 1 ? 's' : ' ') ;
+ fflush (stdout) ;
+
+ assert (channel_count <= MAX_CHANNELS) ;
+
+ memset (input_serial, 0, sizeof (input_serial)) ;
+ memset (input_interleaved, 0, sizeof (input_interleaved)) ;
+ memset (output_interleaved, 0, sizeof (output_interleaved)) ;
+ memset (output_serial, 0, sizeof (output_serial)) ;
+
+ frames = BUFFER_LEN ;
+
+ /* Calculate channel_count separate windowed sine waves. */
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { freq = (400.0 + 11.333333333 * ch) / 44100.0 ;
+ gen_windowed_sines (1, &freq, 1.0, input_serial + ch * frames, frames) ;
+ } ;
+
+ /* Interleave the data in preparation for SRC. */
+ interleave_data (input_serial, input_interleaved, frames, channel_count) ;
+
+ /* Perform sample rate conversion. */
+ if ((src_state = src_new (converter, channel_count, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new() failed : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 0 ; /* Set this later. */
+
+	/* Choose a conversion ratio < 1.0. */
+ src_data.src_ratio = 0.95 ;
+
+ src_data.data_in = input_interleaved ;
+ src_data.data_out = output_interleaved ;
+
+ current_in = current_out = 0 ;
+
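+	/* Feed and drain the converter in BLOCK_LEN sized chunks until all input is consumed. */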
+ while (1)
+ { src_data.input_frames = MAX (MIN (BLOCK_LEN, frames - current_in), 0) ;
+ src_data.output_frames = MAX (MIN (BLOCK_LEN, frames - current_out), 0) ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.end_of_input && src_data.output_frames_gen == 0)
+ break ;
+
+ current_in += src_data.input_frames_used ;
+ current_out += src_data.output_frames_gen ;
+
+ src_data.data_in += src_data.input_frames_used * channel_count ;
+ src_data.data_out += src_data.output_frames_gen * channel_count ;
+
+ src_data.end_of_input = (current_in >= frames) ? 1 : 0 ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (fabs (current_out - src_data.src_ratio * current_in) > 2)
+ { printf ("\n\nLine %d : bad output data length %d should be %d.\n", __LINE__,
+ current_out, (int) floor (src_data.src_ratio * current_in)) ;
+ printf ("\tsrc_ratio : %.4f\n", src_data.src_ratio) ;
+ printf ("\tinput_len : %d\n", frames) ;
+ printf ("\toutput_len : %d\n\n", current_out) ;
+ exit (1) ;
+ } ;
+
+ /* De-interleave data so SNR can be calculated for each channel. */
+ deinterleave_data (output_interleaved, output_serial, frames, channel_count) ;
+
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { snr = calculate_snr (output_serial + ch * frames, frames, 1) ;
+ if (snr < target_snr)
+ { printf ("\n\nLine %d: channel %d snr %f should be %f\n", __LINE__, ch, snr, target_snr) ;
+ save_oct_float ("output.dat", input_serial, channel_count * frames, output_serial, channel_count * frames) ;
+ exit (1) ;
+ } ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* process_test */
+
+/*==============================================================================
+*/
+
+typedef struct
+{ int channels ;
+ long total_frames ;
+ long current_frame ;
+ float *data ;
+} TEST_CB_DATA ;
+
+static long
+test_callback_func (void *cb_data, float **data)
+{ TEST_CB_DATA *pcb_data ;
+
+ long frames ;
+
+ if ((pcb_data = cb_data) == NULL)
+ return 0 ;
+
+ if (data == NULL)
+ return 0 ;
+
+ *data = pcb_data->data + (pcb_data->current_frame * pcb_data->channels) ;
+
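+	/* Supply at most BLOCK_LEN frames per callback invocation. */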
+ if (pcb_data->total_frames - pcb_data->current_frame < BLOCK_LEN)
+ frames = pcb_data->total_frames - pcb_data->current_frame ;
+ else
+ frames = BLOCK_LEN ;
+
+ pcb_data->current_frame += frames ;
+
+ return frames ;
+} /* test_callback_func */
+
+static void
+callback_test (int converter, int channel_count, double target_snr)
+{ TEST_CB_DATA test_callback_data ;
+ SRC_STATE *src_state = NULL ;
+
+ double freq, snr, src_ratio ;
+ int ch, error, frames, read_total, read_count ;
+
+ printf ("\t%-22s (%2d channel%c) ............ ", "callback_test", channel_count, channel_count > 1 ? 's' : ' ') ;
+ fflush (stdout) ;
+
+ assert (channel_count <= MAX_CHANNELS) ;
+
+ memset (input_serial, 0, sizeof (input_serial)) ;
+ memset (input_interleaved, 0, sizeof (input_interleaved)) ;
+ memset (output_interleaved, 0, sizeof (output_interleaved)) ;
+ memset (output_serial, 0, sizeof (output_serial)) ;
+ memset (&test_callback_data, 0, sizeof (test_callback_data)) ;
+
+ frames = BUFFER_LEN ;
+
+ /* Calculate channel_count separate windowed sine waves. */
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { freq = (200.0 + 33.333333333 * ch) / 44100.0 ;
+ gen_windowed_sines (1, &freq, 1.0, input_serial + ch * frames, frames) ;
+ } ;
+
+ /* Interleave the data in preparation for SRC. */
+ interleave_data (input_serial, input_interleaved, frames, channel_count) ;
+
+ /* Perform sample rate conversion. */
+ src_ratio = 0.95 ;
+ test_callback_data.channels = channel_count ;
+ test_callback_data.total_frames = frames ;
+ test_callback_data.current_frame = 0 ;
+ test_callback_data.data = input_interleaved ;
+
+ if ((src_state = src_callback_new (test_callback_func, converter, channel_count, &error, &test_callback_data)) == NULL)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
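+	/* Pull converted output ; src_callback_read () invokes the callback whenever it needs more input. */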
+ read_total = 0 ;
+ while (read_total < frames)
+ { read_count = src_callback_read (src_state, src_ratio, frames - read_total, output_interleaved + read_total * channel_count) ;
+
+ if (read_count <= 0)
+ break ;
+
+ read_total += read_count ;
+ } ;
+
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (fabs (read_total - src_ratio * frames) > 2)
+ { printf ("\n\nLine %d : bad output data length %d should be %d.\n", __LINE__,
+ read_total, (int) floor (src_ratio * frames)) ;
+ printf ("\tsrc_ratio : %.4f\n", src_ratio) ;
+ printf ("\tinput_len : %d\n", frames) ;
+ printf ("\toutput_len : %d\n\n", read_total) ;
+ exit (1) ;
+ } ;
+
+ /* De-interleave data so SNR can be calculated for each channel. */
+ deinterleave_data (output_interleaved, output_serial, frames, channel_count) ;
+
+ for (ch = 0 ; ch < channel_count ; ch++)
+ { snr = calculate_snr (output_serial + ch * frames, frames, 1) ;
+ if (snr < target_snr)
+ { printf ("\n\nLine %d: channel %d snr %f should be %f\n", __LINE__, ch, snr, target_snr) ;
+ save_oct_float ("output.dat", input_serial, channel_count * frames, output_serial, channel_count * frames) ;
+ exit (1) ;
+ } ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* callback_test */
+
diff --git a/soxr/lsr-tests/multichan_throughput_test.c b/soxr/lsr-tests/multichan_throughput_test.c
new file mode 100644
index 0000000..523139e
--- /dev/null
+++ b/soxr/lsr-tests/multichan_throughput_test.c
@@ -0,0 +1,216 @@
+/*
+** Copyright (C) 2008-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "config.h"
+
+#include "util.h"
+#include "float_cast.h"
+
+#define BUFFER_LEN (1<<17)
+
+static float input [BUFFER_LEN] ;
+static float output [BUFFER_LEN] ;
+
+static long
+throughput_test (int converter, int channels, long best_throughput)
+{ SRC_DATA src_data ;
+ clock_t start_time, clock_time ;
+ double duration ;
+ long total_frames = 0, throughput ;
+ int error ;
+
+ printf (" %-30s %2d ", src_get_name (converter), channels) ;
+ fflush (stdout) ;
+
+ src_data.data_in = input ;
+ src_data.input_frames = ARRAY_LEN (input) / channels ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) / channels ;
+
+ src_data.src_ratio = 0.99 ;
+
+ sleep (2) ;
+
+ start_time = clock () ;
+
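+	/* Keep converting the same buffer until at least 5 seconds of CPU time have been consumed. */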
+ do
+ {
+ if ((error = src_simple (&src_data, converter, channels)) != 0)
+ { puts (src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ total_frames += src_data.output_frames_gen ;
+
+ clock_time = clock () - start_time ;
+ duration = (1.0 * clock_time) / CLOCKS_PER_SEC ;
+ }
+ while (duration < 5.0) ;
+
+ if (src_data.input_frames_used != src_data.input_frames)
+ { printf ("\n\nLine %d : input frames used %ld should be %ld\n", __LINE__, src_data.input_frames_used, src_data.input_frames) ;
+ exit (1) ;
+ } ;
+
+ if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2)
+ { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ;
+ printf (" input len : %d\n", ARRAY_LEN (input) / channels) ;
+ printf (" output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen,
+ floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ;
+ exit (1) ;
+ } ;
+
+ throughput = lrint (floor (total_frames / duration)) ;
+
+ if (best_throughput == 0)
+ { best_throughput = MAX (throughput, best_throughput) ;
+ printf ("%5.2f %10ld\n", duration, throughput) ;
+ }
+ else
+ { best_throughput = MAX (throughput, best_throughput) ;
+ printf ("%5.2f %10ld %10ld\n", duration, throughput, best_throughput) ;
+ }
+
+ return best_throughput ;
+} /* throughput_test */
+
+static void
+single_run (void)
+{ const int max_channels = 10 ;
+ int k ;
+
+ printf ("\n CPU name : %s\n", get_cpu_name ()) ;
+
+ puts (
+ "\n"
+ " Converter Channels Duration Throughput\n"
+ " ---------------------------------------------------------------------"
+ ) ;
+
+ for (k = 1 ; k <= max_channels / 2 ; k++)
+ throughput_test (SRC_SINC_FASTEST, k, 0) ;
+
+ puts ("") ;
+ for (k = 1 ; k <= max_channels / 2 ; k++)
+ throughput_test (SRC_SINC_MEDIUM_QUALITY, k, 0) ;
+
+ puts ("") ;
+ for (k = 1 ; k <= max_channels ; k++)
+ throughput_test (SRC_SINC_BEST_QUALITY, k, 0) ;
+
+ puts ("") ;
+ return ;
+} /* single_run */
+
+static void
+multi_run (int run_count)
+{ int k, ch ;
+
+ printf ("\n CPU name : %s\n", get_cpu_name ()) ;
+
+ puts (
+ "\n"
+ " Converter Channels Duration Throughput Best Throughput\n"
+ " ----------------------------------------------------------------------------------------"
+ ) ;
+
+ for (ch = 1 ; ch <= 5 ; ch++)
+ { long sinc_fastest = 0, sinc_medium = 0, sinc_best = 0 ;
+
+ for (k = 0 ; k < run_count ; k++)
+ { sinc_fastest = throughput_test (SRC_SINC_FASTEST, ch, sinc_fastest) ;
+ sinc_medium = throughput_test (SRC_SINC_MEDIUM_QUALITY, ch, sinc_medium) ;
+ sinc_best = throughput_test (SRC_SINC_BEST_QUALITY, ch, sinc_best) ;
+
+ puts ("") ;
+
+ /* Let the CPU cool down. We might be running on a laptop. */
+ sleep (10) ;
+ } ;
+
+ puts (
+ "\n"
+ " Converter Best Throughput\n"
+ " ------------------------------------------------"
+ ) ;
+
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_FASTEST), sinc_fastest) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_MEDIUM_QUALITY), sinc_medium) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_BEST_QUALITY), sinc_best) ;
+ } ;
+
+ puts ("") ;
+} /* multi_run */
+
+static void
+usage_exit (const char * argv0)
+{ const char * cptr ;
+
+ if ((cptr = strrchr (argv0, '/')) != NULL)
+		argv0 = cptr + 1 ;
+
+ printf (
+ "Usage :\n"
+ " %s - Single run of the throughput test.\n"
+		"    %s --best-of N  - Do N runs of the test and print the best result.\n"
+ "\n",
+ argv0, argv0) ;
+
+ exit (0) ;
+} /* usage_exit */
+
+int
+main (int argc, char ** argv)
+{ double freq ;
+
+ memset (input, 0, sizeof (input)) ;
+ freq = 0.01 ;
+ gen_windowed_sines (1, &freq, 1.0, input, BUFFER_LEN) ;
+
+ if (argc == 1)
+ single_run () ;
+ else if (argc == 3 && strcmp (argv [1], "--best-of") == 0)
+ { int run_count = atoi (argv [2]) ;
+
+ if (run_count < 1 || run_count > 20)
+		{	printf ("Please be sensible. Run count should be in the range [1, 20].\n") ;
+ exit (1) ;
+ } ;
+
+ multi_run (run_count) ;
+ }
+ else
+ usage_exit (argv [0]) ;
+
+ puts (
+ " Duration is in seconds.\n"
+ " Throughput is in frames/sec (more is better).\n"
+ ) ;
+
+ return 0 ;
+} /* main */
+
diff --git a/soxr/lsr-tests/reset_test.c b/soxr/lsr-tests/reset_test.c
new file mode 100644
index 0000000..40485c2
--- /dev/null
+++ b/soxr/lsr-tests/reset_test.c
@@ -0,0 +1,238 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 2048
+#define CB_READ_LEN 256
+
+static void process_reset_test (int converter) ;
+static void callback_reset_test (int converter) ;
+
+static float data_one [BUFFER_LEN] ;
+static float data_zero [BUFFER_LEN] ;
+
+int
+main (void)
+{
+ puts ("") ;
+
+ process_reset_test (SRC_ZERO_ORDER_HOLD) ;
+ process_reset_test (SRC_LINEAR) ;
+ process_reset_test (SRC_SINC_FASTEST) ;
+
+ callback_reset_test (SRC_ZERO_ORDER_HOLD) ;
+ callback_reset_test (SRC_LINEAR) ;
+ callback_reset_test (SRC_SINC_FASTEST) ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+static void
+process_reset_test (int converter)
+{ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+ int k, error ;
+
+ printf ("\tprocess_reset_test (%-28s) ....... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ for (k = 0 ; k < BUFFER_LEN ; k++)
+ { data_one [k] = 1.0 ;
+ data_zero [k] = 0.0 ;
+ } ;
+
+ /* Get a converter. */
+ if ((src_state = src_new (converter, 1, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new() failed : %s.\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ /* Process a bunch of 1.0 valued samples. */
+ src_data.data_in = data_one ;
+ src_data.data_out = output ;
+ src_data.input_frames = BUFFER_LEN ;
+ src_data.output_frames = BUFFER_LEN ;
+ src_data.src_ratio = 0.9 ;
+ src_data.end_of_input = 1 ;
+
+ if ((error = src_process (src_state, &src_data)) != 0)
+	{	printf ("\n\nLine %d : src_process () returned error : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ /* Reset the state of the converter.*/
+ src_reset (src_state) ;
+
+ /* Now process some zero data. */
+ src_data.data_in = data_zero ;
+ src_data.data_out = output ;
+ src_data.input_frames = BUFFER_LEN ;
+ src_data.output_frames = BUFFER_LEN ;
+ src_data.src_ratio = 0.9 ;
+ src_data.end_of_input = 1 ;
+
+ if ((error = src_process (src_state, &src_data)) != 0)
+	{	printf ("\n\nLine %d : src_process () returned error : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+	/* Finally make sure that the output data is zero, i.e. the reset was successful. */
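+	/* Only the first half is checked ; if the reset failed, leftover 1.0 valued data would appear at the start of the output. */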
+ for (k = 0 ; k < BUFFER_LEN / 2 ; k++)
+ if (output [k] != 0.0)
+ { printf ("\n\nLine %d : output [%d] should be 0.0, is %f.\n", __LINE__, k, output [k]) ;
+ exit (1) ;
+ } ;
+
+ /* Make sure that this function has been exported. */
+ src_set_ratio (src_state, 1.0) ;
+
+ /* Delete converter. */
+ src_state = src_delete (src_state) ;
+
+ puts ("ok") ;
+} /* process_reset_test */
+
+/*==============================================================================
+*/
+
+typedef struct
+{ int channels ;
+ long count, total ;
+ float *data ;
+} TEST_CB_DATA ;
+
+static long
+test_callback_func (void *cb_data, float **data)
+{ TEST_CB_DATA *pcb_data ;
+
+ long frames ;
+
+ if ((pcb_data = cb_data) == NULL)
+ return 0 ;
+
+ if (data == NULL)
+ return 0 ;
+
+ if (pcb_data->total - pcb_data->count > 0)
+ frames = pcb_data->total - pcb_data->count ;
+ else
+ frames = 0 ;
+
+ *data = pcb_data->data + pcb_data->count ;
+ pcb_data->count += frames ;
+
+ return frames ;
+} /* test_callback_func */
+
+static void
+callback_reset_test (int converter)
+{ static TEST_CB_DATA test_callback_data ;
+
+ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+
+ double src_ratio = 1.1 ;
+ long read_count, read_total ;
+ int k, error ;
+
+ printf ("\tcallback_reset_test (%-28s) ....... ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ for (k = 0 ; k < ARRAY_LEN (data_one) ; k++)
+ { data_one [k] = 1.0 ;
+ data_zero [k] = 0.0 ;
+ } ;
+
+ if ((src_state = src_callback_new (test_callback_func, converter, 1, &error, &test_callback_data)) == NULL)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ /* Process a bunch of 1.0 valued samples. */
+ test_callback_data.channels = 1 ;
+ test_callback_data.count = 0 ;
+ test_callback_data.total = ARRAY_LEN (data_one) ;
+ test_callback_data.data = data_one ;
+
+ read_total = 0 ;
+ do
+ { read_count = (ARRAY_LEN (output) - read_total > CB_READ_LEN) ? CB_READ_LEN : ARRAY_LEN (output) - read_total ;
+ read_count = src_callback_read (src_state, src_ratio, read_count, output + read_total) ;
+ read_total += read_count ;
+ }
+ while (read_count > 0) ;
+
+ /* Check for errors. */
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ /* Reset the state of the converter.*/
+ src_reset (src_state) ;
+
+ /* Process a bunch of 0.0 valued samples. */
+ test_callback_data.channels = 1 ;
+ test_callback_data.count = 0 ;
+ test_callback_data.total = ARRAY_LEN (data_zero) ;
+ test_callback_data.data = data_zero ;
+
+ /* Now process some zero data. */
+ read_total = 0 ;
+ do
+ { read_count = (ARRAY_LEN (output) - read_total > CB_READ_LEN) ? CB_READ_LEN : ARRAY_LEN (output) - read_total ;
+ read_count = src_callback_read (src_state, src_ratio, read_count, output + read_total) ;
+ read_total += read_count ;
+ }
+ while (read_count > 0) ;
+
+ /* Check for errors. */
+ if ((error = src_error (src_state)) != 0)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+	/* Finally make sure that the output data is zero, i.e. the reset was successful. */
+ for (k = 0 ; k < BUFFER_LEN / 2 ; k++)
+ if (output [k] != 0.0)
+ { printf ("\n\nLine %d : output [%d] should be 0.0, is %f.\n\n", __LINE__, k, output [k]) ;
+ save_oct_float ("output.dat", data_one, ARRAY_LEN (data_one), output, ARRAY_LEN (output)) ;
+ exit (1) ;
+ } ;
+
+ /* Make sure that this function has been exported. */
+ src_set_ratio (src_state, 1.0) ;
+
+ /* Delete converter. */
+ src_state = src_delete (src_state) ;
+
+ puts ("ok") ;
+} /* callback_reset_test */
+
+
diff --git a/soxr/lsr-tests/simple_test.c b/soxr/lsr-tests/simple_test.c
new file mode 100644
index 0000000..91dcde3
--- /dev/null
+++ b/soxr/lsr-tests/simple_test.c
@@ -0,0 +1,117 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 2048
+
+static void simple_test (int converter, double ratio) ;
+
+int
+main (void)
+{ static double src_ratios [] =
+ { 1.0001, 0.099, 0.1, 0.33333333, 0.789, 1.9, 3.1, 9.9
+ } ;
+
+ int k ;
+
+ puts ("") ;
+
+ puts (" Zero Order Hold interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ simple_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ;
+
+ puts (" Linear interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ simple_test (SRC_LINEAR, src_ratios [k]) ;
+
+ puts (" Sinc interpolator :") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ simple_test (SRC_SINC_FASTEST, src_ratios [k]) ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+static void
+simple_test (int converter, double src_ratio)
+{ static float input [BUFFER_LEN], output [BUFFER_LEN] ;
+
+ SRC_DATA src_data ;
+
+ int input_len, output_len, error, terminate ;
+
+ printf ("\tsimple_test (SRC ratio = %6.4f) ........... ", src_ratio) ;
+ fflush (stdout) ;
+
+	/* Calculate maximum input and output lengths. */
+ if (src_ratio >= 1.0)
+ { output_len = BUFFER_LEN ;
+ input_len = (int) floor (BUFFER_LEN / src_ratio) ;
+ }
+ else
+ { input_len = BUFFER_LEN ;
+ output_len = (int) floor (BUFFER_LEN * src_ratio) ;
+ } ;
+
+ /* Reduce input_len by 10 so output is longer than necessary. */
+ input_len -= 10 ;
+
+ if (output_len > BUFFER_LEN)
+ { printf ("\n\nLine %d : output_len > BUFFER_LEN\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
+ memset (&src_data, 0, sizeof (src_data)) ;
+
+ src_data.data_in = input ;
+ src_data.input_frames = input_len ;
+
+ src_data.src_ratio = src_ratio ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = BUFFER_LEN ;
+
+ if ((error = src_simple (&src_data, converter, 1)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
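+	/* Allow slack of 2 * terminate frames ; the worst case grows with the conversion ratio. */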
+ terminate = (int) ceil ((src_ratio >= 1.0) ? src_ratio : 1.0 / src_ratio) ;
+
+ if (fabs (src_data.output_frames_gen - src_ratio * input_len) > 2 * terminate)
+ { printf ("\n\nLine %d : bad output data length %ld should be %d.\n", __LINE__,
+ src_data.output_frames_gen, (int) floor (src_ratio * input_len)) ;
+ printf ("\tsrc_ratio : %.4f\n", src_ratio) ;
+ printf ("\tinput_len : %d\n\toutput_len : %d\n\n", input_len, output_len) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* simple_test */
+
diff --git a/soxr/lsr-tests/sndfile-resample.c b/soxr/lsr-tests/sndfile-resample.c
new file mode 100644
index 0000000..63d179c
--- /dev/null
+++ b/soxr/lsr-tests/sndfile-resample.c
@@ -0,0 +1,332 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+
+#if (HAVE_SNDFILE)
+
+#include <samplerate.h>
+#include <sndfile.h>
+
+#define DEFAULT_CONVERTER SRC_SINC_MEDIUM_QUALITY
+
+#define BUFFER_LEN 4096 /*-(1<<16)-*/
+
+static void usage_exit (const char *progname) ;
+static sf_count_t sample_rate_convert (SNDFILE *infile, SNDFILE *outfile, int converter, double src_ratio, int channels, double * gain) ;
+static double apply_gain (float * data, long frames, int channels, double max, double gain) ;
+
+int
+main (int argc, char *argv [])
+{ SNDFILE *infile, *outfile = NULL ;
+ SF_INFO sfinfo ;
+
+ sf_count_t count ;
+ double src_ratio = -1.0, gain = 1.0 ;
+ int new_sample_rate = -1, k, converter, max_speed = SF_FALSE ;
+
+ if (argc == 2 && strcmp (argv [1], "--version") == 0)
+ { char buffer [64], *cptr ;
+
+ if ((cptr = strrchr (argv [0], '/')) != NULL)
+ argv [0] = cptr + 1 ;
+ if ((cptr = strrchr (argv [0], '\\')) != NULL)
+ argv [0] = cptr + 1 ;
+
+ sf_command (NULL, SFC_GET_LIB_VERSION, buffer, sizeof (buffer)) ;
+
+ printf ("%s (%s,%s)\n", argv [0], src_get_version (), buffer) ;
+ exit (0) ;
+ } ;
+
+ if (argc != 5 && argc != 7 && argc != 8)
+ usage_exit (argv [0]) ;
+
+ /* Set default converter. */
+ converter = DEFAULT_CONVERTER ;
+
+ for (k = 1 ; k < argc - 2 ; k++)
+ { if (strcmp (argv [k], "--max-speed") == 0)
+ max_speed = SF_TRUE ;
+ else if (strcmp (argv [k], "-to") == 0)
+ { k ++ ;
+ new_sample_rate = atoi (argv [k]) ;
+ }
+ else if (strcmp (argv [k], "-by") == 0)
+ { k ++ ;
+ src_ratio = atof (argv [k]) ;
+ }
+ else if (strcmp (argv [k], "-c") == 0)
+ { k ++ ;
+ converter = atoi (argv [k]) ;
+ }
+ else
+ usage_exit (argv [0]) ;
+ } ;
+
+ if (new_sample_rate <= 0 && src_ratio <= 0.0)
+ usage_exit (argv [0]) ;
+
+ if (src_get_name (converter) == NULL)
+ { printf ("Error : bad converter number.\n") ;
+ usage_exit (argv [0]) ;
+ } ;
+
+ if (strcmp (argv [argc - 2], argv [argc - 1]) == 0)
+ { printf ("Error : input and output file names are the same.\n") ;
+ exit (1) ;
+ } ;
+
+ if ((infile = sf_open (argv [argc - 2], SFM_READ, &sfinfo)) == NULL)
+ { printf ("Error : Not able to open input file '%s'\n", argv [argc - 2]) ;
+ exit (1) ;
+ } ;
+
+ printf ("Input File : %s\n", argv [argc - 2]) ;
+ printf ("Sample Rate : %d\n", sfinfo.samplerate) ;
+ printf ("Input Frames : %ld\n\n", (long) sfinfo.frames) ;
+
+ if (new_sample_rate > 0)
+ { src_ratio = (1.0 * new_sample_rate) / sfinfo.samplerate ;
+ sfinfo.samplerate = new_sample_rate ;
+ }
+ else if (src_is_valid_ratio (src_ratio))
+ sfinfo.samplerate = (int) floor (sfinfo.samplerate * src_ratio) ;
+ else
+ { printf ("Not able to determine new sample rate. Exiting.\n") ;
+ sf_close (infile) ;
+ exit (1) ;
+ } ;
+
+ if (fabs (src_ratio - 1.0) < 1e-20)
+ { printf ("Target samplerate and input samplerate are the same. Exiting.\n") ;
+ sf_close (infile) ;
+ exit (0) ;
+ } ;
+
+ printf ("SRC Ratio : %f\n", src_ratio) ;
+ printf ("Converter : %s\n\n", src_get_name (converter)) ;
+
+ if (src_is_valid_ratio (src_ratio) == 0)
+ { printf ("Error : Sample rate change out of valid range.\n") ;
+ sf_close (infile) ;
+ exit (1) ;
+ } ;
+
+	/* Delete the output file if it already exists. */
+ remove (argv [argc - 1]) ;
+
+ printf ("Output file : %s\n", argv [argc - 1]) ;
+ printf ("Sample Rate : %d\n", sfinfo.samplerate) ;
+
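+	/* sample_rate_convert () returns a negative count if the output clipped ; the gain is then reduced and the whole conversion redone. */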
+ do
+ { sf_close (outfile) ;
+
+ if ((outfile = sf_open (argv [argc - 1], SFM_WRITE, &sfinfo)) == NULL)
+ { printf ("Error : Not able to open output file '%s'\n", argv [argc - 1]) ;
+ sf_close (infile) ;
+ exit (1) ;
+ } ;
+
+ if (max_speed)
+ { /* This is mainly for the comparison program tests/src-evaluate.c */
+ sf_command (outfile, SFC_SET_ADD_PEAK_CHUNK, NULL, SF_FALSE) ;
+ }
+ else
+ { /* Update the file header after every write. */
+ sf_command (outfile, SFC_SET_UPDATE_HEADER_AUTO, NULL, SF_TRUE) ;
+ } ;
+
+ sf_command (outfile, SFC_SET_CLIPPING, NULL, SF_TRUE) ;
+
+ count = sample_rate_convert (infile, outfile, converter, src_ratio, sfinfo.channels, &gain) ;
+ }
+ while (count < 0) ;
+
+ printf ("Output Frames : %ld\n\n", (long) count) ;
+
+ sf_close (infile) ;
+ sf_close (outfile) ;
+
+ return 0 ;
+} /* main */
+
+/*==============================================================================
+*/
+
+static sf_count_t
+sample_rate_convert (SNDFILE *infile, SNDFILE *outfile, int converter, double src_ratio, int channels, double * gain)
+{ static float input [BUFFER_LEN] ;
+ static float output [BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+ int error ;
+ double max = 0.0 ;
+ sf_count_t output_count = 0 ;
+
+ sf_seek (infile, 0, SEEK_SET) ;
+ sf_seek (outfile, 0, SEEK_SET) ;
+
+ /* Initialize the sample rate converter. */
+ if ((src_state = src_new (converter, channels, &error)) == NULL)
+ { printf ("\n\nError : src_new() failed : %s.\n\n", src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 0 ; /* Set this later. */
+
+ /* Start with zero to force load in while loop. */
+ src_data.input_frames = 0 ;
+ src_data.data_in = input ;
+
+ src_data.src_ratio = src_ratio ;
+
+ src_data.data_out = output ;
+	src_data.output_frames = BUFFER_LEN / channels ;
+
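+	/* Refill the input buffer whenever it runs dry, convert, then write out whatever was generated. */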
+ while (1)
+ {
+ /* If the input buffer is empty, refill it. */
+ if (src_data.input_frames == 0)
+ { src_data.input_frames = sf_readf_float (infile, input, BUFFER_LEN / channels) ;
+ src_data.data_in = input ;
+
+			/* The last read will not be a full buffer, so set end_of_input. */
+ if (src_data.input_frames < BUFFER_LEN / channels)
+ src_data.end_of_input = SF_TRUE ;
+ } ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\nError : %s\n", src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ /* Terminate if done. */
+ if (src_data.end_of_input && src_data.output_frames_gen == 0)
+ break ;
+
+ max = apply_gain (src_data.data_out, src_data.output_frames_gen, channels, max, *gain) ;
+
+ /* Write output. */
+ sf_writef_float (outfile, output, src_data.output_frames_gen) ;
+ output_count += src_data.output_frames_gen ;
+
+ src_data.data_in += src_data.input_frames_used * channels ;
+ src_data.input_frames -= src_data.input_frames_used ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (max > 1.0)
+ { *gain = 1.0 / max ;
+ printf ("\nOutput has clipped. Restarting conversion to prevent clipping.\n\n") ;
+ return -1 ;
+ } ;
+
+ return output_count ;
+} /* sample_rate_convert */
+
+static double
+apply_gain (float * data, long frames, int channels, double max, double gain)
+{
+ long k ;
+
+ for (k = 0 ; k < frames * channels ; k++)
+ { data [k] *= gain ;
+
+ if (fabs (data [k]) > max)
+ max = fabs (data [k]) ;
+ } ;
+
+ return max ;
+} /* apply_gain */
+
+static void
+usage_exit (const char *progname)
+{ char lsf_ver [128] ;
+ const char *cptr ;
+ int k ;
+
+ if ((cptr = strrchr (progname, '/')) != NULL)
+ progname = cptr + 1 ;
+
+ if ((cptr = strrchr (progname, '\\')) != NULL)
+ progname = cptr + 1 ;
+
+
+ sf_command (NULL, SFC_GET_LIB_VERSION, lsf_ver, sizeof (lsf_ver)) ;
+
+ printf ("\n"
+ " A Sample Rate Converter using libsndfile for file I/O and Secret \n"
+ " Rabbit Code (aka libsamplerate) for performing the conversion.\n"
+ " It works on any file format supported by libsndfile with any \n"
+ " number of channels (limited only by host memory).\n"
+ "\n"
+ " %s\n"
+ " %s\n"
+ "\n"
+ " Usage : \n"
+		"       %s -to <new sample rate> [-c <number>] <input file> <output file>\n"
+		"       %s -by <amount> [-c <number>] <input file> <output file>\n"
+ "\n", src_get_version (), lsf_ver, progname, progname) ;
+
+ puts (
+ " The optional -c argument allows the converter type to be chosen from\n"
+ " the following list :"
+ "\n"
+ ) ;
+
+ for (k = 0 ; (cptr = src_get_name (k)) != NULL ; k++)
+ printf (" %d : %s%s\n", k, cptr, k == DEFAULT_CONVERTER ? " (default)" : "") ;
+
+ puts ("") ;
+
+ exit (1) ;
+} /* usage_exit */
+
+/*==============================================================================
+*/
+
+#else /* (HAVE_SNDFILE == 0) */
+
+/* Alternative main function when libsndfile is not available. */
+
+int
+main (void)
+{ puts (
+ "\n"
+ "****************************************************************\n"
+ " This example program was compiled without libsndfile \n"
+ " (http://www.mega-nerd.com/libsndfile/).\n"
+ " It is therefore completely broken and non-functional.\n"
+ "****************************************************************\n"
+ "\n"
+ ) ;
+
+ return 0 ;
+} /* main */
+
+#endif
+
diff --git a/soxr/lsr-tests/snr_bw_test.c b/soxr/lsr-tests/snr_bw_test.c
new file mode 100644
index 0000000..55130b4
--- /dev/null
+++ b/soxr/lsr-tests/snr_bw_test.c
@@ -0,0 +1,401 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include "config.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+
+#if (HAVE_FFTW3)
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN 50000
+#define MAX_FREQS 4
+#define MAX_RATIOS 6
+#define MAX_SPEC_LEN (1<<15)
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846264338
+#endif
+
+enum
+{ BOOLEAN_FALSE = 0,
+ BOOLEAN_TRUE = 1
+} ;
+
+typedef struct
+{ int freq_count ;
+ double freqs [MAX_FREQS] ;
+
+ double src_ratio ;
+ int pass_band_peaks ;
+
+ double snr ;
+ double peak_value ;
+} SINGLE_TEST ;
+
+typedef struct
+{ int converter ;
+ int tests ;
+ int do_bandwidth_test ;
+ SINGLE_TEST test_data [10] ;
+} CONVERTER_TEST ;
+
+static double snr_test (SINGLE_TEST *snr_test_data, int number, int converter, int verbose) ;
+static double find_peak (float *output, int output_len) ;
+static double bandwidth_test (int converter, int verbose) ;
+
+int
+main (int argc, char *argv [])
+{ CONVERTER_TEST snr_test_data [] =
+ {
+ { SRC_ZERO_ORDER_HOLD,
+ 8,
+ BOOLEAN_FALSE,
+ { { 1, { 0.01111111111 }, 3.0, 1, 28.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.6, 1, 36.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.3, 1, 36.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.0, 1, 150.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.001, 1, 38.0, 1.0 },
+ { 2, { 0.011111, 0.324 }, 1.9999, 2, 14.0, .96 },
+ { 2, { 0.012345, 0.457 }, 0.456789, 1, 12.0, .96 },
+ { 1, { 0.3511111111 }, 1.33, 1, 10.0, 1.0 }
+ }
+ },
+
+ { SRC_LINEAR,
+ 8,
+ BOOLEAN_FALSE,
+ { { 1, { 0.01111111111 }, 3.0, 1, 73.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.6, 1, 73.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.3, 1, 73.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.0, 1, 150.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.001, 1, 77.0, 1.0 },
+ { 2, { 0.011111, 0.324 }, 1.9999, 2, 16.0, 0.96 },
+ { 2, { 0.012345, 0.457 }, 0.456789, 1, 26.0, 0.96 },
+ { 1, { 0.3511111111 }, 1.33, 1, 14.4, 0.99 }
+ }
+ },
+
+ { SRC_SINC_FASTEST,
+ 9,
+ BOOLEAN_TRUE,
+ { { 1, { 0.01111111111 }, 3.0, 1, 100.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.6, 1, 99.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.3, 1, 100.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.0, 1, 150.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.001, 1, 100.0, 1.0 },
+ { 2, { 0.011111, 0.324 }, 1.9999, 2, 97.0, 1.0 },
+ { 2, { 0.012345, 0.457 }, 0.456789, 1, 100.0, 0.5 },
+ { 2, { 0.011111, 0.45 }, 0.6, 1, 97.0, 0.5 },
+ { 1, { 0.3511111111 }, 1.33, 1, 97.0, 1.0 }
+ }
+ },
+
+ { SRC_SINC_MEDIUM_QUALITY,
+ 9,
+ BOOLEAN_TRUE,
+ { { 1, { 0.01111111111 }, 3.0, 1, 130.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.6, 1, 132.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.3, 1, 135.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.0, 1, 155.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.001, 1, 133.0, 1.0 },
+ { 2, { 0.011111, 0.324 }, 1.9999, 2, 127.0, 1.0 },
+ { 2, { 0.012345, 0.457 }, 0.456789, 1, 124.0, 0.5 },
+ { 2, { 0.011111, 0.45 }, 0.6, 1, 126.0, 0.5 },
+ { 1, { 0.43111111111 }, 1.33, 1, 121.0, 1.0 }
+ }
+ },
+
+ { SRC_SINC_BEST_QUALITY,
+ 9,
+ BOOLEAN_TRUE,
+ { { 1, { 0.01111111111 }, 3.0, 1, 147.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.6, 1, 147.0, 1.0 },
+ { 1, { 0.01111111111 }, 0.3, 1, 147.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.0, 1, 155.0, 1.0 },
+ { 1, { 0.01111111111 }, 1.001, 1, 146.0, 1.0 },
+ { 2, { 0.011111, 0.324 }, 1.9999, 2, 147.0, 1.0 },
+ { 2, { 0.012345, 0.457 }, 0.456789, 1, 148.0, 0.5 },
+ { 2, { 0.011111, 0.45 }, 0.6, 1, 145.0, 0.5 },
+ { 1, { 0.43111111111 }, 1.33, 1, 145.0, 1.0 }
+ }
+ },
+ } ; /* snr_test_data */
+
+ double best_snr, snr, freq3dB ;
+ int j, k, converter, verbose = 0 ;
+
+ if (argc == 2 && strcmp (argv [1], "--verbose") == 0)
+ verbose = 1 ;
+
+ puts ("") ;
+
+ for (j = 0 ; j < ARRAY_LEN (snr_test_data) ; j++)
+ { best_snr = 5000.0 ;
+
+ converter = snr_test_data [j].converter ;
+
+ printf (" Converter %d : %s\n", converter, src_get_name (converter)) ;
+ printf (" %s\n", src_get_description (converter)) ;
+
+ for (k = 0 ; k < snr_test_data [j].tests ; k++)
+ { snr = snr_test (&(snr_test_data [j].test_data [k]), k, converter, verbose) ;
+ if (best_snr > snr)
+ best_snr = snr ;
+ } ;
+
+ printf (" Worst case Signal-to-Noise Ratio : %.2f dB.\n", best_snr) ;
+
+ if (snr_test_data [j].do_bandwidth_test == BOOLEAN_FALSE)
+		{	puts ("    Bandwidth test not performed on this converter.\n") ;
+ continue ;
+ }
+
+ freq3dB = bandwidth_test (converter, verbose) ;
+
+ printf (" Measured -3dB rolloff point : %5.2f %%.\n\n", freq3dB) ;
+ } ;
+
+ return 0 ;
+} /* main */
+
+/*==============================================================================
+*/
+
+static double
+snr_test (SINGLE_TEST *test_data, int number, int converter, int verbose)
+{ static float data [BUFFER_LEN + 1] ;
+ static float output [MAX_SPEC_LEN] ;
+
+ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+
+ double output_peak, snr ;
+ int k, output_len, input_len, error ;
+
+ if (verbose != 0)
+ { printf ("\tSignal-to-Noise Ratio Test %d.\n"
+ "\t=====================================\n", number) ;
+ printf ("\tFrequencies : [ ") ;
+ for (k = 0 ; k < test_data->freq_count ; k++)
+ printf ("%6.4f ", test_data->freqs [k]) ;
+
+ printf ("]\n\tSRC Ratio : %8.4f\n", test_data->src_ratio) ;
+ }
+ else
+ { printf ("\tSignal-to-Noise Ratio Test %d : ", number) ;
+ fflush (stdout) ;
+ } ;
+
+ /* Set up the output array. */
+ if (test_data->src_ratio >= 1.0)
+ { output_len = MAX_SPEC_LEN ;
+ input_len = (int) ceil (MAX_SPEC_LEN / test_data->src_ratio) ;
+ if (input_len > BUFFER_LEN)
+ input_len = BUFFER_LEN ;
+ }
+ else
+ { input_len = BUFFER_LEN ;
+ output_len = (int) ceil (BUFFER_LEN * test_data->src_ratio) ;
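+		/* Round the output length down to a multiple of 16. */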
+ output_len &= ((-1) << 4) ;
+ if (output_len > MAX_SPEC_LEN)
+ output_len = MAX_SPEC_LEN ;
+ input_len = (int) ceil (output_len / test_data->src_ratio) ;
+ } ;
+
+ memset (output, 0, sizeof (output)) ;
+
+ /* Generate input data array. */
+ gen_windowed_sines (test_data->freq_count, test_data->freqs, 1.0, data, input_len) ;
+
+ /* Perform sample rate conversion. */
+ if ((src_state = src_new (converter, 1, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new() failed : %s.\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 1 ; /* Only one buffer worth of input. */
+
+ src_data.data_in = data ;
+ src_data.input_frames = input_len ;
+
+ src_data.src_ratio = test_data->src_ratio ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = output_len ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (verbose != 0)
+ printf ("\tOutput Len : %ld\n", src_data.output_frames_gen) ;
+
+ if (abs (src_data.output_frames_gen - output_len) > 4)
+ { printf ("\n\nLine %d : output data length should be %d.\n\n", __LINE__, output_len) ;
+ exit (1) ;
+ } ;
+
+ /* Check output peak. */
+ output_peak = find_peak (output, src_data.output_frames_gen) ;
+
+ if (verbose != 0)
+ printf ("\tOutput Peak : %6.4f\n", output_peak) ;
+
+ if (fabs (output_peak - test_data->peak_value) > 0.01)
+ { printf ("\n\nLine %d : output peak (%6.4f) should be %6.4f\n\n", __LINE__, output_peak, test_data->peak_value) ;
+ save_oct_float ("snr_test.dat", data, BUFFER_LEN, output, output_len) ;
+ exit (1) ;
+ } ;
+
+ /* Calculate signal-to-noise ratio. */
+ snr = calculate_snr (output, src_data.output_frames_gen, test_data->pass_band_peaks) ;
+
+ if (snr < 0.0)
+ { /* An error occurred. */
+ save_oct_float ("snr_test.dat", data, BUFFER_LEN, output, src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ if (verbose != 0)
+ printf ("\tSNR Ratio : %.2f dB\n", snr) ;
+
+ if (snr < test_data->snr)
+ { printf ("\n\nLine %d : SNR (%5.2f) should be > %6.2f dB\n\n", __LINE__, snr, test_data->snr) ;
+ exit (1) ;
+ } ;
+
+ if (verbose != 0)
+ puts ("\t-------------------------------------\n\tPass\n") ;
+ else
+ puts ("Pass") ;
+
+ return snr ;
+} /* snr_test */
+
+static double
+find_peak (float *data, int len)
+{ double peak = 0.0 ;
+ int k = 0 ;
+
+ for (k = 0 ; k < len ; k++)
+ if (fabs (data [k]) > peak)
+ peak = fabs (data [k]) ;
+
+ return peak ;
+} /* find_peak */
+
+
+static double
+find_attenuation (double freq, int converter, int verbose)
+{ static float input [BUFFER_LEN] ;
+ static float output [2 * BUFFER_LEN] ;
+
+ SRC_DATA src_data ;
+ double output_peak ;
+ int error ;
+
+ gen_windowed_sines (1, &freq, 1.0, input, BUFFER_LEN) ;
+
+ src_data.end_of_input = 1 ; /* Only one buffer worth of input. */
+
+ src_data.data_in = input ;
+ src_data.input_frames = BUFFER_LEN ;
+
+ src_data.src_ratio = 1.999 ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) ;
+
+ if ((error = src_simple (&src_data, converter, 1)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ output_peak = find_peak (output, ARRAY_LEN (output)) ;
+
+ if (verbose)
+ printf ("\tFreq : %6f InPeak : %6f OutPeak : %6f Atten : %6.2f dB\n",
+ freq, 1.0, output_peak, 20.0 * log10 (1.0 / output_peak)) ;
+
+ return 20.0 * log10 (1.0 / output_peak) ;
+} /* find_attenuation */
+
+static double
+bandwidth_test (int converter, int verbose)
+{ double f1, f2, a1, a2 ;
+ double freq, atten ;
+
+ f1 = 0.35 ;
+ a1 = find_attenuation (f1, converter, verbose) ;
+
+ f2 = 0.495 ;
+ a2 = find_attenuation (f2, converter, verbose) ;
+
+ if (a1 > 3.0 || a2 < 3.0)
+ { printf ("\n\nLine %d : cannot bracket 3dB point.\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
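+	/* Bisect the bracketing interval until the two attenuations are within 1 dB of each other. */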
+ while (a2 - a1 > 1.0)
+ { freq = f1 + 0.5 * (f2 - f1) ;
+ atten = find_attenuation (freq, converter, verbose) ;
+
+ if (atten < 3.0)
+ { f1 = freq ;
+ a1 = atten ;
+ }
+ else
+ { f2 = freq ;
+ a2 = atten ;
+ } ;
+ } ;
+
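+	/* Linearly interpolate to the 3 dB point, then scale so Nyquist (freq == 0.5) maps to 100 %. */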
+ freq = f1 + (3.0 - a1) * (f2 - f1) / (a2 - a1) ;
+
+ return 200.0 * freq ;
+} /* bandwidth_test */
+
+#else /* (HAVE_FFTW3) == 0 */
+
+/* Alternative main function when FFTW is not available. */
+
+int
+main (void)
+{ puts ("\n"
+ "****************************************************************\n"
+ " This test cannot be run without FFTW (http://www.fftw.org/).\n"
+ " Both the real and the complex versions of the library are\n"
+ " required.") ;
+ puts ("****************************************************************\n") ;
+
+ return 0 ;
+} /* main */
+
+#endif
+
diff --git a/soxr/lsr-tests/termination_test.c b/soxr/lsr-tests/termination_test.c
new file mode 100644
index 0000000..6bb0fc0
--- /dev/null
+++ b/soxr/lsr-tests/termination_test.c
@@ -0,0 +1,339 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define SHORT_BUFFER_LEN 2048
+#define LONG_BUFFER_LEN ((1 << 16) - 20)
+
+static void simple_test (int converter) ;
+static void stream_test (int converter, double ratio) ;
+static void init_term_test (int converter, double ratio) ;
+
+static int next_block_length (int reset) ;
+
+int
+main (void)
+{ static double src_ratios [] =
+ { 0.999900, 1.000100, 0.789012, 1.200000, 0.333333, 3.100000,
+ 0.125000, 8.000000, 0.099900, 9.990000, 0.100000, 10.00000
+ } ;
+
+ int k ;
+
+ puts ("\n Zero Order Hold interpolator:") ;
+
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ init_term_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ;
+ puts ("") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ stream_test (SRC_ZERO_ORDER_HOLD, src_ratios [k]) ;
+
+
+ puts ("\n Linear interpolator:") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ init_term_test (SRC_LINEAR, src_ratios [k]) ;
+ puts ("") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ stream_test (SRC_LINEAR, src_ratios [k]) ;
+
+
+ puts ("\n Sinc interpolator:") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ init_term_test (SRC_SINC_FASTEST, src_ratios [k]) ;
+ puts ("") ;
+ for (k = 0 ; k < ARRAY_LEN (src_ratios) ; k++)
+ stream_test (SRC_SINC_FASTEST, src_ratios [k]) ;
+
+ puts ("") ;
+
+ simple_test (SRC_SINC_FASTEST) ;
+
+ return 0 ;
+} /* main */
+
+static void
+simple_test (int converter)
+{
+ int ilen = 199030, olen = 1000, error ;
+
+ {
+ float in [ilen] ;
+ float out [olen] ;
+ double ratio = (1.0 * olen) / ilen ;
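+	/* Positional initialiser ; SRC_DATA field order is data_in, data_out, input_frames, output_frames, input_frames_used, output_frames_gen, end_of_input, src_ratio. */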
+ SRC_DATA src_data =
+ { in, out,
+ ilen, olen,
+ 0, 0, 0,
+ ratio
+ } ;
+
+ error = src_simple (&src_data, converter, 1) ;
+ if (error)
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+ } ;
+
+ return ;
+} /* simple_test */
+
+static void
+init_term_test (int converter, double src_ratio)
+{ static float input [SHORT_BUFFER_LEN], output [SHORT_BUFFER_LEN] ;
+
+ SRC_DATA src_data ;
+
+ int k, input_len, output_len, error, terminate ;
+
+ printf ("\tinit_term_test (SRC ratio = %7.4f) .......... ", src_ratio) ;
+ fflush (stdout) ;
+
+	/* Calculate maximum input and output lengths. */
+ if (src_ratio >= 1.0)
+ { output_len = SHORT_BUFFER_LEN ;
+ input_len = (int) floor (SHORT_BUFFER_LEN / src_ratio) ;
+ }
+ else
+ { input_len = SHORT_BUFFER_LEN ;
+ output_len = (int) floor (SHORT_BUFFER_LEN * src_ratio) ;
+ } ;
+
+ /* Reduce input_len by 10 so output is longer than necessary. */
+ input_len -= 10 ;
+
+ for (k = 0 ; k < ARRAY_LEN (input) ; k++)
+ input [k] = 1.0 ;
+
+ if (output_len > SHORT_BUFFER_LEN)
+ { printf ("\n\nLine %d : output_len > SHORT_BUFFER_LEN\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
+ src_data.data_in = input ;
+ src_data.input_frames = input_len ;
+
+ src_data.src_ratio = src_ratio ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = SHORT_BUFFER_LEN ;
+
+ if ((error = src_simple (&src_data, converter, 1)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ terminate = (int) ceil ((src_ratio >= 1.0) ? 1 : 1.0 / src_ratio) ;
+
+ if (fabs (src_ratio * input_len - src_data.output_frames_gen) > terminate)
+ { printf ("\n\nLine %d : Bad output frame count.\n\n", __LINE__) ;
+ printf ("\tterminate : %d\n", terminate) ;
+ printf ("\tsrc_ratio : %.4f\n", src_ratio) ;
+ printf ("\tinput_len : %d\n"
+ "\tinput_len * src_ratio : %f\n", input_len, input_len * src_ratio) ;
+ printf ("\toutput_frames_gen : %ld\n\n", src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ if (abs (src_data.input_frames_used - input_len) > 1)
+ { printf ("\n\nLine %d : input_frames_used should be %d, is %ld.\n\n",
+ __LINE__, input_len, src_data.input_frames_used) ;
+ printf ("\tsrc_ratio : %.4f\n", src_ratio) ;
+ printf ("\tinput_len : %d\n\tinput_used : %ld\n\n", input_len, src_data.input_frames_used) ;
+ exit (1) ;
+ } ;
+
+ if (fabs (output [0]) < 0.1)
+ { printf ("\n\nLine %d : First output sample is bad.\n\n", __LINE__) ;
+ printf ("\toutput [0] == %f\n\n", output [0]) ;
+ exit (1) ;
+ }
+
+ puts ("ok") ;
+
+ return ;
+} /* init_term_test */
+
+static void
+stream_test (int converter, double src_ratio)
+{ static float input [LONG_BUFFER_LEN], output [LONG_BUFFER_LEN] ;
+
+ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+
+ int input_len, output_len, current_in, current_out ;
+ int k, error, terminate ;
+
+ printf ("\tstream_test (SRC ratio = %7.4f) .......... ", src_ratio) ;
+ fflush (stdout) ;
+
+	/* Fill the input with a linear ramp so any dropped or repeated samples are easy to spot. */
+	for (k = 0 ; k < LONG_BUFFER_LEN ; k++)
+		input [k] = k * 1.0 ;
+
+	/* Calculate maximum input and output lengths. */
+ if (src_ratio >= 1.0)
+ { output_len = LONG_BUFFER_LEN ;
+ input_len = (int) floor (LONG_BUFFER_LEN / src_ratio) ;
+ }
+ else
+ { input_len = LONG_BUFFER_LEN ;
+ output_len = (int) floor (LONG_BUFFER_LEN * src_ratio) ;
+ } ;
+
+ /* Reduce input_len by 10 so output is longer than necessary. */
+ input_len -= 20 ;
+
+ if (output_len > LONG_BUFFER_LEN)
+ { printf ("\n\nLine %d : output_len > LONG_BUFFER_LEN\n\n", __LINE__) ;
+ exit (1) ;
+ } ;
+
+ current_in = current_out = 0 ;
+
+ /* Perform sample rate conversion. */
+ if ((src_state = src_new (converter, 1, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new() failed : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 0 ; /* Set this later. */
+
+ src_data.data_in = input ;
+
+ src_data.src_ratio = src_ratio ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) / 10 ;
+
+ terminate = 1 + (int) ceil ((src_ratio >= 1.0) ? src_ratio : 1.0 / src_ratio) ;
+
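+	/* Stream variable-sized blocks through the converter, checking the frame-count invariants after every call. */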
+ while (1)
+ {
+ src_data.input_frames = next_block_length (0) ;
+ src_data.input_frames = MIN (src_data.input_frames, input_len - current_in) ;
+
+ src_data.output_frames = ARRAY_LEN (output) - current_out ;
+ /*-Erik MIN (src_data.output_frames, output_len - current_out) ;-*/
+
+ src_data.end_of_input = (current_in >= input_len) ? 1 : 0 ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ printf (" src_data.input_frames : %ld\n", src_data.input_frames) ;
+ printf (" src_data.output_frames : %ld\n\n", src_data.output_frames) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.end_of_input && src_data.output_frames_gen == 0)
+ break ;
+
+ if (src_data.input_frames_used > src_data.input_frames)
+ { printf ("\n\nLine %d : input_frames_used > input_frames\n\n", __LINE__) ;
+ printf (" src_data.input_frames : %ld\n", src_data.input_frames) ;
+ printf (" src_data.input_frames_used : %ld\n", src_data.input_frames_used) ;
+ printf (" src_data.output_frames : %ld\n", src_data.output_frames) ;
+ printf (" src_data.output_frames_gen : %ld\n\n", src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.input_frames_used < 0)
+ { printf ("\n\nLine %d : input_frames_used (%ld) < 0\n\n", __LINE__, src_data.input_frames_used) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.output_frames_gen < 0)
+ { printf ("\n\nLine %d : output_frames_gen (%ld) < 0\n\n", __LINE__, src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ current_in += src_data.input_frames_used ;
+ current_out += src_data.output_frames_gen ;
+
+ if (current_in > input_len + terminate)
+ { printf ("\n\nLine %d : current_in (%d) > input_len (%d + %d)\n\n", __LINE__, current_in, input_len, terminate) ;
+ exit (1) ;
+ } ;
+
+ if (current_out > output_len)
+ { printf ("\n\nLine %d : current_out (%d) > output_len (%d)\n\n", __LINE__, current_out, output_len) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.input_frames_used > input_len)
+ { printf ("\n\nLine %d : input_frames_used (%ld) > %d\n\n", __LINE__, src_data.input_frames_used, input_len) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.output_frames_gen > output_len)
+ { printf ("\n\nLine %d : output_frames_gen (%ld) > %d\n\n", __LINE__, src_data.output_frames_gen, output_len) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.data_in == NULL && src_data.output_frames_gen == 0)
+ break ;
+
+
+ src_data.data_in += src_data.input_frames_used ;
+ src_data.data_out += src_data.output_frames_gen ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ if (fabs (current_out - src_ratio * input_len) > terminate)
+ { printf ("\n\nLine %d : bad output data length %d should be %2.1f +/- %d.\n", __LINE__,
+ current_out, src_ratio * input_len, terminate) ;
+ printf ("\tsrc_ratio : %.4f\n", src_ratio) ;
+ printf ("\tinput_len : %d\n\tinput_used : %d\n", input_len, current_in) ;
+ printf ("\toutput_len : %d\n\toutput_gen : %d\n\n", output_len, current_out) ;
+ exit (1) ;
+ } ;
+
+ if (current_in != input_len)
+ { printf ("\n\nLine %d : unused input.\n", __LINE__) ;
+ printf ("\tinput_len : %d\n", input_len) ;
+ printf ("\tinput_frames_used : %d\n\n", current_in) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* stream_test */
+
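+/* Cycle through a table of block lengths so successive calls return irregular chunk sizes. */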
+static int
+next_block_length (int reset)
+{ static int block_lengths [] = /* Should be an odd length. */
+ { /*-2, 500, 5, 400, 10, 300, 20, 200, 50, 100, 70 -*/
+ 5, 400, 10, 300, 20, 200, 50, 100, 70
+ } ;
+ static int block_len_index = 0 ;
+
+ if (reset)
+ block_len_index = 0 ;
+ else
+ block_len_index = (block_len_index + 1) % ARRAY_LEN (block_lengths) ;
+
+ return block_lengths [block_len_index] ;
+} /* next_block_length */
+
diff --git a/soxr/lsr-tests/throughput_test.c b/soxr/lsr-tests/throughput_test.c
new file mode 100644
index 0000000..28b6fe5
--- /dev/null
+++ b/soxr/lsr-tests/throughput_test.c
@@ -0,0 +1,212 @@
+/*
+** Copyright (C) 2004-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "config.h"
+
+#include "util.h"
+#include "float_cast.h"
+
+#define BUFFER_LEN (1<<16)
+
+static float input [BUFFER_LEN] ;
+static float output [BUFFER_LEN] ;
+
+static long
+throughput_test (int converter, long best_throughput)
+{ SRC_DATA src_data ;
+ clock_t start_time, clock_time ;
+ double duration ;
+ long total_frames = 0, throughput ;
+ int error ;
+
+ printf (" %-30s ", src_get_name (converter)) ;
+ fflush (stdout) ;
+
+ src_data.data_in = input ;
+ src_data.input_frames = ARRAY_LEN (input) ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) ;
+
+ src_data.src_ratio = 0.99 ;
+
+ sleep (2) ;
+
+ start_time = clock () ;
+
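+	/* Keep converting the same buffer until at least 3 seconds of CPU
+	** time have elapsed; throughput is then total frames generated
+	** divided by the elapsed CPU time. */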
+ do
+ {
+ if ((error = src_simple (&src_data, converter, 1)) != 0)
+ { puts (src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ total_frames += src_data.output_frames_gen ;
+
+ clock_time = clock () - start_time ;
+ duration = (1.0 * clock_time) / CLOCKS_PER_SEC ;
+ }
+ while (duration < 3.0) ;
+
+ if (src_data.input_frames_used != ARRAY_LEN (input))
+ { printf ("\n\nLine %d : input frames used %ld should be %d\n", __LINE__, src_data.input_frames_used, ARRAY_LEN (input)) ;
+ exit (1) ;
+ } ;
+
+ if (fabs (src_data.src_ratio * src_data.input_frames_used - src_data.output_frames_gen) > 2)
+ { printf ("\n\nLine %d : input / output length mismatch.\n\n", __LINE__) ;
+ printf (" input len : %d\n", ARRAY_LEN (input)) ;
+ printf (" output len : %ld (should be %g +/- 2)\n\n", src_data.output_frames_gen,
+ floor (0.5 + src_data.src_ratio * src_data.input_frames_used)) ;
+ exit (1) ;
+ } ;
+
+ throughput = lrint (floor (total_frames / duration)) ;
+
+ if (best_throughput == 0)
+ { best_throughput = MAX (throughput, best_throughput) ;
+ printf ("%5.2f %10ld\n", duration, throughput) ;
+ }
+ else
+ { best_throughput = MAX (throughput, best_throughput) ;
+ printf ("%5.2f %10ld %10ld\n", duration, throughput, best_throughput) ;
+ }
+
+
+ return best_throughput ;
+} /* throughput_test */
+
+static void
+single_run (void)
+{
+
+ printf ("\n CPU name : %s\n", get_cpu_name ()) ;
+
+ puts (
+ "\n"
+ " Converter Duration Throughput\n"
+ " -----------------------------------------------------------"
+ ) ;
+
+ throughput_test (SRC_ZERO_ORDER_HOLD, 0) ;
+ throughput_test (SRC_LINEAR, 0) ;
+ throughput_test (SRC_SINC_FASTEST, 0) ;
+ throughput_test (SRC_SINC_MEDIUM_QUALITY, 0) ;
+ throughput_test (SRC_SINC_BEST_QUALITY, 0) ;
+
+ puts ("") ;
+ return ;
+} /* single_run */
+
+static void
+multi_run (int run_count)
+{ long zero_order_hold = 0, linear = 0 ;
+ long sinc_fastest = 0, sinc_medium = 0, sinc_best = 0 ;
+ int k ;
+
+ puts (
+ "\n"
+ " Converter Duration Throughput Best Throughput\n"
+ " --------------------------------------------------------------------------------"
+ ) ;
+
+ for (k = 0 ; k < run_count ; k++)
+ { zero_order_hold = throughput_test (SRC_ZERO_ORDER_HOLD, zero_order_hold) ;
+ linear = throughput_test (SRC_LINEAR, linear) ;
+ sinc_fastest = throughput_test (SRC_SINC_FASTEST, sinc_fastest) ;
+ sinc_medium = throughput_test (SRC_SINC_MEDIUM_QUALITY, sinc_medium) ;
+ sinc_best = throughput_test (SRC_SINC_BEST_QUALITY, sinc_best) ;
+
+ puts ("") ;
+
+ /* Let the CPU cool down. We might be running on a laptop. */
+ sleep (10) ;
+ } ;
+
+ printf ("\n CPU name : %s\n", get_cpu_name ()) ;
+
+ puts (
+ "\n"
+ " Converter Best Throughput\n"
+ " ------------------------------------------------"
+ ) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_ZERO_ORDER_HOLD), zero_order_hold) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_LINEAR), linear) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_FASTEST), sinc_fastest) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_MEDIUM_QUALITY), sinc_medium) ;
+ printf (" %-30s %10ld\n", src_get_name (SRC_SINC_BEST_QUALITY), sinc_best) ;
+
+ puts ("") ;
+} /* multi_run */
+
+static void
+usage_exit (const char * argv0)
+{ const char * cptr ;
+
+ if ((cptr = strrchr (argv0, '/')) != NULL)
+ argv0 = cptr ;
+
+ printf (
+ "Usage :\n"
+ " %s - Single run of the throughput test.\n"
+ " %s --best-of N - Do N runs of test a print bext result.\n"
+ "\n",
+ argv0, argv0) ;
+
+ exit (0) ;
+} /* usage_exit */
+
+int
+main (int argc, char ** argv)
+{ double freq ;
+
+ memset (input, 0, sizeof (input)) ;
+ freq = 0.01 ;
+ gen_windowed_sines (1, &freq, 1.0, input, BUFFER_LEN) ;
+
+ if (argc == 1)
+ single_run () ;
+ else if (argc == 3 && strcmp (argv [1], "--best-of") == 0)
+ { int run_count = atoi (argv [2]) ;
+
+ if (run_count < 1 || run_count > 20)
+ { printf ("Please be sensible. Run count should be in range (1, 10].\n") ;
+ exit (1) ;
+ } ;
+
+ multi_run (run_count) ;
+ }
+ else
+ usage_exit (argv [0]) ;
+
+ puts (
+ " Duration is in seconds.\n"
+ " Throughput is in samples/sec (more is better).\n"
+ ) ;
+
+ return 0 ;
+} /* main */
+
diff --git a/soxr/lsr-tests/util.c b/soxr/lsr-tests/util.c
new file mode 100644
index 0000000..fefcaf2
--- /dev/null
+++ b/soxr/lsr-tests/util.c
@@ -0,0 +1,230 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <ctype.h>
+
+#include "util.h"
+
+#ifndef M_PI
+#define M_PI 3.14159265358979323846264338
+#endif
+
+void
+gen_windowed_sines (int freq_count, const double *freqs, double max, float *output, int output_len)
+{ int k, freq ;
+ double amplitude, phase ;
+
+ amplitude = max / freq_count ;
+
+ for (k = 0 ; k < output_len ; k++)
+ output [k] = 0.0 ;
+
+ for (freq = 0 ; freq < freq_count ; freq++)
+ { phase = 0.9 * M_PI / freq_count ;
+
+ if (freqs [freq] <= 0.0 || freqs [freq] >= 0.5)
+ { printf ("\n%s : Error : freq [%d] == %g is out of range. Should be < 0.5.\n", __FILE__, freq, freqs [freq]) ;
+ exit (1) ;
+ } ;
+
+ for (k = 0 ; k < output_len ; k++)
+ output [k] += amplitude * sin (freqs [freq] * (2 * k) * M_PI + phase) ;
+ } ;
+
+ /* Apply Hanning Window. */
+ for (k = 0 ; k < output_len ; k++)
+ output [k] *= 0.5 - 0.5 * cos ((2 * k) * M_PI / (output_len - 1)) ;
+
+ /* data [k] *= 0.3635819 - 0.4891775 * cos ((2 * k) * M_PI / (output_len - 1))
+ + 0.1365995 * cos ((4 * k) * M_PI / (output_len - 1))
+ - 0.0106411 * cos ((6 * k) * M_PI / (output_len - 1)) ;
+ */
+
+ return ;
+} /* gen_windowed_sines */
+
+void
+save_oct_float (char *filename, float *input, int in_len, float *output, int out_len)
+{ FILE *file ;
+ int k ;
+
+ printf ("Dumping input and output data to file : %s.\n\n", filename) ;
+
+ if (! (file = fopen (filename, "w")))
+ return ;
+
+ fprintf (file, "# Not created by Octave\n") ;
+
+ fprintf (file, "# name: input\n") ;
+ fprintf (file, "# type: matrix\n") ;
+ fprintf (file, "# rows: %d\n", in_len) ;
+ fprintf (file, "# columns: 1\n") ;
+
+ for (k = 0 ; k < in_len ; k++)
+ fprintf (file, "% g\n", input [k]) ;
+
+ fprintf (file, "# name: output\n") ;
+ fprintf (file, "# type: matrix\n") ;
+ fprintf (file, "# rows: %d\n", out_len) ;
+ fprintf (file, "# columns: 1\n") ;
+
+ for (k = 0 ; k < out_len ; k++)
+ fprintf (file, "% g\n", output [k]) ;
+
+ fclose (file) ;
+ return ;
+} /* save_oct_float */
+
+void
+save_oct_double (char *filename, double *input, int in_len, double *output, int out_len)
+{ FILE *file ;
+ int k ;
+
+ printf ("Dumping input and output data to file : %s.\n\n", filename) ;
+
+ if (! (file = fopen (filename, "w")))
+ return ;
+
+ fprintf (file, "# Not created by Octave\n") ;
+
+ fprintf (file, "# name: input\n") ;
+ fprintf (file, "# type: matrix\n") ;
+ fprintf (file, "# rows: %d\n", in_len) ;
+ fprintf (file, "# columns: 1\n") ;
+
+ for (k = 0 ; k < in_len ; k++)
+ fprintf (file, "% g\n", input [k]) ;
+
+ fprintf (file, "# name: output\n") ;
+ fprintf (file, "# type: matrix\n") ;
+ fprintf (file, "# rows: %d\n", out_len) ;
+ fprintf (file, "# columns: 1\n") ;
+
+ for (k = 0 ; k < out_len ; k++)
+ fprintf (file, "% g\n", output [k]) ;
+
+ fclose (file) ;
+ return ;
+} /* save_oct_double */
+
+void
+interleave_data (const float *in, float *out, int frames, int channels)
+{ int fr, ch ;
+
+ for (fr = 0 ; fr < frames ; fr++)
+ for (ch = 0 ; ch < channels ; ch++)
+ out [ch + channels * fr] = in [fr + frames * ch] ;
+
+ return ;
+} /* interleave_data */
+
+void
+deinterleave_data (const float *in, float *out, int frames, int channels)
+{ int fr, ch ;
+
+ for (ch = 0 ; ch < channels ; ch++)
+ for (fr = 0 ; fr < frames ; fr++)
+ out [fr + frames * ch] = in [ch + channels * fr] ;
+
+ return ;
+} /* deinterleave_data */
+
+void
+reverse_data (float *data, int datalen)
+{ int left, right ;
+ float temp ;
+
+ left = 0 ;
+ right = datalen - 1 ;
+
+ while (left < right)
+ { temp = data [left] ;
+ data [left] = data [right] ;
+ data [right] = temp ;
+ left ++ ;
+ right -- ;
+ } ;
+
+} /* reverse_data */
+
+const char *
+get_cpu_name (void)
+{
+ const char *name = "Unknown", *search = NULL ;
+ static char buffer [512] ;
+ FILE * file = NULL ;
+ int is_pipe = 0 ;
+
+#if defined (__linux__)
+ file = fopen ("/proc/cpuinfo", "r") ;
+ search = "model name" ;
+#elif defined (__APPLE__)
+ file = popen ("/usr/sbin/system_profiler -detailLevel full SPHardwareDataType", "r") ;
+ search = "Processor Name" ;
+ is_pipe = 1 ;
+#elif defined (__FreeBSD__)
+ file = popen ("sysctl -a", "r") ;
+ search = "hw.model" ;
+ is_pipe = 1 ;
+#else
+ file = NULL ;
+#endif
+
+ if (file == NULL)
+ return name ;
+
+ if (search == NULL)
+ { printf ("Error : search is NULL in function %s.\n", __func__) ;
+ return name ;
+ } ;
+
+ while (fgets (buffer, sizeof (buffer), file) != NULL)
+ if (strstr (buffer, search))
+ { char *src, *dest ;
+
+ if ((src = strchr (buffer, ':')) != NULL)
+ { src ++ ;
+ while (isspace (src [0]))
+ src ++ ;
+ name = src ;
+
+ /* Remove consecutive spaces. */
+ src ++ ;
+ for (dest = src ; src [0] ; src ++)
+ { if (isspace (src [0]) && isspace (dest [-1]))
+ continue ;
+ dest [0] = src [0] ;
+ dest ++ ;
+ } ;
+ dest [0] = 0 ;
+ break ;
+ } ;
+ } ;
+
+ if (is_pipe)
+ pclose (file) ;
+ else
+ fclose (file) ;
+
+ return name ;
+} /* get_cpu_name */
+
diff --git a/soxr/lsr-tests/util.h b/soxr/lsr-tests/util.h
new file mode 100644
index 0000000..80b1b49
--- /dev/null
+++ b/soxr/lsr-tests/util.h
@@ -0,0 +1,50 @@
+/*
+** Copyright (C) 2002-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#define ABS(a) (((a) < 0) ? - (a) : (a))
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+#define MAX(a,b) (((a) >= (b)) ? (a) : (b))
+
+#define ARRAY_LEN(x) ((int) (sizeof (x) / sizeof ((x) [0])))
+
+void gen_windowed_sines (int freq_count, const double *freqs, double max, float *output, int output_len) ;
+
+void save_oct_float (char *filename, float *input, int in_len, float *output, int out_len) ;
+void save_oct_double (char *filename, double *input, int in_len, double *output, int out_len) ;
+
+void interleave_data (const float *in, float *out, int frames, int channels) ;
+
+void deinterleave_data (const float *in, float *out, int frames, int channels) ;
+
+void reverse_data (float *data, int datalen) ;
+
+double calculate_snr (float *data, int len, int expected_peaks) ;
+
+const char * get_cpu_name (void) ;
+
+#if OS_IS_WIN32
+/*
+** Extra Win32 hacks.
+**
+** Despite Microsoft's claim of Windows being POSIX compatible, it has '_sleep'
+** instead of 'sleep'.
+*/
+
+#define sleep _sleep
+#endif
+
diff --git a/soxr/lsr-tests/varispeed_test.c b/soxr/lsr-tests/varispeed_test.c
new file mode 100644
index 0000000..52b2f43
--- /dev/null
+++ b/soxr/lsr-tests/varispeed_test.c
@@ -0,0 +1,152 @@
+/*
+** Copyright (C) 2006-2011 Erik de Castro Lopo
+**
+** This program is free software; you can redistribute it and/or modify
+** it under the terms of the GNU General Public License as published by
+** the Free Software Foundation; either version 2 of the License, or
+** (at your option) any later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software
+** Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+*/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include <samplerate.h>
+
+#include "util.h"
+
+#define BUFFER_LEN (1 << 16)
+
+static void varispeed_test (int converter, double target_snr) ;
+
+int
+main (void)
+{
+ puts ("") ;
+ printf (" Zero Order Hold interpolator : ") ;
+ varispeed_test (SRC_ZERO_ORDER_HOLD, 10.0) ;
+
+ printf (" Linear interpolator : ") ;
+ varispeed_test (SRC_LINEAR, 10.0) ;
+
+ printf (" Sinc interpolator : ") ;
+ varispeed_test (SRC_SINC_FASTEST, 115.0) ;
+
+ puts ("") ;
+
+ return 0 ;
+} /* main */
+
+static void
+varispeed_test (int converter, double target_snr)
+{ static float input [BUFFER_LEN], output [BUFFER_LEN] ;
+ double sine_freq, snr ;
+
+ SRC_STATE *src_state ;
+ SRC_DATA src_data ;
+
+ int input_len, error ;
+
+ memset (input, 0, sizeof (input)) ;
+
+ input_len = ARRAY_LEN (input) / 2 ;
+
+ sine_freq = 0.0111 ;
+ gen_windowed_sines (1, &sine_freq, 1.0, input, input_len) ;
+
+ /* Perform sample rate conversion. */
+ if ((src_state = src_new (converter, 1, &error)) == NULL)
+ { printf ("\n\nLine %d : src_new() failed : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 1 ;
+
+ src_data.data_in = input ;
+ src_data.input_frames = input_len ;
+
+ src_data.src_ratio = 3.0 ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) ;
+
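+	/* The conversion starts at ratio 1/3 (set just below via
+	** src_set_ratio) and src_process() then sweeps the ratio towards
+	** src_data.src_ratio (3.0) across the input, exercising the
+	** varispeed code path. */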
+ if ((error = src_set_ratio (src_state, 1.0 / src_data.src_ratio)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ printf (" src_data.input_frames : %ld\n", src_data.input_frames) ;
+ printf (" src_data.output_frames : %ld\n\n", src_data.output_frames) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.input_frames_used != input_len)
+ { printf ("\n\nLine %d : unused input.\n", __LINE__) ;
+ printf ("\tinput_len : %d\n", input_len) ;
+ printf ("\tinput_frames_used : %ld\n\n", src_data.input_frames_used) ;
+ exit (1) ;
+ } ;
+
+ /* Copy the last output to the input. */
+ memcpy (input, output, sizeof (input)) ;
+ reverse_data (input, src_data.output_frames_gen) ;
+
+ if ((error = src_reset (src_state)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ src_data.end_of_input = 1 ;
+
+ src_data.data_in = input ;
+ input_len = src_data.input_frames = src_data.output_frames_gen ;
+
+ src_data.data_out = output ;
+ src_data.output_frames = ARRAY_LEN (output) ;
+
+ if ((error = src_set_ratio (src_state, 1.0 / src_data.src_ratio)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ exit (1) ;
+ } ;
+
+ if ((error = src_process (src_state, &src_data)))
+ { printf ("\n\nLine %d : %s\n\n", __LINE__, src_strerror (error)) ;
+ printf (" src_data.input_frames : %ld\n", src_data.input_frames) ;
+ printf (" src_data.output_frames : %ld\n\n", src_data.output_frames) ;
+ exit (1) ;
+ } ;
+
+ if (src_data.input_frames_used != input_len)
+ { printf ("\n\nLine %d : unused input.\n", __LINE__) ;
+ printf ("\tinput_len : %d\n", input_len) ;
+ printf ("\tinput_frames_used : %ld\n\n", src_data.input_frames_used) ;
+ exit (1) ;
+ } ;
+
+ src_state = src_delete (src_state) ;
+
+ snr = calculate_snr (output, src_data.output_frames_gen, 1) ;
+
+ if (target_snr > snr)
+ { printf ("\n\nLine %d : snr (%3.1f) does not meet target (%3.1f)\n\n", __LINE__, snr, target_snr) ;
+ save_oct_float ("varispeed.mat", input, src_data.input_frames, output, src_data.output_frames_gen) ;
+ exit (1) ;
+ } ;
+
+ puts ("ok") ;
+
+ return ;
+} /* varispeed_test */
+
diff --git a/soxr/msvc/README b/soxr/msvc/README
new file mode 100644
index 0000000..5b7f60a
--- /dev/null
+++ b/soxr/msvc/README
@@ -0,0 +1,22 @@
+SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+
+CMake is the recommended way to configure, build (as either a DLL or a static
+library), and install libsoxr for general use on MS-Windows, as on other OSs.
+
+However, building within MS Visual Studio is also possible, as exemplified by
+the accompanying files:
+
+ * soxr-config.h Pre-configured for a modern Win32 system.
+
+ * libsoxr.vcproj Builds the library as a DLL, per above.
+
+ * libsoxr.sln, Build an example exe using the above.
+ example1.vcproj
+
+The following notes apply to adaptation of these files:
+
+ * For a system without AVX support, set WITH_CR64S to 0 in
+ soxr-config.h and exclude the three files ...64s.c from the build.
+
+ * If changing libsoxr.vcproj to build a static library, then also
+ remove the preprocessor definition: SOXR_DLL.
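+
+    For example, for a system without AVX, the corresponding line in
+    soxr-config.h would change from
+
+        #define WITH_CR64S 1
+
+    to
+
+        #define WITH_CR64S 0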
diff --git a/soxr/msvc/example1.vcproj b/soxr/msvc/example1.vcproj
new file mode 100644
index 0000000..170a522
--- /dev/null
+++ b/soxr/msvc/example1.vcproj
@@ -0,0 +1,82 @@
+(82 lines of Visual Studio 2008 project XML; the markup was stripped in extraction and is not recoverable.)
diff --git a/soxr/msvc/libsoxr.sln b/soxr/msvc/libsoxr.sln
new file mode 100644
index 0000000..c1a840b
--- /dev/null
+++ b/soxr/msvc/libsoxr.sln
@@ -0,0 +1,29 @@
+
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual C++ Express 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example1", "example1.vcproj", "{CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}"
+ ProjectSection(ProjectDependencies) = postProject
+ {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB} = {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libsoxr", "libsoxr.vcproj", "{4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Release|Win32 = Release|Win32
+ Debug|Win32 = Debug|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Release|Win32.ActiveCfg = Release|Win32
+ {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Release|Win32.Build.0 = Release|Win32
+ {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Debug|Win32.ActiveCfg = Debug|Win32
+ {CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Debug|Win32.Build.0 = Debug|Win32
+ {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Release|Win32.ActiveCfg = Release|Win32
+ {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Release|Win32.Build.0 = Release|Win32
+ {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Debug|Win32.ActiveCfg = Debug|Win32
+ {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Debug|Win32.Build.0 = Debug|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/soxr/msvc/libsoxr.vcproj b/soxr/msvc/libsoxr.vcproj
new file mode 100644
index 0000000..499f895
--- /dev/null
+++ b/soxr/msvc/libsoxr.vcproj
@@ -0,0 +1,97 @@
+(97 lines of Visual Studio 2008 project XML; the markup was stripped in extraction and is not recoverable.)
diff --git a/soxr/msvc/soxr-config.h b/soxr/msvc/soxr-config.h
new file mode 100644
index 0000000..74415e2
--- /dev/null
+++ b/soxr/msvc/soxr-config.h
@@ -0,0 +1,30 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+/* N.B. Pre-configured for modern MS-Windows systems. However, the normal
+ * procedure is to use the cmake configuration and build system. See INSTALL. */
+
+#if !defined soxr_config_included
+#define soxr_config_included
+
+#define AVCODEC_FOUND 0
+#define AVUTIL_FOUND 0
+#define WITH_PFFFT 1
+
+#define HAVE_FENV_H 1
+#define HAVE_STDBOOL_H 1
+#define HAVE_STDINT_H 1
+#define HAVE_LRINT 1
+#define HAVE_BIGENDIAN 0
+
+#define WITH_CR32 1
+#define WITH_CR32S 1
+#define WITH_CR64 1
+#define WITH_CR64S 1
+#define WITH_VR32 1
+
+#define WITH_HI_PREC_CLOCK 1
+#define WITH_FLOAT_STD_PREC_CLOCK 0
+#define WITH_DEV_TRACE 1
+
+#endif
diff --git a/soxr/multi-arch b/soxr/multi-arch
new file mode 100644
index 0000000..288b578
--- /dev/null
+++ b/soxr/multi-arch
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+rm -f CMakeCache.txt # Prevent interference from any in-tree build
+
+j=-j4
+build=Release
+
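+# Each list entry is <compiler>:<system>; an empty <system> denotes a native
+# build, for which ctest is also run (see the [ /$system = / ] test below).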
+for n in \
+ cc: \
+ clang: \
+ arm-linux-gnueabi-gcc:Linux \
+ x86_64-w64-mingw32-gcc:Windows \
+ i686-w64-mingw32-gcc:Windows \
+ ; do
+ compiler=$(echo $n | sed 's/:.*//')
+ system=$(echo $n | sed 's/.*://')
+ dir=$build-$compiler
+  which $compiler > /dev/null || { echo $compiler not found; continue; }
+  (
+ echo "***" $dir
+ mkdir -p $dir
+ cd $dir
+ cmake -DCMAKE_BUILD_TYPE=$build -DCMAKE_C_COMPILER=$compiler -DCMAKE_SYSTEM_NAME="$system" -DBUILD_SHARED_LIBS=OFF -DWITH_OPENMP=OFF ..
+ make $j && [ /$system = / ] && ctest -j || true
+ cd tests
+ ../../tests/throughput-test && SOXR_THROUGHPUT_GAIN=.6 ../../tests/throughput-test 2 3 || true
+ )
+done
diff --git a/soxr/soxr-config.h.in b/soxr/soxr-config.h.in
index 227bcfd..00b3b45 100644
--- a/soxr/soxr-config.h.in
+++ b/soxr/soxr-config.h.in
@@ -1,46 +1,27 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#if !defined soxr_config_included
#define soxr_config_included
-#define HAVE_SINGLE_PRECISION @HAVE_SINGLE_PRECISION@
-#define HAVE_DOUBLE_PRECISION @HAVE_DOUBLE_PRECISION@
-#define HAVE_AVFFT @HAVE_AVFFT@
-#define HAVE_SIMD @HAVE_SIMD@
-#define HAVE_FENV_H @HAVE_FENV_H@
-#define HAVE_LRINT @HAVE_LRINT@
-#define WORDS_BIGENDIAN @WORDS_BIGENDIAN@
+#cmakedefine01 AVCODEC_FOUND
+#cmakedefine01 AVUTIL_FOUND
+#cmakedefine01 WITH_PFFFT
-#include <limits.h>
+#cmakedefine01 HAVE_FENV_H
+#cmakedefine01 HAVE_STDBOOL_H
+#cmakedefine01 HAVE_STDINT_H
+#cmakedefine01 HAVE_LRINT
+#cmakedefine01 HAVE_BIGENDIAN
-#undef bool
-#undef false
-#undef true
-#define bool int
-#define false 0
-#define true 1
+#cmakedefine01 WITH_CR32
+#cmakedefine01 WITH_CR32S
+#cmakedefine01 WITH_CR64
+#cmakedefine01 WITH_CR64S
+#cmakedefine01 WITH_VR32
-#undef int16_t
-#undef int32_t
-#undef int64_t
-#undef uint32_t
-#undef uint64_t
-#define int16_t short
-#if LONG_MAX > 2147483647L
- #define int32_t int
- #define int64_t long
-#elif LONG_MAX < 2147483647L
-#error this library requires that 'long int' has at least 32-bits
-#else
- #define int32_t long
- #if defined _MSC_VER
- #define int64_t __int64
- #else
- #define int64_t long long
- #endif
-#endif
-#define uint32_t unsigned int32_t
-#define uint64_t unsigned int64_t
+#cmakedefine01 WITH_HI_PREC_CLOCK
+#cmakedefine01 WITH_FLOAT_STD_PREC_CLOCK
+#cmakedefine01 WITH_DEV_TRACE
#endif
diff --git a/soxr/src/CMakeLists.txt b/soxr/src/CMakeLists.txt
index cd41aa7..bb01a0d 100644
--- a/soxr/src/CMakeLists.txt
+++ b/soxr/src/CMakeLists.txt
@@ -1,4 +1,4 @@
-# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
@@ -7,90 +7,89 @@
if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/vr-coefs.h)
include_directories(${CMAKE_CURRENT_BINARY_DIR})
- set_property(SOURCE vr32.c APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h)
+ set_property(SOURCE vr32.c
+ APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h)
add_executable (vr-coefs vr-coefs.c)
+ target_link_libraries (vr-coefs ${LIBM_LIBRARIES})
ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h
COMMAND vr-coefs > ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h
DEPENDS vr-coefs)
endif ()
-# Minimalist boo configuration:
-add_definitions (${PROJECT_C_FLAGS} -DSOXR_LIB -DSOXR_SILENT=1)
-include (CheckFunctionExists)
-include (CheckIncludeFiles)
-set(WITH_LSR_BINDINGS OFF)
-set(WITH_SINGLE_PRECISION ON)
-set(WITH_DOUBLE_PRECISION OFF)
-set(WITH_SIMD ON)
-set(HAVE_SINGLE_PRECISION "1")
-set(HAVE_DOUBLE_PRECISION "0")
-set(HAVE_AVFFT "0")
-set(HAVE_SIMD "1")
-check_function_exists (lrint HAVE_LRINT)
-if(NOT HAVE_LRINT)
- set(HAVE_LRINT "0")
-endif()
-check_include_files (fenv.h HAVE_FENV_H)
-if(NOT HAVE_FENV_H)
- set(HAVE_FENV_H "0")
-endif()
-set(WORDS_BIGENDIAN "0")
+add_definitions (${PROJECT_C_FLAGS} -DSOXR_LIB)
+
-configure_file (
- ${CMAKE_CURRENT_SOURCE_DIR}/../soxr-config.h.in
- ${CMAKE_CURRENT_BINARY_DIR}/soxr-config.h)
-include_directories (${CMAKE_CURRENT_BINARY_DIR})
# Libsoxr configuration:
-set (RDFT32 fft4g32.c)
-if (WITH_AVFFT AND AVCODEC_FOUND)
- set (RDFT32 avfft32.c)
- set (RDFT32S avfft32s.c)
+set (RDFT32 fft4g32)
+if (AVCODEC_FOUND)
+ set (RDFT32 avfft32)
+ set (RDFT32S avfft32s)
elseif (WITH_PFFFT)
- #set (RDFT32 pffft32.c)
- set (RDFT32S pffft32s.c)
-elseif (WITH_SIMD)
- set (RDFT32S fft4g32s.c)
+ #set (RDFT32 pffft32)
+ set (RDFT32S pffft32s)
+elseif (WITH_CR32S)
+ set (RDFT32S fft4g32s)
+ if (NOT WITH_CR32)
+ list (APPEND RDFT32S fft4g32)
+ endif ()
endif ()
-if (WITH_DOUBLE_PRECISION)
- set (DP_SOURCES rate64.c)
+set (SOURCES ${PROJECT_NAME}.c data-io)
+
+if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S)
+ list (APPEND SOURCES dbesi0 filter fft4g64 cr)
endif ()
-if (WITH_SINGLE_PRECISION)
- set (SP_SOURCES rate32.c ${RDFT32})
+if (WITH_CR32)
+ list (APPEND SOURCES cr32 ${RDFT32})
endif ()
-if (HAVE_SIMD)
- set (SIMD_SOURCES rate32s.c vr32s.c ${RDFT32S} simd.c)
- foreach (source ${SIMD_SOURCES})
- set_property (SOURCE ${source} PROPERTY COMPILE_FLAGS ${SIMD_C_FLAGS})
+if (WITH_CR64)
+ list (APPEND SOURCES cr64)
+endif ()
+
+if (WITH_VR32)
+ list (APPEND SOURCES vr32)
+endif ()
+
+if (WITH_CR32S)
+ foreach (source cr32s ${RDFT32S} util32s)
+ list (APPEND SOURCES ${source})
+ set_property (SOURCE ${source}
+ APPEND_STRING PROPERTY COMPILE_FLAGS ${SIMD32_C_FLAGS})
+ endforeach ()
+endif ()
+
+if (WITH_CR64S)
+ foreach (source cr64s pffft64s util64s)
+ list (APPEND SOURCES ${source})
+ set_property (SOURCE ${source}
+ APPEND_STRING PROPERTY COMPILE_FLAGS ${SIMD64_C_FLAGS})
endforeach ()
-else ()
- set (SIMD_SOURCES vr32.c)
endif ()
# Libsoxr:
-add_library (soxr ${LIB_TYPE} soxr.c data-io.c dbesi0.c filter.c fft4g64.c
- ${SP_SOURCES} ${DP_SOURCES} ${SIMD_SOURCES})
-set_target_properties (soxr PROPERTIES
+add_library (${PROJECT_NAME} ${LIB_TYPE} ${SOURCES})
+target_link_libraries (${PROJECT_NAME} PRIVATE ${LIBS} ${LIBM_LIBRARIES})
+set_target_properties (${PROJECT_NAME} PROPERTIES
VERSION "${SO_VERSION}"
SOVERSION ${SO_VERSION_MAJOR}
INSTALL_NAME_DIR ${LIB_INSTALL_DIR}
LINK_INTERFACE_LIBRARIES ""
- PUBLIC_HEADER "soxr.h")
+ PUBLIC_HEADER "${PROJECT_NAME}.h")
if (BUILD_FRAMEWORK)
- set_target_properties (soxr PROPERTIES FRAMEWORK TRUE)
+ set_target_properties (${PROJECT_NAME} PROPERTIES FRAMEWORK TRUE)
elseif (NOT WIN32)
-# set (TARGET_PCS ${CMAKE_CURRENT_BINARY_DIR}/soxr.pc)
-# configure_file (${CMAKE_CURRENT_SOURCE_DIR}/soxr.pc.in ${TARGET_PCS})
-# install (FILES ${CMAKE_CURRENT_BINARY_DIR}/soxr.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
+ set (TARGET_PCS ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc)
+ configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}.pc.in ${TARGET_PCS})
+ install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
endif ()
@@ -98,11 +97,11 @@ endif ()
# LSR bindings:
if (WITH_LSR_BINDINGS)
- set (LSR soxr-lsr)
+ set (LSR ${PROJECT_NAME}-lsr)
set (LSR_SO_VERSION 0.1.9)
set (LSR_SO_VERSION_MAJOR 0)
- add_library (${LSR} ${LIB_TYPE} lsr)
- target_link_libraries (${LSR} soxr)
+ add_library (${LSR} ${LIB_TYPE} ${LSR})
+ target_link_libraries (${LSR} ${PROJECT_NAME})
set_target_properties (${LSR} PROPERTIES
VERSION "${LSR_SO_VERSION}"
SOVERSION ${LSR_SO_VERSION_MAJOR}
@@ -112,9 +111,9 @@ if (WITH_LSR_BINDINGS)
if (BUILD_FRAMEWORK)
set_target_properties (${LSR} PROPERTIES FRAMEWORK TRUE)
elseif (NOT WIN32)
-# set (TARGET_PCS "${TARGET_PCS} ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc")
-# configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc)
-# install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
+ set (TARGET_PCS "${TARGET_PCS} ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc")
+ configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc)
+ install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
endif ()
endif ()
@@ -122,29 +121,9 @@ endif ()
# Installation (from build from source):
-#install (TARGETS soxr ${LSR}
-# FRAMEWORK DESTINATION ${FRAMEWORK_INSTALL_DIR}
-# LIBRARY DESTINATION ${LIB_INSTALL_DIR}
-# RUNTIME DESTINATION ${BIN_INSTALL_DIR}
-# ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
-# PUBLIC_HEADER DESTINATION ${INCLUDE_INSTALL_DIR})
-
-
-
-# Packaging (for unix-like distributions):
-
-#get_property (LIB1 TARGET soxr PROPERTY LOCATION)
-#if (BUILD_SHARED_LIBS)
-# set (LIB1 ${LIB1}.${SO_VERSION_MAJOR} ${LIB1}.${SO_VERSION})
-#endif ()
-#list (APPEND TARGET_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/soxr.h")
-#if (WITH_LSR_BINDINGS)
-# get_property (LIB2 TARGET ${LSR} PROPERTY LOCATION)
-# if (BUILD_SHARED_LIBS)
-# set (LIB2 ${LIB2}.${LSR_SO_VERSION_MAJOR} ${LIB2}.${LSR_SO_VERSION})
-# endif ()
-# list (APPEND TARGET_HEADERS "${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.h")
-#endif ()
-#set (TARGET_LIBS ${LIB1} ${LIB2})
-#configure_file (${CMAKE_CURRENT_SOURCE_DIR}/libsoxr.src.in ${CMAKE_CURRENT_BINARY_DIR}/libsoxr.src)
-#configure_file (${CMAKE_CURRENT_SOURCE_DIR}/libsoxr-dev.src.in ${CMAKE_CURRENT_BINARY_DIR}/libsoxr-dev.src)
+install (TARGETS ${PROJECT_NAME} ${LSR}
+ FRAMEWORK DESTINATION ${FRAMEWORK_INSTALL_DIR}
+ LIBRARY DESTINATION ${LIB_INSTALL_DIR}
+ RUNTIME DESTINATION ${BIN_INSTALL_DIR}
+ ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
+ PUBLIC_HEADER DESTINATION ${INCLUDE_INSTALL_DIR})
diff --git a/soxr/src/aliases.h b/soxr/src/aliases.h
index eb42bdc..d1a392f 100644
--- a/soxr/src/aliases.h
+++ b/soxr/src/aliases.h
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#if defined SOXR_LIB
@@ -18,8 +18,10 @@
#define lsx_dfst_f _soxr_dfst_f
#define lsx_dfst _soxr_dfst
#define lsx_fir_to_phase _soxr_fir_to_phase
+#define lsx_f_resp _soxr_f_resp
#define lsx_init_fft_cache_f _soxr_init_fft_cache_f
#define lsx_init_fft_cache _soxr_init_fft_cache
+#define lsx_inv_f_resp _soxr_inv_f_resp
#define lsx_kaiser_beta _soxr_kaiser_beta
#define lsx_kaiser_params _soxr_kaiser_params
#define lsx_make_lpf _soxr_make_lpf
diff --git a/soxr/src/avfft32.c b/soxr/src/avfft32.c
index 5be13d2..c3096aa 100644
--- a/soxr/src/avfft32.c
+++ b/soxr/src/avfft32.c
@@ -1,27 +1,33 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#include <math.h>
+#include <stdlib.h>
#include <libavcodec/avfft.h>
#include "filter.h"
+#include "rdft_t.h"
static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);}
static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);}
static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;}
static int multiplier(void) {return 2;}
static void nothing(void) {}
+static int flags(void) {return 0;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32_cb[] = {
- (fn_t)forward_setup,
- (fn_t)backward_setup,
- (fn_t)av_rdft_end,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)_soxr_ordered_convolve_f,
- (fn_t)_soxr_ordered_partial_convolve_f,
- (fn_t)multiplier,
- (fn_t)nothing,
+rdft_cb_table _soxr_rdft32_cb = {
+ forward_setup,
+ backward_setup,
+ av_rdft_end,
+ rdft,
+ rdft,
+ rdft,
+ rdft,
+ _soxr_ordered_convolve_f,
+ _soxr_ordered_partial_convolve_f,
+ multiplier,
+ nothing,
+ malloc,
+ calloc,
+ free,
+ flags,
};
diff --git a/soxr/src/avfft32s.c b/soxr/src/avfft32s.c
index 75e485e..2944144 100644
--- a/soxr/src/avfft32s.c
+++ b/soxr/src/avfft32s.c
@@ -3,25 +3,30 @@
#include <math.h>
#include <libavcodec/avfft.h>
-#include "simd.h"
+#include "util32s.h"
+#include "rdft_t.h"
static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);}
static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);}
-static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;}
+static void rdft(int length, void * setup, void * H, void * scratch) {av_rdft_calc(setup, H); (void)length; (void)scratch;}
static int multiplier(void) {return 2;}
-static void nothing(void) {}
+static void nothing2(int u1, void *u2, void *u3, void *u4) {(void)u1; (void)u2; (void)u3; (void)u4;}
+static int flags(void) {return RDFT_IS_SIMD;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32s_cb[] = {
- (fn_t)forward_setup,
- (fn_t)backward_setup,
- (fn_t)av_rdft_end,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)rdft,
- (fn_t)_soxr_ordered_convolve_simd,
- (fn_t)_soxr_ordered_partial_convolve_simd,
- (fn_t)multiplier,
- (fn_t)nothing,
+rdft_cb_table _soxr_rdft32s_cb = {
+ forward_setup,
+ backward_setup,
+ av_rdft_end,
+ rdft,
+ rdft,
+ rdft,
+ rdft,
+ ORDERED_CONVOLVE_SIMD,
+ ORDERED_PARTIAL_CONVOLVE_SIMD,
+ multiplier,
+ nothing2,
+ SIMD_ALIGNED_MALLOC,
+ SIMD_ALIGNED_CALLOC,
+ SIMD_ALIGNED_FREE,
+ flags,
};
diff --git a/soxr/src/cb_t.h b/soxr/src/cb_t.h
new file mode 100644
index 0000000..d78ebd7
--- /dev/null
+++ b/soxr/src/cb_t.h
@@ -0,0 +1,26 @@
+/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+* Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+typedef struct {
+ void * (*input)(void *, void * samples, size_t n);
+ void (*process)(void *, size_t);
+ void const * (*output)(void *, void * samples, size_t * n);
+ void (*flush)(void *);
+ void (*close)(void *);
+ double (*delay)(void *);
+ void (*sizes)(size_t * shared, size_t * channel);
+ char const * (*create)(void * channel, void * shared, double io_ratio, void * q_spec, void * r_spec, double scale);
+ void (*set_io_ratio)(void *, double io_ratio, size_t len);
+ char const * (*id)(void);
+} control_block_t;
+
+#define resampler_input p->control_block.input
+#define resampler_process p->control_block.process
+#define resampler_output p->control_block.output
+#define resampler_flush p->control_block.flush
+#define resampler_close p->control_block.close
+#define resampler_delay p->control_block.delay
+#define resampler_sizes p->control_block.sizes
+#define resampler_create p->control_block.create
+#define resampler_set_io_ratio p->control_block.set_io_ratio
+#define resampler_id p->control_block.id
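+
+/* Each constant-rate core (e.g. cr-core.c) exports a control_block_t of its
+ * entry points; the resampler_* macros above dispatch through the copy held
+ * in a per-channel struct, assuming a pointer `p` to it is in scope at each
+ * use site. */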
diff --git a/soxr/src/ccrw2.h b/soxr/src/ccrw2.h
index b42185b..09331a4 100644
--- a/soxr/src/ccrw2.h
+++ b/soxr/src/ccrw2.h
@@ -3,8 +3,8 @@
/* Concurrent Control with "Readers" and "Writers", P.J. Courtois et al, 1971 */
-#if !defined ccrw2_included
-#define ccrw2_included
+#if !defined soxr_ccrw2_included
+#define soxr_ccrw2_included
#if defined SOXR_LIB
#include "internal.h"
diff --git a/soxr/src/cr-core.c b/soxr/src/cr-core.c
new file mode 100644
index 0000000..5355de3
--- /dev/null
+++ b/soxr/src/cr-core.c
@@ -0,0 +1,316 @@
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details.
+ *
+ * Constant-rate resampling engine-specific code. */
+
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "filter.h"
+
+#if defined SOXR_LIB
+ #include "internal.h"
+ #include "cr.h"
+ #if CORE_TYPE & CORE_DBL
+ typedef double sample_t;
+ #if CORE_TYPE & CORE_SIMD_DFT
+ #define RDFT_CB _soxr_rdft64s_cb
+ #else
+ #define RDFT_CB _soxr_rdft64_cb
+ #endif
+ #else
+ typedef float sample_t;
+ #if CORE_TYPE & CORE_SIMD_DFT
+ #define RDFT_CB _soxr_rdft32s_cb
+ #else
+ #define RDFT_CB _soxr_rdft32_cb
+ #endif
+ #endif
+
+ #if CORE_TYPE & (CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT)
+ #if CORE_TYPE & CORE_DBL
+ #include "util64s.h"
+ #include "dev64s.h"
+ #else
+ #include "util32s.h"
+ #include "dev32s.h"
+ #endif
+ #endif
+
+ extern rdft_cb_table RDFT_CB;
+#else
+ #define RDFT_CB 0
+#endif
+
+
+
+static void cubic_stage_fn(stage_t * p, fifo_t * output_fifo)
+{
+ sample_t const * input = stage_read_p(p);
+ int num_in = min(stage_occupancy(p), p->input_size);
+ int i, max_num_out = 1 + (int)(num_in * p->out_in_ratio);
+ sample_t * output = fifo_reserve(output_fifo, max_num_out);
+
+ for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
+ sample_t const * s = input + p->at.integer;
+ double x = p->at.fraction * (1 / MULT32);
+ double b = .5*(s[1]+s[-1])-*s, a = (1/6.)*(s[2]-s[1]+s[-1]-*s-4*b);
+ double c = s[1]-*s-a-b;
+ output[i] = (sample_t)(p->mult * (((a*x + b)*x + c)*x + *s));
+ }
+ assert(max_num_out - i >= 0);
+ fifo_trim_by(output_fifo, max_num_out - i);
+ fifo_read(&p->fifo, p->at.integer, NULL);
+ p->at.integer = 0;
+}
+
+
+
+#if defined __AVX__
+ #define DEFINED_AVX 1
+#else
+ #define DEFINED_AVX 0
+#endif
+
+#if defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86
+ #define DEFINED_X86 1
+#else
+ #define DEFINED_X86 0
+#endif
+
+#if defined __arm__
+ #define DEFINED_ARM 1
+#else
+ #define DEFINED_ARM 0
+#endif
+
+
+
+#if CORE_TYPE & CORE_DBL
+ #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_AVX)
+ #define SIMD_SSE 0
+#else
+ #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_X86)
+ #define SIMD_AVX 0
+#endif
+
+#define SIMD_NEON ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_ARM)
+
+
+
+#include "half-coefs.h"
+
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+#define FUNCTION_H h7
+#define CONVOLVE ____ __ _
+#include "half-fir.h"
+#endif
+
+#define FUNCTION_H h8
+#define CONVOLVE ____ ____
+#include "half-fir.h"
+
+#define FUNCTION_H h9
+#define CONVOLVE ____ ____ _
+#include "half-fir.h"
+
+#if CORE_TYPE & CORE_DBL
+ #define FUNCTION_H h10
+ #define CONVOLVE ____ ____ __
+ #include "half-fir.h"
+
+ #define FUNCTION_H h11
+ #define CONVOLVE ____ ____ __ _
+ #include "half-fir.h"
+
+ #define FUNCTION_H h12
+ #define CONVOLVE ____ ____ ____
+ #include "half-fir.h"
+
+ #define FUNCTION_H h13
+ #define CONVOLVE ____ ____ ____ _
+ #include "half-fir.h"
+#endif
+
+static half_fir_info_t const half_firs[] = {
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+ { 7, half_fir_coefs_7 , h7 , 0 , 120.65f},
+#endif
+ { 8, half_fir_coefs_8 , h8 , 0 , 136.51f},
+ { 9, half_fir_coefs_9 , h9 , 0 , 152.32f},
+#if CORE_TYPE & CORE_DBL
+ {10, half_fir_coefs_10, h10, 0 , 168.08f},
+ {11, half_fir_coefs_11, h11, 0 , 183.79f},
+ {12, half_fir_coefs_12, h12, 0 , 199.46f},
+ {13, half_fir_coefs_13, h13, 0 , 215.12f},
+#endif
+};
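+
+/* find_half_fir() (in cr.c) selects the first entry above whose attenuation
+ * figure (the final column, in dB) meets the requested attenuation. */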
+
+#undef SIMD_AVX
+#undef SIMD_NEON
+#undef SIMD_SSE
+
+
+
+#if CORE_TYPE & CORE_DBL
+ #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_AVX)
+ #define SIMD_SSE 0
+#else
+ #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_X86)
+ #define SIMD_AVX 0
+#endif
+
+#define SIMD_NEON ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_ARM)
+
+
+
+#define COEFS (sample_t * __restrict)p->shared->poly_fir_coefs
+#define VAR_LENGTH p->n
+#define VAR_CONVOLVE(n) while (j < (n)) _
+#define VAR_POLY_PHASE_BITS p->phase_bits
+
+
+
+#define FUNCTION vpoly0
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir0.h"
+
+#define FUNCTION vpoly1
+#define COEF_INTERP 1
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+#define FUNCTION vpoly2
+#define COEF_INTERP 2
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+#define FUNCTION vpoly3
+#define COEF_INTERP 3
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+
+
+#if !(CORE_TYPE & CORE_SIMD_POLY)
+
+#define poly_fir_convolve_U100 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
+#define FUNCTION U100_0
+#define FIR_LENGTH U100_l
+#define CONVOLVE(n) poly_fir_convolve_U100
+#include "poly-fir0.h"
+
+#define u100_l 11
+#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _
+#define FUNCTION u100_0
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir0.h"
+
+#define FUNCTION u100_1
+#define COEF_INTERP 1
+#define PHASE_BITS 8
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir.h"
+
+#define FUNCTION u100_2
+#define COEF_INTERP 2
+#define PHASE_BITS 6
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir.h"
+
+#endif
+
+#define u100_1_b 8
+#define u100_2_b 6
+
+
+
+static poly_fir_t const poly_firs[] = {
+ {-1, {{0, vpoly0}, { 7.2f, vpoly1}, {5.0f, vpoly2}}},
+ {-1, {{0, vpoly0}, { 9.4f, vpoly1}, {6.7f, vpoly2}}},
+ {-1, {{0, vpoly0}, {12.4f, vpoly1}, {7.8f, vpoly2}}},
+ {-1, {{0, vpoly0}, {13.6f, vpoly1}, {9.3f, vpoly2}}},
+ {-1, {{0, vpoly0}, {10.5f, vpoly2}, {8.4f, vpoly3}}},
+ {-1, {{0, vpoly0}, {11.85f,vpoly2}, {9.0f, vpoly3}}},
+
+ {-1, {{0, vpoly0}, { 8.0f, vpoly1}, {5.3f, vpoly2}}},
+ {-1, {{0, vpoly0}, { 8.6f, vpoly1}, {5.7f, vpoly2}}},
+ {-1, {{0, vpoly0}, {10.6f, vpoly1}, {6.75f,vpoly2}}},
+ {-1, {{0, vpoly0}, {12.6f, vpoly1}, {8.6f, vpoly2}}},
+ {-1, {{0, vpoly0}, { 9.6f, vpoly2}, {7.6f, vpoly3}}},
+ {-1, {{0, vpoly0}, {11.4f, vpoly2}, {8.65f,vpoly3}}},
+
+#if CORE_TYPE & CORE_SIMD_POLY
+ {10.62f, {{0, vpoly0}, {0, 0}, {0, 0}}},
+ {-1, {{0, vpoly0}, {u100_1_b, vpoly1}, {u100_2_b, vpoly2}}},
+#else
+ {10.62f, {{U100_l, U100_0}, {0, 0}, {0, 0}}},
+ {11.28f, {{u100_l, u100_0}, {u100_1_b, u100_1}, {u100_2_b, u100_2}}},
+#endif
+ {-1, {{0, vpoly0}, { 9, vpoly1}, { 6, vpoly2}}},
+ {-1, {{0, vpoly0}, { 11, vpoly1}, { 7, vpoly2}}},
+ {-1, {{0, vpoly0}, { 13, vpoly1}, { 8, vpoly2}}},
+ {-1, {{0, vpoly0}, { 10, vpoly2}, { 8, vpoly3}}},
+ {-1, {{0, vpoly0}, { 12, vpoly2}, { 9, vpoly3}}},
+};
+
+
+
+static cr_core_t const cr_core = {
+
+#if CORE_TYPE & CORE_SIMD_POLY
+ {SIMD_ALIGNED_MALLOC, SIMD_ALIGNED_CALLOC, SIMD_ALIGNED_FREE},
+#else
+ {malloc, calloc, free},
+#endif
+ half_firs, array_length(half_firs),
+ 0, 0,
+ cubic_stage_fn,
+ poly_firs, &RDFT_CB
+};
+
+
+
+#if defined SOXR_LIB
+
+#include "soxr.h"
+
+static char const * rate_create(void * channel, void * shared, double io_ratio,
+ void * q_spec, void * r_spec, double scale)
+{
+ return _soxr_init(channel, shared, io_ratio, q_spec, r_spec, scale,
+ &cr_core, CORE_TYPE);
+}
+
+
+
+static char const * id(void) {return CORE_STR;}
+
+#include "cb_t.h"
+
+control_block_t RATE_CB = {
+ _soxr_input,
+ _soxr_process,
+ _soxr_output,
+ _soxr_flush,
+ _soxr_close,
+ _soxr_delay,
+ _soxr_sizes,
+ rate_create,
+ 0,
+ id,
+};
+
+#endif
diff --git a/soxr/src/cr.c b/soxr/src/cr.c
new file mode 100644
index 0000000..eabe700
--- /dev/null
+++ b/soxr/src/cr.c
@@ -0,0 +1,600 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details.
+ *
+ * Constant-rate resampling common code. */
+
+#include <assert.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "filter.h"
+
+#if defined SOXR_LIB
+ #include "internal.h"
+ #define STATIC
+#endif
+
+#include "cr.h"
+
+#define num_coefs4 ((core_flags&CORE_SIMD_POLY)? ((num_coefs+3)&~3) : num_coefs)
+
+#define coef_coef(C,T,x) \
+ C((T*)result, interp_order, num_coefs4, j, x, num_coefs4 - 1 - i)
+
+#define STORE(C,T) { \
+ if (interp_order > 2) coef_coef(C,T,3) = (T)d; \
+ if (interp_order > 1) coef_coef(C,T,2) = (T)c; \
+ if (interp_order > 0) coef_coef(C,T,1) = (T)b; \
+ coef_coef(C,T,0) = (T)f0;}
+
+static real * prepare_poly_fir_coefs(double const * coefs, int num_coefs,
+ int num_phases, int interp_order, double multiplier,
+ core_flags_t core_flags, alloc_t const * mem)
+{
+ int i, j, length = num_coefs4 * num_phases * (interp_order + 1);
+ real * result = mem->calloc(1,(size_t)length << LOG2_SIZEOF_REAL(core_flags));
+ double fm1 = coefs[0], f1 = 0, f2 = 0;
+
+ for (i = num_coefs - 1; i >= 0; --i)
+ for (j = num_phases - 1; j >= 0; --j) {
+ double f0 = fm1, b = 0, c = 0, d = 0; /* = 0 to kill compiler warning */
+ int pos = i * num_phases + j - 1;
+ fm1 = pos > 0 ? coefs[pos - 1] * multiplier : 0;
+ switch (interp_order) {
+ case 1: b = f1 - f0; break;
+ case 2: b = f1 - (.5 * (f2+f0) - f1) - f0; c = .5 * (f2+f0) - f1; break;
+ case 3: c=.5*(f1+fm1)-f0;d=(1/6.)*(f2-f1+fm1-f0-4*c);b=f1-f0-d-c; break;
+ default: assert(!interp_order);
+ }
+ switch (core_flags & 3) {
+ case 0: if (WITH_CR32 ) STORE(coef , float ); break;
+ case 1: if (WITH_CR64 ) STORE(coef , double); break;
+ case 2: if (WITH_CR32S) STORE(coef4, float ); break;
+ default:if (WITH_CR64S) STORE(coef4, double); break;
+ }
+ f2 = f1, f1 = f0;
+ }
+ return result;
+}
+
+#undef STORE
+#undef coef_coef
+
+#define IS_FLOAT32 (WITH_CR32 || WITH_CR32S) && \
+ (!(WITH_CR64 || WITH_CR64S) || sizeof_real == sizeof(float))
+#define WITH_FLOAT64 WITH_CR64 || WITH_CR64S
+
+static void dft_stage_fn(stage_t * p, fifo_t * output_fifo)
+{
+ real * output, * dft_out;
+ int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
+ rate_shared_t const * s = p->shared;
+ dft_filter_t const * f = &s->dft_filter[p->dft_filter_num];
+ int const overlap = f->num_taps - 1;
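+  /* Overlap-save fast convolution: a block of dft_length samples is
+   * transformed, multiplied by the filter's frequency response (f->coefs),
+   * and transformed back; the final num_taps-1 output samples overlap the
+   * next block and are trimmed away. */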
+
+ if (p->at.integer + p->L * num_in >= f->dft_length) {
+ rdft_cb_table const * const RDFT_CB = p->rdft_cb;
+ size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(p->core_flags);
+ div_t divd = div(f->dft_length - overlap - p->at.integer + p->L - 1, p->L);
+ real const * input = fifo_read_ptr(&p->fifo);
+ fifo_read(&p->fifo, divd.quot, NULL);
+ num_in -= divd.quot;
+
+ output = fifo_reserve(output_fifo, f->dft_length);
+ dft_out = (p->core_flags & CORE_SIMD_DFT)? p->dft_out : output;
+
+ if (lsx_is_power_of_2(p->L)) { /* F-domain */
+ int portion = f->dft_length / p->L;
+ memcpy(dft_out, input, (unsigned)portion * sizeof_real);
+ rdft_oforward(portion, f->dft_forward_setup, dft_out, p->dft_scratch);
+ if (IS_FLOAT32) {
+#define dft_out ((float *)dft_out)
+ for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
+ dft_out[i] = dft_out[(portion << 1) - i],
+ dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
+ dft_out[portion] = dft_out[1];
+ dft_out[portion + 1] = 0;
+ dft_out[1] = dft_out[0];
+#undef dft_out
+ }
+ else if (WITH_FLOAT64) {
+#define dft_out ((double *)dft_out)
+ for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
+ dft_out[i] = dft_out[(portion << 1) - i],
+ dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
+ dft_out[portion] = dft_out[1];
+ dft_out[portion + 1] = 0;
+ dft_out[1] = dft_out[0];
+#undef dft_out
+ }
+
+ for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) {
+ memcpy((char *)dft_out + (size_t)i * sizeof_real, dft_out, (size_t)portion * sizeof_real);
+ if (IS_FLOAT32)
+ #define dft_out ((float *)dft_out)
+ dft_out[i + 1] = 0;
+ #undef dft_out
+ else if (WITH_FLOAT64)
+ #define dft_out ((double *)dft_out)
+ dft_out[i + 1] = 0;
+ #undef dft_out
+ }
+
+ if (p->step.integer > 0) {
+ rdft_reorder_back(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
+ }
+ } else {
+ if (p->L == 1)
+ memcpy(dft_out, input, (size_t)f->dft_length * sizeof_real);
+ else {
+
+ memset(dft_out, 0, (size_t)f->dft_length * sizeof_real);
+ if (IS_FLOAT32)
+ for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
+ ((float *)dft_out)[i] = ((float *)input)[j];
+ else if (WITH_FLOAT64)
+ for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
+ ((double *)dft_out)[i] = ((double *)input)[j];
+ p->at.integer = p->L - 1 - divd.rem;
+ }
+ if (p->step.integer > 0)
+ rdft_forward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
+ else
+ rdft_oforward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
+ }
+
+ if (p->step.integer > 0) {
+ rdft_convolve(f->dft_length, f->dft_backward_setup, dft_out, f->coefs);
+ rdft_backward(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
+ if ((p->core_flags & CORE_SIMD_DFT) && p->step.integer == 1)
+ memcpy(output, dft_out, (size_t)f->dft_length * sizeof_real);
+ if (p->step.integer != 1) {
+ if (IS_FLOAT32)
+ for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
+ i += p->step.integer)
+ ((float *)output)[j] = ((float *)dft_out)[i];
+ else if (WITH_FLOAT64)
+ for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
+ i += p->step.integer)
+ ((double *)output)[j] = ((double *)dft_out)[i];
+ p->remM = i - (f->dft_length - overlap);
+ fifo_trim_by(output_fifo, f->dft_length - j);
+ }
+ else fifo_trim_by(output_fifo, overlap);
+ }
+ else { /* F-domain */
+ int m = -p->step.integer;
+ rdft_convolve_portion(f->dft_length >> m, dft_out, f->coefs);
+ rdft_obackward(f->dft_length >> m, f->dft_backward_setup, dft_out, p->dft_scratch);
+ if (p->core_flags & CORE_SIMD_DFT)
+ memcpy(output, dft_out, (size_t)(f->dft_length >> m) * sizeof_real);
+ fifo_trim_by(output_fifo, (((1 << m) - 1) * f->dft_length + overlap) >>m);
+ }
+ (rdft_cb_table const *)RDFT_CB;
+ }
+ p->input_size = (f->dft_length - p->at.integer + p->L - 1) / p->L;
+}
+
+/* Set to 4 x nearest power of 2 or half of that */
+/* if danger of causing too many cache misses. */
+static int set_dft_length(int num_taps, int min, int large)
+{
+ double d = log((double)num_taps) / log(2.);
+ return 1 << range_limit((int)(d + 2.77), min, max((int)(d + 1.77), large));
+}
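+
+/* E.g. num_taps=1000: d = log2(1000) ~ 9.97, so dft_length = 1 << 12 = 4096
+ * (4 x the nearest power of 2), unless clamped by the min/large limits. */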
+
+static void dft_stage_init(
+ unsigned instance, double Fp, double Fs, double Fn, double att,
+ double phase_response, stage_t * p, int L, int M, double * multiplier,
+ unsigned min_dft_size, unsigned large_dft_size, core_flags_t core_flags,
+ rdft_cb_table const * rdft_table)
+{
+ rdft_cb_table const * const RDFT_CB = rdft_table;
+ dft_filter_t * f = &p->shared->dft_filter[instance];
+ int num_taps = 0, dft_length = f->dft_length, i, offset;
+ bool f_domain_m = abs(3-M) == 1 && Fs <= 1;
+ size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(core_flags);
+
+ if (!dft_length) {
+ int k = phase_response == 50 && lsx_is_power_of_2(L) && Fn == L? L << 1 : 4;
+ double m, * h = lsx_design_lpf(Fp, Fs, Fn, att, &num_taps, -k, -1.);
+
+ if (phase_response != 50)
+ lsx_fir_to_phase(&h, &num_taps, &f->post_peak, phase_response);
+ else f->post_peak = num_taps / 2;
+
+ dft_length = set_dft_length(num_taps, (int)min_dft_size, (int)large_dft_size);
+ f->coefs = rdft_calloc((size_t)dft_length, sizeof_real);
+ offset = dft_length - num_taps + 1;
+ m = (1. / dft_length) * rdft_multiplier() * L * *multiplier;
+ if (IS_FLOAT32) for (i = 0; i < num_taps; ++i)
+ ((float *)f->coefs)[(i + offset) & (dft_length - 1)] =(float)(h[i] * m);
+ else if (WITH_FLOAT64) for (i = 0; i < num_taps; ++i)
+ ((double *)f->coefs)[(i + offset) & (dft_length - 1)] = h[i] * m;
+ free(h);
+ }
+
+ if (rdft_flags() & RDFT_IS_SIMD)
+ p->dft_out = rdft_malloc(sizeof_real * (size_t)dft_length);
+ if (rdft_flags() & RDFT_NEEDS_SCRATCH)
+ p->dft_scratch = rdft_malloc(2 * sizeof_real * (size_t)dft_length);
+
+ if (!f->dft_length) {
+ void * coef_setup = rdft_forward_setup(dft_length);
+ int Lp = lsx_is_power_of_2(L)? L : 1;
+ int Mp = f_domain_m? M : 1;
+ f->dft_forward_setup = rdft_forward_setup(dft_length / Lp);
+ f->dft_backward_setup = rdft_backward_setup(dft_length / Mp);
+ if (Mp == 1)
+ rdft_forward(dft_length, coef_setup, f->coefs, p->dft_scratch);
+ else
+ rdft_oforward(dft_length, coef_setup, f->coefs, p->dft_scratch);
+ rdft_delete_setup(coef_setup);
+ f->num_taps = num_taps;
+ f->dft_length = dft_length;
+ lsx_debug("fir_len=%i dft_length=%i Fp=%g Fs=%g Fn=%g att=%g %i/%i",
+ num_taps, dft_length, Fp, Fs, Fn, att, L, M);
+ }
+ *multiplier = 1;
+ p->out_in_ratio = (double)L / M;
+ p->core_flags = core_flags;
+ p->rdft_cb = rdft_table;
+ p->fn = dft_stage_fn;
+ p->preload = f->post_peak / L;
+ p->at.integer = f->post_peak % L;
+ p->L = L;
+ p->step.integer = f_domain_m? -M/2 : M;
+ p->dft_filter_num = instance;
+ p->block_len = f->dft_length - (f->num_taps - 1);
+ p->phase0 = p->at.integer / p->L;
+ p->input_size = (f->dft_length - p->at.integer + p->L - 1) / p->L;
+}
+
+static struct half_fir_info const * find_half_fir(
+ struct half_fir_info const * firs, size_t len, double att)
+{
+ size_t i;
+ for (i = 0; i + 1 < len && att > firs[i].att; ++i);
+ return &firs[i];
+}
+
+#define have_pre_stage (preM * preL != 1)
+#define have_arb_stage (arbM * arbL != 1)
+#define have_post_stage (postM * postL != 1)
+
+#include "soxr.h"
+
+STATIC char const * _soxr_init(
+ rate_t * const p, /* Per audio channel. */
+ rate_shared_t * const shared, /* By channels undergoing same rate change. */
+ double const io_ratio, /* Input rate divided by output rate. */
+ soxr_quality_spec_t const * const q_spec,
+ soxr_runtime_spec_t const * const r_spec,
+ double multiplier, /* Linear gain to apply during conversion. */
+ cr_core_t const * const core,
+ core_flags_t const core_flags)
+{
+ size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(core_flags);
+ double const tolerance = 1 + 1e-5;
+
+ double bits = q_spec->precision;
+ rolloff_t const rolloff = (rolloff_t)(q_spec->flags & 3);
+ int interpolator = (int)(r_spec->flags & 3) - 1;
+ double const Fp0 = q_spec->passband_end, Fs0 = q_spec->stopband_begin;
+ double const phase_response = q_spec->phase_response, tbw0 = Fs0-Fp0;
+
+ bool const maintain_3dB_pt = !!(q_spec->flags & SOXR_MAINTAIN_3DB_PT);
+ double tbw_tighten = 1, alpha;
+ #define tighten(x) (Fs0-(Fs0-(x))*tbw_tighten)
+
+ double arbM = io_ratio, Fn1, Fp1 = Fp0, Fs1 = Fs0, bits1 = min(bits,33);
+ double att = (bits1 + 1) * linear_to_dB(2.), attArb = att; /* +1: pass+stop */
+ int preL = 1, preM = 1, shr = 0, arbL = 1, postL = 1, postM = 1;
+ bool upsample=false, rational=false, iOpt=!(r_spec->flags&SOXR_NOSMALLINTOPT);
+ bool lq_bits= (q_spec->flags & SOXR_PROMOTE_TO_LQ)? bits <= 16 : bits == 16;
+ bool lq_Fp0 = (q_spec->flags & SOXR_PROMOTE_TO_LQ)? Fp0<=lq_bw0 : Fp0==lq_bw0;
+ int n = 0, i, mode = lq_bits && rolloff == rolloff_medium? io_ratio > 1 ||
+ phase_response != 50 || !lq_Fp0 || Fs0 != 1 : ((int)ceil(bits1) - 6) / 4;
+ struct half_fir_info const * half_fir_info;
+ stage_t * s;
+
+ if (io_ratio < 1 && Fs0 - 1 > 1 - Fp0 / tolerance)
+ return "imaging greater than rolloff";
+ if (.002 / tolerance > tbw0 || tbw0 > .5 * tolerance)
+ return "transition bandwidth not in [0.2,50] % of nyquist";
+ if (.5 / tolerance > Fp0 || Fs0 > 1.5 * tolerance)
+ return "transition band not within [50,150] % of nyquist";
+ if (bits!=0 && (15 > bits || bits > 33))
+ return "precision not in [15,33] bits";
+ if (io_ratio <= 0)
+ return "resampling factor not positive";
+ if (0 > phase_response || phase_response > 100)
+ return "phase response not in [0=min-phase,100=max-phase] %";
+
+ p->core = core;
+ p->io_ratio = io_ratio;
+ if (bits!=0) while (!n++) { /* Determine stages: */
+ int try, L, M, x, maxL = interpolator > 0? 1 : mode? 2048 :
+ (int)ceil(r_spec->coef_size_kbytes * 1000. / (U100_l * (int)sizeof_real));
+ double d, epsilon = 0, frac;
+ upsample = arbM < 1;
+  for (i = (int)(.5 * arbM), shr = 0; i >>= 1; arbM *= .5, ++shr); /* Peel off octaves of 2:1 decimation. */
+ preM = upsample || (arbM > 1.5 && arbM < 2);
+ postM = 1 + (arbM > 1 && preM), arbM /= postM;
+ preL = 1 + (!preM && arbM < 2) + (upsample && mode), arbM *= preL;
+ if ((frac = arbM - (int)arbM)!=0)
+ epsilon = fabs(floor(frac * MULT32 + .5) / (frac * MULT32) - 1);
+ for (i = 1, rational = frac==0; i <= maxL && !rational; ++i) {
+ d = frac * i, try = (int)(d + .5);
+ if ((rational = fabs(try / d - 1) <= epsilon)) { /* No long doubles! */
+ if (try == i)
+ arbM = ceil(arbM), shr += x = arbM > 3, arbM /= 1 + x;
+ else arbM = i * (int)arbM + try, arbL = i;
+ }
+ }
+ L = preL * arbL, M = (int)(arbM * postM), x = (L|M)&1, L >>= !x, M >>= !x;
+ if (iOpt && postL == 1 && (d = preL * arbL / arbM) > 4 && d != 5) {
+ for (postL = 4, i = (int)(d / 16); (i >>= 1) && postL < 256; postL <<= 1);
+ arbM = arbM * postL / arbL / preL, arbL = 1, n = 0;
+ } else if (rational && (max(L, M) < 3 + 2 * iOpt || L * M < 6 * iOpt))
+ preL = L, preM = M, arbM = arbL = postM = 1;
+ if (!mode && (!rational || !n))
+ ++mode, n = 0;
+ }
+
+ p->num_stages = shr + have_pre_stage + have_arb_stage + have_post_stage;
+ if (!p->num_stages && multiplier != 1) {
+ bits = arbL = 0; /* Use cubic_stage in this case. */
+ ++p->num_stages;
+ }
+ p->stages = calloc((size_t)p->num_stages + 1, sizeof(*p->stages));
+ if (!p->stages)
+ return "out of memory";
+ for (i = 0; i < p->num_stages; ++i) {
+ p->stages[i].num = i;
+ p->stages[i].shared = shared;
+ p->stages[i].input_size = 8192;
+ }
+ p->stages[0].is_input = true;
+
+ alpha = postM / (io_ratio * (postL << 0));
+
+ if ((n = p->num_stages) > 1) { /* Att. budget: */
+ if (have_arb_stage)
+ att += linear_to_dB(2.), attArb = att, --n;
+ att += linear_to_dB((double)n);
+ }
+
+ half_fir_info = find_half_fir(core->half_firs, core->half_firs_len, att);
+ for (i = 0, s = p->stages; i < shr; ++i, ++s) {
+ s->fn = half_fir_info->fn;
+ s->coefs = half_fir_info->coefs;
+ s->n = half_fir_info->num_coefs;
+ s->pre_post = 4 * s->n;
+ s->preload = s->pre = s->pre_post >> 1;
+ }
+
+ if (have_pre_stage) {
+ if (maintain_3dB_pt && have_post_stage) { /* Trans. bands overlapping. */
+ double x = tbw0 * lsx_inv_f_resp(-3., att);
+ x = -lsx_f_resp(x / (max(2 * alpha - Fs0, alpha) - Fp0), att);
+ if (x > .035) {
+ tbw_tighten = ((4.3074e-3 - 3.9121e-4 * x) * x - .040009) * x + 1.0014;
+ lsx_debug("tbw_tighten=%g (%gdB)", tbw_tighten, x);
+ }
+ }
+ Fn1 = preM? max(preL, preM) : arbM / arbL;
+ dft_stage_init(0, tighten(Fp1), Fs1, Fn1, att, phase_response, s++, preL,
+ max(preM, 1), &multiplier, r_spec->log2_min_dft_size,
+ r_spec->log2_large_dft_size, core_flags, core->rdft_cb);
+ Fp1 /= Fn1, Fs1 /= Fn1;
+ }
+
+ if (bits==0 && have_arb_stage) { /* `Quick' cubic arb stage: */
+ s->fn = core->cubic_stage_fn;
+ s->mult = multiplier, multiplier = 1;
+ s->step.whole = (int64_t)(arbM * MULT32 + .5);
+ s->pre_post = max(3, s->step.integer);
+ s->preload = s->pre = 1;
+ s->out_in_ratio = MULT32 / (double)s->step.whole;
+ }
+ else if (have_arb_stage) { /* Higher quality arb stage: */
+ static const float rolloffs[] = {-.01f, -.3f, 0, -.103f};
+ poly_fir_t const * f = &core->poly_firs[6*(upsample+!!preM)+mode-!upsample];
+ int order, num_coefs = (int)f->interp[0].scalar, phase_bits, phases;
+ size_t coefs_size;
+ double at, Fp = Fp1, Fs, Fn, mult = upsample? 1 : arbM / arbL;
+ poly_fir1_t const * f1;
+
+ if (!upsample && preM)
+ Fn = 2 * mult, Fs = 3 + fabs(Fs1 - 1);
+ else Fn = 1, Fs = 2 - (mode? Fp1 + (Fs1 - Fp1) * .7 : Fs1);
+
+ if (mode)
+ Fp = Fs - (Fs - Fp) / (1 - lsx_inv_f_resp(rolloffs[rolloff], attArb));
+
+ i = (interpolator < 0? !rational : max(interpolator, !rational)) - 1;
+ do {
+ f1 = &f->interp[++i];
+ assert(f1->fn);
+ if (i)
+ arbM /= arbL, arbL = 1, rational = false;
+ phase_bits = (int)ceil(f1->scalar - log(mult)/log(2.));
+ phases = !rational? (1 << phase_bits) : arbL;
+ if (f->interp[0].scalar==0) {
+ int phases0 = max(phases, 19), n0 = 0;
+ lsx_design_lpf(Fp, Fs, -Fn, attArb, &n0, phases0, f->beta);
+ num_coefs = n0 / phases0 + 1, num_coefs += num_coefs & !preM;
+ }
+ if ((num_coefs & 1) && rational && (arbL & 1))
+ phases <<= 1, arbL <<= 1, arbM *= 2;
+ at = arbL * (s->phase0 = .5 * (num_coefs & 1));
+ order = i + (i && mode > 4);
+ coefs_size = (size_t)(num_coefs4 * phases * (order+1)) * sizeof_real;
+ } while (interpolator < 0 && i < 2 && f->interp[i+1].fn &&
+ coefs_size / 1000 > r_spec->coef_size_kbytes);
+
+ if (!s->shared->poly_fir_coefs) {
+ int num_taps = num_coefs * phases - 1;
+ double * coefs = lsx_design_lpf(
+ Fp, Fs, Fn, attArb, &num_taps, phases, f->beta);
+ s->shared->poly_fir_coefs = prepare_poly_fir_coefs(
+ coefs, num_coefs, phases, order, multiplier, core_flags, &core->mem);
+ lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%.3gk",
+ num_coefs, phases, order, (double)coefs_size / 1000.);
+ free(coefs);
+ }
+ multiplier = 1;
+ s->fn = f1->fn;
+ s->pre_post = num_coefs4 - 1;
+ s->preload = ((num_coefs - 1) >> 1) + (num_coefs4 - num_coefs);
+ s->n = num_coefs4;
+ s->phase_bits = phase_bits;
+ s->L = arbL;
+ s->use_hi_prec_clock =
+ mode>1 && (q_spec->flags & SOXR_HI_PREC_CLOCK) && !rational;
+#if WITH_FLOAT_STD_PREC_CLOCK
+ if (order && !s->use_hi_prec_clock) {
+ s->at.flt = at;
+ s->step.flt = arbM;
+ s->out_in_ratio = (double)(arbL / s->step.flt);
+ } else
+#endif
+ {
+ s->at.whole = (int64_t)(at * MULT32 + .5);
+#if WITH_HI_PREC_CLOCK
+ if (s->use_hi_prec_clock) {
+ double M = arbM * MULT32;
+ s->at.fix.ls.parts.ms = 0x80000000ul;
+ s->step.whole = (int64_t)M;
+ M -= (double)s->step.whole;
+ M *= MULT32 * MULT32;
+ s->step.fix.ls.all = (uint64_t)M;
+ } else
+#endif
+ s->step.whole = (int64_t)(arbM * MULT32 + .5);
+ s->out_in_ratio = MULT32 * arbL / (double)s->step.whole;
+ }
+ ++s;
+ }
+
+ if (have_post_stage)
+ dft_stage_init(1, tighten(Fp0 / (upsample? alpha : 1)), upsample? max(2 -
+ Fs0 / alpha, 1) : Fs0, (double)max(postL, postM), att, phase_response,
+ s++, postL, postM, &multiplier, r_spec->log2_min_dft_size,
+ r_spec->log2_large_dft_size, core_flags, core->rdft_cb);
+
+ lsx_debug("%g: >>%i %i/%i %i/%g %i/%i (%x)", 1/io_ratio,
+ shr, preL, preM, arbL, arbM, postL, postM, core_flags);
+
+ for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) {
+ fifo_create(&s->fifo, (int)sizeof_real);
+ memset(fifo_reserve(&s->fifo, s->preload), 0,
+ sizeof_real * (size_t)s->preload);
+ lsx_debug_more("%5i|%-5i preload=%i remL=%i",
+ s->pre, s->pre_post-s->pre, s->preload, s->at.integer);
+ }
+  fifo_create(&s->fifo, (int)sizeof_real); /* Output fifo, drained by _soxr_output. */
+ return 0;
+}
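To make the `Determine stages' loop concrete, here is a minimal standalone
sketch of just its rational-fraction search (assumptions: no pre/post
scaling and maxL fixed at 2048, so the printed ratio is the overall one).
For 44.1 kHz -> 48 kHz the fraction locks at i = 160, giving the familiar
exact ratio 147/160:

#include <math.h>
#include <stdio.h>

#define MULT32 (65536. * 65536.)

int main(void)
{
  double arbM = 44100. / 48000, frac = arbM - (int)arbM; /* io_ratio < 1 */
  double epsilon = fabs(floor(frac * MULT32 + .5) / (frac * MULT32) - 1);
  int i, try = 0, arbL = 0;
  for (i = 1; i <= 2048 && !arbL; ++i) {
    double d = frac * i;
    try = (int)(d + .5);
    if (fabs(try / d - 1) <= epsilon) /* frac*i integral to within FP error */
      arbL = i;
  }
  printf("M/L = %i/%i\n", (int)arbM * arbL + try, arbL); /* 147/160 */
  return 0;
}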
+
+static bool stage_process(stage_t * stage, bool flushing)
+{
+ fifo_t * fifo = &stage->fifo;
+ bool done = false;
+ int want;
+ while (!done && (want = stage->input_size - fifo_occupancy(fifo)) > 0) {
+ if (stage->is_input) {
+ if (flushing)
+ memset(fifo_reserve(fifo, want), 0, fifo->item_size * (size_t)want);
+ else done = true;
+ }
+ else done = stage_process(stage - 1, flushing);
+ }
+ stage->fn(stage, &stage[1].fifo);
+ return done && fifo_occupancy(fifo) < stage->input_size;
+}
+
+STATIC void _soxr_process(void * P, size_t olen)
+{
+ rate_t *p = P;
+ int const n = p->flushing? min(-(int)p->samples_out, (int)olen) : (int)olen;
+ stage_t * stage = &p->stages[p->num_stages];
+ fifo_t * fifo = &stage->fifo;
+ bool done = false;
+ while (!done && fifo_occupancy(fifo) < (int)n)
+ done = stage->is_input || stage_process(stage - 1, p->flushing);
+}
+
+STATIC void * _soxr_input(void * P, void * samples, size_t n)
+{
+ rate_t *p = P;
+ if (p->flushing)
+ return 0;
+ p->samples_in += (int64_t)n;
+ return fifo_write(&p->stages[0].fifo, (int)n, samples);
+}
+
+STATIC void const * _soxr_output(void * P, void * samples, size_t * n0)
+{
+ rate_t *p = P;
+ fifo_t * fifo = &p->stages[p->num_stages].fifo;
+ int n = p->flushing? min(-(int)p->samples_out, (int)*n0) : (int)*n0;
+ p->samples_out += n = min(n, fifo_occupancy(fifo));
+ return fifo_read(fifo, (int)(*n0 = (size_t)n), samples);
+}
+
+STATIC void _soxr_flush(void * P)
+{
+ rate_t *p = P;
+ if (p->flushing) return;
+ p->samples_out -= (int64_t)((double)p->samples_in / p->io_ratio + .5);
+ p->samples_in = 0;
+ p->flushing = true;
+}
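The flush bookkeeping above is plain counter arithmetic: samples_out goes
negative by exactly the number of output samples the accumulated input still
owes, and _soxr_process/_soxr_output clamp their requests to -samples_out.
A worked sketch of the numbers:

#include <stdio.h>

int main(void)
{
  double io_ratio = 2.0;                /* input rate / output rate (2:1)  */
  long long samples_in = 1000, samples_out = 480; /* 480 already delivered */
  samples_out -= (long long)((double)samples_in / io_ratio + .5);
  printf("output still owed: %lld\n", -samples_out); /* 500 - 480 = 20    */
  return 0;
}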
+
+STATIC void _soxr_close(void * P)
+{
+ rate_t *p = P;
+ if (p->stages) {
+ rdft_cb_table const * const RDFT_CB = p->core->rdft_cb;
+ rate_shared_t * shared = p->stages[0].shared;
+ int i;
+
+ for (i = 0; i <= p->num_stages; ++i) {
+ stage_t * s = &p->stages[i];
+ rdft_free(s->dft_scratch);
+ rdft_free(s->dft_out);
+ fifo_delete(&s->fifo);
+ }
+ if (shared) {
+ for (i = 0; i < 2; ++i) {
+ dft_filter_t * f= &shared->dft_filter[i];
+ rdft_free(f->coefs);
+ rdft_delete_setup(f->dft_forward_setup);
+ rdft_delete_setup(f->dft_backward_setup);
+ }
+ p->core->mem.free(shared->poly_fir_coefs);
+ memset(shared, 0, sizeof(*shared));
+ }
+ free(p->stages);
+    (rdft_cb_table const *)RDFT_CB; /* No-op: keeps RDFT_CB nominally used when the rdft_* macros ignore it. */
+ }
+}
+
+#if defined SOXR_LIB
+STATIC double _soxr_delay(void * P)
+{
+ rate_t *p = P;
+ return (double)p->samples_in / p->io_ratio - (double)p->samples_out;
+}
+
+STATIC void _soxr_sizes(size_t * shared, size_t * channel)
+{
+ *shared = sizeof(rate_shared_t);
+ *channel = sizeof(rate_t);
+}
+#endif
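Taken together, the entry points above follow a simple push/pull pattern.
A hypothetical per-block driver (a sketch only, assuming the declarations
from cr.h are in scope; real callers reach these through the public
soxr_process, and the sample type depends on the core):

static void resample_block(rate_t * p, float * in, size_t n_in,
    float * out, size_t * n_out) /* in: space available; out: samples got */
{
  _soxr_input(p, in, n_in);    /* queue input in stage 0's fifo            */
  _soxr_process(p, *n_out);    /* pull demand back through the stage chain */
  _soxr_output(p, out, n_out); /* drain the last stage's fifo              */
}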
diff --git a/soxr/src/cr.h b/soxr/src/cr.h
new file mode 100644
index 0000000..880eb1d
--- /dev/null
+++ b/soxr/src/cr.h
@@ -0,0 +1,178 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_cr_included
+#define soxr_cr_included
+
+#define FIFO_SIZE_T int
+#include "fifo.h"
+
+typedef void real; /* float or double */
+struct stage;
+typedef void (* stage_fn_t)(struct stage * input, fifo_t * output);
+typedef struct half_fir_info {
+ int num_coefs;
+ real const * coefs;
+ stage_fn_t fn, dfn;
+ float att;
+} half_fir_info_t;
+typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t;
+typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t;
+
+#define U100_l 42
+#define MULT32 (65536. * 65536.)
+
+/* Conceptually: coef_p is &coefs[num_phases][fir_len][interp_order+1]: */
+#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) (coef_p)[\
+ (fir_len) * ((interp_order) + 1) * (phase_num) + \
+ ((interp_order) + 1) * (fir_coef_num) + \
+ ((interp_order) - (coef_interp_num))]
+
+/* Conceptually: coef_p is &coefs[num_phases][fir_len/4][interp_order+1][4]: */
+#define coef4(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) (coef_p)[\
+ (fir_len) * ((interp_order) + 1) * (phase_num) + \
+ ((interp_order) + 1) * ((fir_coef_num) & ~3) + \
+ 4 * ((interp_order) - (coef_interp_num)) + \
+ ((fir_coef_num) & 3)]
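The difference between the two layouts is easiest to see numerically.  A
standalone check (IDX/IDX4 are just local copies of the macros above) with
interp_order = 1, fir_len = 8, phase 0, printing the flat index of each
tap's interp-0 coefficient: coef keeps one tap's coefficients adjacent,
while coef4 regroups taps in blocks of four so one aligned SIMD load fetches
four like coefficients (taps 0-3 land at indices 4-7):

#include <stdio.h>

#define IDX(order,len,phase,j,k) \
  ((len)*((order)+1)*(phase) + ((order)+1)*(k) + ((order)-(j)))
#define IDX4(order,len,phase,j,k) \
  ((len)*((order)+1)*(phase) + ((order)+1)*((k)&~3) + 4*((order)-(j)) + ((k)&3))

int main(void)
{
  int k;
  for (k = 0; k < 8; ++k)
    printf("tap %d: coef=%2d coef4=%2d\n", k, IDX(1,8,0,0,k), IDX4(1,8,0,0,k));
  return 0;
}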
+
+typedef union { /* Int64 in parts */
+ #if HAVE_BIGENDIAN
+ struct {int32_t ms; uint32_t ls;} parts;
+ #else
+ struct {uint32_t ls; int32_t ms;} parts;
+ #endif
+ int64_t all;
+} int64p_t;
+
+typedef union { /* Uint64 in parts */
+ #if HAVE_BIGENDIAN
+ struct {uint32_t ms, ls;} parts;
+ #else
+ struct {uint32_t ls, ms;} parts;
+ #endif
+ uint64_t all;
+} uint64p_t;
+
+typedef struct {
+ int dft_length, num_taps, post_peak;
+ void * dft_forward_setup, * dft_backward_setup;
+ real * coefs;
+} dft_filter_t;
+
+typedef struct { /* So generated filter coefs may be shared between channels */
+ real * poly_fir_coefs;
+ dft_filter_t dft_filter[2];
+} rate_shared_t;
+
+typedef double float_step_t; /* Or long double or __float128. */
+
+typedef union { /* Fixed point arithmetic */
+ struct {uint64p_t ls; int64p_t ms;} fix; /* Hi-prec has ~96 bits. */
+ float_step_t flt;
+} step_t;
+
+#define integer fix.ms.parts.ms
+#define fraction fix.ms.parts.ls
+#define whole fix.ms.all
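The at/step pair accessed through these macros behaves as a 32.32 fixed-
point sample clock: `integer' (the high word of `whole') indexes the current
input sample and the low word is the fractional phase.  A standalone sketch
of four output ticks at a step of 1.5 input samples per output sample:

#include <stdint.h>
#include <stdio.h>

#define MULT32 (65536. * 65536.)

int main(void)
{
  int64_t at = 0, step = (int64_t)(1.5 * MULT32 + .5);
  int i;
  for (i = 0; i < 4; ++i, at += step)  /* input index 0,1,3,4; phase 0,.5 */
    printf("out %d <- input[%d] + phase %.2f\n",
        i, (int)(at >> 32), (double)(uint32_t)at / MULT32);
  return 0;
}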
+
+#define CORE_DBL 1
+#define CORE_SIMD_POLY 2
+#define CORE_SIMD_HALF 4
+#define CORE_SIMD_DFT 8
+#define LOG2_SIZEOF_REAL(core_flags) (2 + ((core_flags) & 1))
+
+typedef int core_flags_t;
+
+#if defined SOXR_LIB
+#include "rdft_t.h"
+#else
+typedef void fn_t;
+#endif
+
+typedef struct stage {
+ int num;
+
+ /* Common to all stage types: */
+ core_flags_t core_flags;
+ stage_fn_t fn;
+ fifo_t fifo;
+ int pre; /* Number of past samples to store */
+ int pre_post; /* pre + number of future samples to store */
+ int preload; /* Number of zero samples to pre-load the fifo */
+ double out_in_ratio; /* For buffer management. */
+ int input_size;
+ bool is_input;
+
+ /* For a stage with variable (run-time generated) filter coefs: */
+ rdft_cb_table const * rdft_cb;
+ rate_shared_t * shared;
+ unsigned dft_filter_num; /* Which, if any, of the 2 DFT filters to use */
+ real * dft_scratch;
+ float * dft_out;
+ real const * coefs;
+
+ /* For a stage with variable L/M: */
+ step_t at, step;
+ bool use_hi_prec_clock;
+ int L, remM;
+ int n, phase_bits, block_len;
+ double mult, phase0;
+} stage_t;
+
+#define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
+#define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
+
+#define lq_bw0 (1385/2048.) /* ~.67625, FP exact. */
+
+typedef enum {rolloff_small, rolloff_medium, rolloff_none} rolloff_t;
+
+typedef struct {
+ void * (* alloc)(size_t);
+ void * (* calloc)(size_t, size_t);
+ void (* free)(void *);
+} alloc_t;
+
+typedef struct {
+ alloc_t mem;
+ half_fir_info_t const * half_firs;
+ size_t half_firs_len;
+ half_fir_info_t const * doub_firs;
+ size_t doub_firs_len;
+ stage_fn_t cubic_stage_fn;
+ poly_fir_t const * poly_firs;
+ rdft_cb_table * rdft_cb;
+} cr_core_t;
+
+typedef struct rate rate_t;
+struct rate {
+ cr_core_t const * core;
+ double io_ratio;
+ int64_t samples_in, samples_out;
+ int num_stages, flushing;
+ stage_t * stages;
+};
+
+#if defined SOXR_LIB
+
+#include "soxr.h"
+
+char const * _soxr_init(
+ rate_t * const p, /* Per audio channel. */
+  rate_shared_t * const shared, /* Between channels (undergoing same rate change). */
+ double const io_ratio, /* Input rate divided by output rate. */
+ soxr_quality_spec_t const * const q_spec,
+ soxr_runtime_spec_t const * const r_spec,
+  double multiplier, /* Linear gain to apply during conversion. */
+ cr_core_t const * const core,
+ core_flags_t const);
+
+void _soxr_process(void * p, size_t olen);
+void * _soxr_input(void * p, void * samples, size_t n);
+void const * _soxr_output(void * p, void * samples, size_t * n0);
+void _soxr_flush(void * p);
+void _soxr_close(void * p);
+double _soxr_delay(void * p);
+void _soxr_sizes(size_t * shared, size_t * channel);
+#endif
+
+#endif
diff --git a/soxr/src/cr32.c b/soxr/src/cr32.c
new file mode 100644
index 0000000..b9eb264
--- /dev/null
+++ b/soxr/src/cr32.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define RATE_CB _soxr_rate32_cb
+#define CORE_STR "cr32"
+
+#define CORE_TYPE 0
+#include "cr-core.c"
diff --git a/soxr/src/cr32s.c b/soxr/src/cr32s.c
new file mode 100644
index 0000000..5de2a43
--- /dev/null
+++ b/soxr/src/cr32s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define RATE_CB _soxr_rate32s_cb
+#define CORE_STR "cr32s"
+
+#define CORE_TYPE (CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT)
+#include "cr-core.c"
diff --git a/soxr/src/cr64.c b/soxr/src/cr64.c
new file mode 100644
index 0000000..518cdd7
--- /dev/null
+++ b/soxr/src/cr64.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define RATE_CB _soxr_rate64_cb
+#define CORE_STR "cr64"
+
+#define CORE_TYPE CORE_DBL
+#include "cr-core.c"
diff --git a/soxr/src/cr64s.c b/soxr/src/cr64s.c
new file mode 100644
index 0000000..5dcd6f1
--- /dev/null
+++ b/soxr/src/cr64s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define RATE_CB _soxr_rate64s_cb
+#define CORE_STR "cr64s"
+
+#define CORE_TYPE (CORE_DBL|CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT)
+#include "cr-core.c"
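These four shims compile cr-core.c once per core variant: cr32 (float),
cr32s (float + SIMD), cr64 (double) and cr64s (double + SIMD).  The CORE_DBL
bit doubles the sample width, which is why cr.h can recover it from the
flags alone via LOG2_SIZEOF_REAL.  A standalone check of that arithmetic:

#include <stdio.h>

#define CORE_DBL 1                    /* as in cr.h */
#define LOG2_SIZEOF_REAL(core_flags) (2 + ((core_flags) & 1))

int main(void)
{
  printf("float cores: %d bytes\n", 1 << LOG2_SIZEOF_REAL(0));         /* 4 */
  printf("double cores: %d bytes\n", 1 << LOG2_SIZEOF_REAL(CORE_DBL)); /* 8 */
  return 0;
}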
diff --git a/soxr/src/data-io.c b/soxr/src/data-io.c
index 1cd8e7f..fb61675 100644
--- a/soxr/src/data-io.c
+++ b/soxr/src/data-io.c
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#include
@@ -14,8 +14,8 @@
unsigned i; \
size_t j; \
T const * src = *src0; \
- if (ch > 1) \
- for (j = 0; j < n; ++j) for (i = 0; i < ch; ++i) dest[i][j] = (DEINTERLEAVE_TO)*src++; \
+ if (ch > 1) for (j = 0; j < n; ++j) \
+ for (i = 0; i < ch; ++i) dest[i][j] = (DEINTERLEAVE_TO)*src++; \
else if (flag) memcpy(dest[0], src, n * sizeof(T)), src = &src[n]; \
else for (j = 0; j < n; dest[0][j++] = (DEINTERLEAVE_TO)*src++); \
*src0 = src; \
@@ -23,7 +23,7 @@
-#if HAVE_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
void _soxr_deinterleave(double * * dest, /* Round/clipping not needed here */
soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch)
{
@@ -40,7 +40,7 @@ void _soxr_deinterleave(double * * dest, /* Round/clipping not needed here */
-#if HAVE_SINGLE_PRECISION
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch)
{
@@ -60,35 +60,6 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#include "rint.h"
-#if HAVE_FENV_H
- #include <fenv.h>
- #define fe_test_invalid() fetestexcept(FE_INVALID)
- #define fe_clear_invalid() feclearexcept(FE_INVALID)
-#elif defined _MSC_VER
- #define FE_INVALID 1
- #if defined _WIN64
- #include <float.h>
- #define fe_test_invalid() (_statusfp() & _SW_INVALID)
- #define fe_clear_invalid _clearfp /* FIXME clears all */
- #else
- static __inline int fe_test_invalid()
- {
- short status_word;
- __asm fnstsw status_word
- return status_word & FE_INVALID;
- }
-
- static __inline int fe_clear_invalid()
- {
- int16_t status[14];
- __asm fnstenv status
- status[2] &= ~FE_INVALID;
- __asm fldenv status
- return 0;
- }
- #endif
-#endif
-
#if defined FE_INVALID && defined FPU_RINT32 && defined __STDC_VERSION__
@@ -97,13 +68,13 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#endif
#endif
-#if HAVE_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
#define FLOATX double
#define LSX_RINT_CLIP_2 lsx_rint32_clip_2
#define LSX_RINT_CLIP lsx_rint32_clip
#define RINT_CLIP rint32_clip
-#define RINT rint32
+#define RINT rint32D
#if defined FPU_RINT32
#define FPU_RINT
#endif
@@ -114,7 +85,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#define LSX_RINT_CLIP_2 lsx_rint16_clip_2
#define LSX_RINT_CLIP lsx_rint16_clip
#define RINT_CLIP rint16_clip
-#define RINT rint16
+#define RINT rint16D
#if defined FPU_RINT16
#define FPU_RINT
#endif
@@ -125,7 +96,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_dither
#define LSX_RINT_CLIP lsx_rint16_clip_dither
#define RINT_CLIP rint16_clip_dither
-#define RINT rint16
+#define RINT rint16D
#if defined FPU_RINT16
#define FPU_RINT
#endif
@@ -139,13 +110,13 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
-#if HAVE_SINGLE_PRECISION
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
#define FLOATX float
#define LSX_RINT_CLIP_2 lsx_rint32_clip_2_f
#define LSX_RINT_CLIP lsx_rint32_clip_f
#define RINT_CLIP rint32_clip_f
-#define RINT rint32
+#define RINT rint32F
#if defined FPU_RINT32
#define FPU_RINT
#endif
@@ -156,7 +127,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_f
#define LSX_RINT_CLIP lsx_rint16_clip_f
#define RINT_CLIP rint16_clip_f
-#define RINT rint16
+#define RINT rint16F
#if defined FPU_RINT16
#define FPU_RINT
#endif
@@ -167,7 +138,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_dither_f
#define LSX_RINT_CLIP lsx_rint16_clip_dither_f
#define RINT_CLIP rint16_clip_dither_f
-#define RINT rint16
+#define RINT rint16D
#if defined FPU_RINT16
#define FPU_RINT
#endif
@@ -199,7 +170,7 @@ void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
return 0; \
} while (0)
-#if HAVE_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
size_t /* clips */ _soxr_interleave(soxr_datatype_t data_type, void * * dest0,
double const * const * src, size_t n, unsigned ch, unsigned long * seed)
{
@@ -225,7 +196,7 @@ size_t /* clips */ _soxr_interleave(soxr_datatype_t data_type, void * * dest0,
}
#endif
-#if HAVE_SINGLE_PRECISION
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
size_t /* clips */ _soxr_interleave_f(soxr_datatype_t data_type, void * * dest0,
float const * const * src, size_t n, unsigned ch, unsigned long * seed)
{
diff --git a/soxr/src/dev32s.h b/soxr/src/dev32s.h
new file mode 100644
index 0000000..7edae86
--- /dev/null
+++ b/soxr/src/dev32s.h
@@ -0,0 +1,54 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_dev32s_included
+#define soxr_dev32s_included
+
+#if defined __GNUC__
+ #define SIMD_INLINE(T) static __inline T __attribute__((always_inline))
+ #define vAlign __attribute__((aligned (16)))
+#elif defined _MSC_VER
+ #define SIMD_INLINE(T) static __forceinline T
+ #define vAlign __declspec(align(16))
+#endif
+
+#if defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86
+
+#include <xmmintrin.h>
+
+#define vZero() _mm_setzero_ps()
+#define vSet1(a) _mm_set_ss(a)
+#define vMul(a,b) _mm_mul_ps(a,b)
+#define vAdd(a,b) _mm_add_ps(a,b)
+#define vMac(a,b,c) vAdd(vMul(a,b),c)
+#define vLds(a) _mm_set1_ps(a)
+#define vLd(a) _mm_load_ps(a)
+#define vLdu(a) _mm_loadu_ps(a)
+
+typedef __m128 v4_t;
+
+SIMD_INLINE(void) vStorSum(float * a, v4_t b) {
+ v4_t t = vAdd(_mm_movehl_ps(b, b), b);
+ _mm_store_ss(a, vAdd(t, _mm_shuffle_ps(t,t,1)));}
+
+#elif defined __arm__
+
+#include <arm_neon.h>
+
+#define vZero() vdupq_n_f32(0)
+#define vMul(a,b) vmulq_f32(a,b)
+#define vAdd(a,b) vaddq_f32(a,b)
+#define vMac(a,b,c) vmlaq_f32(c,a,b)
+#define vLds(a) vld1q_dup_f32(&(a))
+#define vLd(a) vld1q_f32(a)
+#define vLdu(a) vld1q_f32(a)
+
+typedef float32x4_t v4_t;
+
+SIMD_INLINE(void) vStorSum(float * a, v4_t b) {
+ float32x2_t t = vadd_f32(vget_high_f32(b), vget_low_f32(b));
+ *a = vget_lane_f32(vpadd_f32(t, t), 0);}
+
+#endif
+
+#endif
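Both vStorSum variants store the horizontal sum of the four float lanes (the
AVX version in dev64s.h below does the same for four doubles).  A scalar
reference, for checking only, not part of the library:

static void vStorSum_scalar(float * a, float const b[4])
{
  *a = (b[0] + b[2]) + (b[1] + b[3]); /* same pairing as the SSE/NEON code */
}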
diff --git a/soxr/src/dev64s.h b/soxr/src/dev64s.h
new file mode 100644
index 0000000..4672210
--- /dev/null
+++ b/soxr/src/dev64s.h
@@ -0,0 +1,42 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_dev64s_included
+#define soxr_dev64s_included
+
+#if defined __GNUC__
+ #define SIMD_INLINE(T) static __inline T __attribute__((always_inline))
+ #define vAlign __attribute__((aligned (32)))
+#elif defined _MSC_VER
+ #define SIMD_INLINE(T) static __forceinline T
+ #define vAlign __declspec(align(32))
+#else
+ #define SIMD_INLINE(T) static __inline T
+#endif
+
+#if defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86
+
+#include <immintrin.h>
+
+#if defined __AVX__
+
+#define vZero() _mm256_setzero_pd()
+#define vSet1(a) _mm256_set_pd(0,0,0,a)
+#define vMul(a,b) _mm256_mul_pd(a,b)
+#define vAdd(a,b) _mm256_add_pd(a,b)
+#define vMac(a,b,c) vAdd(vMul(a,b),c) /* Note: gcc -mfma will `fuse' these */
+#define vLds(a) _mm256_set1_pd(a)
+#define vLd(a) _mm256_load_pd(a)
+#define vLdu(a) _mm256_loadu_pd(a)
+
+typedef __m256d v4_t;
+
+SIMD_INLINE(void) vStorSum(double * a, v4_t b) {
+ b = _mm256_hadd_pd(b, _mm256_permute2f128_pd(b,b,1));
+ _mm_store_sd(a, _mm256_castpd256_pd128(_mm256_hadd_pd(b,b)));}
+
+#endif
+
+#endif
+
+#endif
diff --git a/soxr/src/fft4g.c b/soxr/src/fft4g.c
index 5fae8a6..cf6293a 100644
--- a/soxr/src/fft4g.c
+++ b/soxr/src/fft4g.c
@@ -282,22 +282,16 @@ Appendix :
*/
-#include <math.h>
+#include "math-wrap.h"
#include "fft4g.h"
#ifdef FFT4G_FLOAT
#define double float
#define one_half 0.5f
-#if defined _MSC_VER
- #define sin (float)sin
- #define cos (float)cos
- #define atan (float)atan
-#else
- #define sin sinf
- #define cos cosf
- #define atan atanf
-#endif
+ #define sin(x) sinf(x)
+ #define cos(x) cosf(x)
+ #define atan(x) atanf(x)
#define cdft lsx_cdft_f
#define rdft lsx_rdft_f
@@ -818,7 +812,7 @@ static void bitrv2(int n, int *ip0, double *a)
static void bitrv2conj(int n, int *ip0, double *a)
{
- int j, j1, k, k1, l, m, m2, ip[256];
+ int j, j1, k, k1, l, m, m2, ip[512];
double xr, xi, yr, yi;
(void)ip0;
diff --git a/soxr/src/fft4g32.c b/soxr/src/fft4g32.c
index 8741394..4e4912e 100644
--- a/soxr/src/fft4g32.c
+++ b/soxr/src/fft4g32.c
@@ -1,27 +1,38 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
+#include <stdlib.h>
#include "filter.h"
#define FFT4G_FLOAT
#include "fft4g.c"
+#include "soxr-config.h"
-static void * null(void) {return 0;}
-static void forward (int length, void * setup, double * H) {lsx_safe_rdft_f(length, 1, H); (void)setup;}
-static void backward(int length, void * setup, double * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;}
+#if WITH_CR32
+#include "rdft_t.h"
+static void * null(int u1) {(void)u1; return 0;}
+static void forward (int length, void * setup, void * H, void * scratch) {lsx_safe_rdft_f(length, 1, H); (void)setup; (void)scratch;}
+static void backward(int length, void * setup, void * H, void * scratch) {lsx_safe_rdft_f(length, -1, H); (void)setup; (void)scratch;}
static int multiplier(void) {return 2;}
-static void nothing(void) {}
+static void nothing(void *u1) {(void)u1;}
+static void nothing2(int u1, void *u2, void *u3, void *u4) {(void)u1; (void)u2; (void)u3; (void)u4;}
+static int flags(void) {return 0;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32_cb[] = {
- (fn_t)null,
- (fn_t)null,
- (fn_t)nothing,
- (fn_t)forward,
- (fn_t)forward,
- (fn_t)backward,
- (fn_t)backward,
- (fn_t)_soxr_ordered_convolve_f,
- (fn_t)_soxr_ordered_partial_convolve_f,
- (fn_t)multiplier,
- (fn_t)nothing,
+rdft_cb_table _soxr_rdft32_cb = {
+ null,
+ null,
+ nothing,
+ forward,
+ forward,
+ backward,
+ backward,
+ _soxr_ordered_convolve_f,
+ _soxr_ordered_partial_convolve_f,
+ multiplier,
+ nothing2,
+ malloc,
+ calloc,
+ free,
+ flags,
};
+
+#endif
diff --git a/soxr/src/fft4g32s.c b/soxr/src/fft4g32s.c
index 4a95a7d..c7f3772 100644
--- a/soxr/src/fft4g32s.c
+++ b/soxr/src/fft4g32s.c
@@ -2,25 +2,30 @@
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#include "filter.h"
-#include "simd.h"
+#include "util32s.h"
+#include "rdft_t.h"
static void * null(void) {return 0;}
static void nothing(void) {}
static void forward (int length, void * setup, float * H) {lsx_safe_rdft_f(length, 1, H); (void)setup;}
static void backward(int length, void * setup, float * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;}
static int multiplier(void) {return 2;}
+static int flags(void) {return RDFT_IS_SIMD;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32s_cb[] = {
- (fn_t)null,
- (fn_t)null,
- (fn_t)nothing,
- (fn_t)forward,
- (fn_t)forward,
- (fn_t)backward,
- (fn_t)backward,
- (fn_t)_soxr_ordered_convolve_simd,
- (fn_t)_soxr_ordered_partial_convolve_simd,
- (fn_t)multiplier,
- (fn_t)nothing,
+rdft_cb_table _soxr_rdft32s_cb = {
+ null,
+ null,
+ nothing,
+ forward,
+ forward,
+ backward,
+ backward,
+ ORDERED_CONVOLVE_SIMD,
+ ORDERED_PARTIAL_CONVOLVE_SIMD,
+ multiplier,
+ nothing,
+ SIMD_ALIGNED_MALLOC,
+ SIMD_ALIGNED_CALLOC,
+ SIMD_ALIGNED_FREE,
+ flags,
};
diff --git a/soxr/src/fft4g64.c b/soxr/src/fft4g64.c
index 48eaddd..fb87281 100644
--- a/soxr/src/fft4g64.c
+++ b/soxr/src/fft4g64.c
@@ -1,29 +1,36 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
+#include <stdlib.h>
#include "filter.h"
#include "fft4g.c"
#include "soxr-config.h"
-#if HAVE_DOUBLE_PRECISION
-static void * null(void) {return 0;}
-static void nothing(void) {}
-static void forward (int length, void * setup, double * H) {lsx_safe_rdft(length, 1, H); (void)setup;}
-static void backward(int length, void * setup, double * H) {lsx_safe_rdft(length, -1, H); (void)setup;}
+#if WITH_CR64
+#include "rdft_t.h"
+static void * null(int u1) {(void)u1; return 0;}
+static void nothing(void *u1) {(void)u1;}
+static void nothing2(int u1, void *u2, void *u3, void *u4) {(void)u1; (void)u2; (void)u3; (void)u4;}
+static void forward (int length, void * setup, void * H, void * scratch) {lsx_safe_rdft(length, 1, H); (void)setup; (void)scratch;}
+static void backward(int length, void * setup, void * H, void * scratch) {lsx_safe_rdft(length, -1, H); (void)setup; (void)scratch;}
static int multiplier(void) {return 2;}
+static int flags(void) {return 0;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft64_cb[] = {
- (fn_t)null,
- (fn_t)null,
- (fn_t)nothing,
- (fn_t)forward,
- (fn_t)forward,
- (fn_t)backward,
- (fn_t)backward,
- (fn_t)_soxr_ordered_convolve,
- (fn_t)_soxr_ordered_partial_convolve,
- (fn_t)multiplier,
- (fn_t)nothing,
+rdft_cb_table _soxr_rdft64_cb = {
+ null,
+ null,
+ nothing,
+ forward,
+ forward,
+ backward,
+ backward,
+ _soxr_ordered_convolve,
+ _soxr_ordered_partial_convolve,
+ multiplier,
+ nothing2,
+ malloc,
+ calloc,
+ free,
+ flags,
};
#endif
diff --git a/soxr/src/fifo.h b/soxr/src/fifo.h
index b2bda43..33af9fe 100644
--- a/soxr/src/fifo.h
+++ b/soxr/src/fifo.h
@@ -1,14 +1,15 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-#ifndef fifo_included
-#define fifo_included
+#ifndef soxr_fifo_included
+#define soxr_fifo_included
#if !defined FIFO_SIZE_T
#define FIFO_SIZE_T size_t
#endif
#if !defined FIFO_REALLOC
+#include <stdlib.h>
#define FIFO_REALLOC(a,b,c) realloc(a,b)
#undef FIFO_FREE
#define FIFO_FREE free
diff --git a/soxr/src/filter.c b/soxr/src/filter.c
index ca146d2..019d24d 100644
--- a/soxr/src/filter.c
+++ b/soxr/src/filter.c
@@ -1,12 +1,9 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#include "filter.h"
-#include <math.h>
-#if !defined M_PI
-#define M_PI 3.14159265358979323846
-#endif
+#include "math-wrap.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
@@ -14,7 +11,7 @@
#include "fft4g.h"
#include "ccrw2.h"
-#if 1 || HAVE_DOUBLE_PRECISION /* Always need this, for lsx_fir_to_phase. */
+#if 1 || WITH_CR64 || WITH_CR64S /* Always need this, for lsx_fir_to_phase. */
#define DFT_FLOAT double
#define DONE_WITH_FFT_CACHE done_with_fft_cache
#define FFT_CACHE_CCRW fft_cache_ccrw
@@ -31,7 +28,7 @@
#include "fft4g_cache.h"
#endif
-#if HAVE_SINGLE_PRECISION && !HAVE_AVFFT
+#if (WITH_CR32 && !AVCODEC_FOUND) || (WITH_CR32S && !AVCODEC_FOUND && !WITH_PFFFT)
#define DFT_FLOAT float
#define DONE_WITH_FFT_CACHE done_with_fft_cache_f
#define FFT_CACHE_CCRW fft_cache_ccrw_f
@@ -48,14 +45,14 @@
#include "fft4g_cache.h"
#endif
-#if HAVE_DOUBLE_PRECISION || !SOXR_LIB
+#if WITH_CR64 || WITH_CR64S || !SOXR_LIB
#define DFT_FLOAT double
#define ORDERED_CONVOLVE lsx_ordered_convolve
#define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve
#include "rdft.h"
#endif
-#if HAVE_SINGLE_PRECISION
+#if WITH_CR32
#define DFT_FLOAT float
#define ORDERED_CONVOLVE lsx_ordered_convolve_f
#define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve_f
@@ -96,12 +93,12 @@ double * lsx_make_lpf(
double * h = malloc((size_t)num_taps * sizeof(*h));
double mult = scale / lsx_bessel_I_0(beta), mult1 = 1 / (.5 * m + rho);
assert(Fc >= 0 && Fc <= 1);
- lsx_debug("make_lpf(n=%i Fc=%.7g β=%g ρ=%g scale=%g)",
+ lsx_debug("make_lpf(n=%i Fc=%.7g beta=%g rho=%g scale=%g)",
num_taps, Fc, beta, rho, scale);
if (h) for (i = 0; i <= m / 2; ++i) {
double z = i - .5 * m, x = z * M_PI, y = z * mult1;
- h[i] = x? sin(Fc * x) / x : Fc;
+ h[i] = x!=0? sin(Fc * x) / x : Fc;
h[i] *= lsx_bessel_I_0(beta * sqrt(1 - y * y)) * mult;
if (m - i != i)
h[m - i] = h[i];
@@ -123,12 +120,15 @@ double * lsx_design_lpf(
double Fn, /* Nyquist freq; e.g. 0.5, 1, PI */
double att, /* Stop-band attenuation in dB */
int * num_taps, /* 0: value will be estimated */
- int k, /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */
+ int k, /* >0: number of phases; <0: num_taps = 1 (mod -k) */
double beta) /* <0: value will be estimated */
{
int n = *num_taps, phases = max(k, 1), modulo = max(-k, 1);
double tr_bw, Fc, rho = phases == 1? .5 : att < 120? .63 : .75;
+ lsx_debug_more("./sinctest %-12.7g %-12.7g %g 0 %-5g %i %i 50 %g %g -4 >1",
+ Fp, Fs, Fn, att, *num_taps, k, beta, rho);
+
Fp /= fabs(Fn), Fs /= fabs(Fn); /* Normalise to Fn = 1 */
tr_bw = .5 * (Fs - Fp); /* Transition band-width: 6dB to stop points */
tr_bw /= phases, Fs /= phases;
@@ -145,7 +145,7 @@ double * lsx_design_lpf(
static double safe_log(double x)
{
assert(x >= 0);
- if (x)
+ if (x!=0)
return log(x);
lsx_debug("log(0)");
return -26;
@@ -222,7 +222,7 @@ void lsx_fir_to_phase(double * * h, int * len, int * post_len, double phase)
while (peak && fabs(work[peak-1]) > fabs(work[peak]) && work[peak-1] * work[peak] > 0)
--peak;
- if (!phase1)
+ if (phase1==0)
begin = 0;
else if (phase1 == 1)
begin = peak - *len / 2;
@@ -243,3 +243,35 @@ void lsx_fir_to_phase(double * * h, int * len, int * post_len, double phase)
work[imp_peak], *len, *post_len, 100 - 100. * *post_len / (*len - 1));
free(pi_wraps), free(work);
}
+
+#define F_x(F,expr) static double F(double x) {return expr;}
+F_x(sinePhi, ((2.0517e-07*x-1.1303e-04)*x+.023154)*x+.55924 )
+F_x(sinePsi, ((9.0667e-08*x-5.6114e-05)*x+.013658)*x+1.0977 )
+F_x(sinePow, log(.5)/log(sin(x*.5)) )
+#define dB_to_linear(x) exp((x) * (M_LN10 * 0.05))
+
+double lsx_f_resp(double t, double a)
+{
+ double x;
+ if (t > (a <= 160? .8 : .82)) {
+ double a1 = a+15;
+ double p = .00035*a+.375;
+ double w = 1/(1-.597)*asin(pow((a1-10.6)/a1,1/p));
+ double c = 1+asin(pow(1-a/a1,1/p))/w;
+ return a1*(pow(sin((c-t)*w),p)-1);
+ }
+ if (t > .5)
+ x = sinePsi(a), x = pow(sin((1-t) * x), sinePow(x));
+ else
+ x = sinePhi(a), x = 1 - pow(sin(t * x), sinePow(x));
+ return linear_to_dB(x);
+}
+
+double lsx_inv_f_resp(double drop, double a)
+{
+ double x = sinePhi(a), s;
+ drop = dB_to_linear(drop);
+ s = drop > .5 ? 1 - drop : drop;
+ x = asin(pow(s, 1/sinePow(x))) / x;
+ return drop > .5? x : 1 -x;
+}
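lsx_f_resp(t, att) models the designed filter's response in dB at normalised
position t across the transition band, and lsx_inv_f_resp inverts it with t
measured from the opposite end (hence the lsx_to_3dB helper added to
filter.h below).  A round-trip sketch, assuming it is linked against this
file:

#include <stdio.h>
#include "filter.h"

int main(void)
{
  double att = 120;                         /* stop-band attenuation, dB */
  double t = 1 - lsx_inv_f_resp(-3., att);  /* i.e. lsx_to_3dB(att)      */
  printf("t=%.4f resp=%.2f dB\n", t, lsx_f_resp(t, att)); /* resp ~ -3  */
  return 0;
}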
diff --git a/soxr/src/filter.h b/soxr/src/filter.h
index 435303b..203e73d 100644
--- a/soxr/src/filter.h
+++ b/soxr/src/filter.h
@@ -16,10 +16,10 @@ void lsx_safe_rdft(int len, int type, double * d);
void lsx_safe_cdft(int len, int type, double * d);
void lsx_safe_rdft_f(int len, int type, float * d);
void lsx_safe_cdft_f(int len, int type, float * d);
-void lsx_ordered_convolve(int n, void * not_used, double * a, const double * b);
-void lsx_ordered_convolve_f(int n, void * not_used, float * a, const float * b);
-void lsx_ordered_partial_convolve(int n, double * a, const double * b);
-void lsx_ordered_partial_convolve_f(int n, float * a, const float * b);
+void lsx_ordered_convolve(int n, void * not_used, void * a, const void * b);
+void lsx_ordered_convolve_f(int n, void * not_used, void * a, const void * b);
+void lsx_ordered_partial_convolve(int n, void * a, const void * b);
+void lsx_ordered_partial_convolve_f(int n, void * a, const void * b);
double lsx_kaiser_beta(double att, double tr_bw);
double * lsx_make_lpf(int num_taps, double Fc, double beta, double rho,
@@ -31,9 +31,14 @@ double * lsx_design_lpf(
double Fn, /* Nyquist freq; e.g. 0.5, 1, PI; < 0: dummy run */
double att, /* Stop-band attenuation in dB */
int * num_taps, /* 0: value will be estimated */
- int k, /* >0: number of phases; <0: num_taps ≡ 1 (mod -k) */
+ int k, /* >0: number of phases; <0: num_taps = 1 (mod -k) */
double beta); /* <0: value will be estimated */
+
void lsx_fir_to_phase(double * * h, int * len,
int * post_len, double phase0);
+double lsx_f_resp(double t, double a);
+double lsx_inv_f_resp(double drop, double a);
+#define lsx_to_3dB(a) (1 - lsx_inv_f_resp(-3., a))
+
#endif
diff --git a/soxr/src/filters.h b/soxr/src/filters.h
deleted file mode 100644
index e9a8011..0000000
--- a/soxr/src/filters.h
+++ /dev/null
@@ -1,151 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#include "half_coefs.h"
-
-#define FUNCTION h8
-#define CONVOLVE _ _ _ _ _ _ _ _
-#define h8_l 8
-#define COEFS half_fir_coefs_8
-#include "half-fir.h"
-
-#define FUNCTION h9
-#define CONVOLVE _ _ _ _ _ _ _ _ _
-#define h9_l 9
-#define COEFS half_fir_coefs_9
-#include "half-fir.h"
-
-#define FUNCTION h10
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _
-#define h10_l 10
-#define COEFS half_fir_coefs_10
-#include "half-fir.h"
-
-#define FUNCTION h11
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _
-#define h11_l 11
-#define COEFS half_fir_coefs_11
-#include "half-fir.h"
-
-#define FUNCTION h12
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ _
-#define h12_l 12
-#define COEFS half_fir_coefs_12
-#include "half-fir.h"
-
-#define FUNCTION h13
-#define CONVOLVE _ _ _ _ _ _ _ _ _ _ _ _ _
-#define h13_l 13
-#define COEFS half_fir_coefs_13
-#include "half-fir.h"
-
-static struct {int num_coefs; stage_fn_t fn; float att;} const half_firs[] = {
- { 8, h8 , 136.51f},
- { 9, h9 , 152.32f},
- {10, h10, 168.07f},
- {11, h11, 183.78f},
- {12, h12, 199.44f},
- {13, h13, 212.75f},
-};
-
-#define HI_PREC_CLOCK
-
-#define VAR_LENGTH p->n
-#define VAR_CONVOLVE while (j < FIR_LENGTH) _
-#define VAR_POLY_PHASE_BITS p->phase_bits
-
-#define FUNCTION vpoly0
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir0.h"
-
-#define FUNCTION vpoly1
-#define COEF_INTERP 1
-#define PHASE_BITS VAR_POLY_PHASE_BITS
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir.h"
-
-#define FUNCTION vpoly2
-#define COEF_INTERP 2
-#define PHASE_BITS VAR_POLY_PHASE_BITS
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir.h"
-
-#define FUNCTION vpoly3
-#define COEF_INTERP 3
-#define PHASE_BITS VAR_POLY_PHASE_BITS
-#define FIR_LENGTH VAR_LENGTH
-#define CONVOLVE VAR_CONVOLVE
-#include "poly-fir.h"
-
-#undef HI_PREC_CLOCK
-
-#define U100_l 42
-#if RATE_SIMD_POLY
- #define U100_l_EXTRA _ _
- #define u100_l_EXTRA _
- #define U100_l_EXTRA_LENGTH 2
- #define u100_l_EXTRA_LENGTH 1
-#else
- #define U100_l_EXTRA
- #define u100_l_EXTRA
- #define U100_l_EXTRA_LENGTH 0
- #define u100_l_EXTRA_LENGTH 0
-#endif
-#define poly_fir_convolve_U100 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ U100_l_EXTRA
-#define FUNCTION U100_0
-#define FIR_LENGTH (U100_l + U100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_U100
-#include "poly-fir0.h"
-
-#define u100_l 11
-#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _ u100_l_EXTRA
-#define FUNCTION u100_0
-#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_u100
-#include "poly-fir0.h"
-
-#define FUNCTION u100_1
-#define COEF_INTERP 1
-#define PHASE_BITS 8
-#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_u100
-#include "poly-fir.h"
-#define u100_1_b 8
-
-#define FUNCTION u100_2
-#define COEF_INTERP 2
-#define PHASE_BITS 6
-#define FIR_LENGTH (u100_l + u100_l_EXTRA_LENGTH)
-#define CONVOLVE poly_fir_convolve_u100
-#include "poly-fir.h"
-#define u100_2_b 6
-
-typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t;
-typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t;
-
-static poly_fir_t const poly_firs[] = {
- {-1, {{0, vpoly0}, { 7.2f, vpoly1}, {5.0f, vpoly2}}},
- {-1, {{0, vpoly0}, { 9.4f, vpoly1}, {6.7f, vpoly2}}},
- {-1, {{0, vpoly0}, {12.4f, vpoly1}, {7.8f, vpoly2}}},
- {-1, {{0, vpoly0}, {13.6f, vpoly1}, {9.3f, vpoly2}}},
- {-1, {{0, vpoly0}, {10.5f, vpoly2}, {8.4f, vpoly3}}},
- {-1, {{0, vpoly0}, {11.85f,vpoly2}, {9.0f, vpoly3}}},
-
- {-1, {{0, vpoly0}, { 8.0f, vpoly1}, {5.3f, vpoly2}}},
- {-1, {{0, vpoly0}, { 8.6f, vpoly1}, {5.7f, vpoly2}}},
- {-1, {{0, vpoly0}, {10.6f, vpoly1}, {6.75f,vpoly2}}},
- {-1, {{0, vpoly0}, {12.6f, vpoly1}, {8.6f, vpoly2}}},
- {-1, {{0, vpoly0}, { 9.6f, vpoly2}, {7.6f, vpoly3}}},
- {-1, {{0, vpoly0}, {11.4f, vpoly2}, {8.65f,vpoly3}}},
-
- {10.62f, {{U100_l, U100_0}, {0, 0}, {0, 0}}},
- {11.28f, {{u100_l, u100_0}, {u100_1_b, u100_1}, {u100_2_b, u100_2}}},
- {-1, {{0, vpoly0}, { 9, vpoly1}, { 6, vpoly2}}},
- {-1, {{0, vpoly0}, { 11, vpoly1}, { 7, vpoly2}}},
- {-1, {{0, vpoly0}, { 13, vpoly1}, { 8, vpoly2}}},
- {-1, {{0, vpoly0}, { 10, vpoly2}, { 8, vpoly3}}},
- {-1, {{0, vpoly0}, { 12, vpoly2}, { 9, vpoly3}}},
-};
diff --git a/soxr/src/half-coefs.h b/soxr/src/half-coefs.h
new file mode 100644
index 0000000..a5a0882
--- /dev/null
+++ b/soxr/src/half-coefs.h
@@ -0,0 +1,75 @@
+/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if defined __GNUC__
+ #pragma GCC system_header
+#elif defined __SUNPRO_C
+ #pragma disable_warn
+#elif defined _MSC_VER
+ #pragma warning(push, 1)
+#endif
+
+#if CORE_TYPE & CORE_SIMD_HALF
+ #define VALIGN vAlign
+#else
+ #define VALIGN
+#endif
+
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+static VALIGN const sample_t half_fir_coefs_7[] = {
+ 3.1062656496657370e-01, -8.4998810699955796e-02, 3.4007044621123500e-02,
+-1.2839903789829387e-02, 3.9899380181723145e-03, -8.9355202017945374e-04,
+ 1.0918292424806546e-04,
+};
+#endif
+
+static VALIGN const sample_t half_fir_coefs_8[] = {
+ 3.1154652365332069e-01, -8.7344917685739543e-02, 3.6814458353637280e-02,
+-1.5189204581464479e-02, 5.4540855610738801e-03, -1.5643862626630416e-03,
+ 3.1816575906323303e-04, -3.4799449225005688e-05,
+};
+
+static VALIGN const sample_t half_fir_coefs_9[] = {
+ 3.1227034755311189e-01, -8.9221517147969526e-02, 3.9139704015071934e-02,
+-1.7250558515852023e-02, 6.8589440230476112e-03, -2.3045049636430419e-03,
+ 6.0963740543348963e-04, -1.1323803957431231e-04, 1.1197769991000046e-05,
+};
+
+#if CORE_TYPE & CORE_DBL
+static VALIGN const sample_t half_fir_coefs_10[] = {
+ 3.1285456012000523e-01, -9.0756740799292787e-02, 4.1096398104193160e-02,
+-1.9066319572525220e-02, 8.1840569787684902e-03, -3.0766876176359834e-03,
+ 9.6396524429277980e-04, -2.3585679989922018e-04, 4.0252189026627833e-05,
+-3.6298196342497932e-06,
+};
+
+static VALIGN const sample_t half_fir_coefs_11[] = {
+ 3.1333588822574199e-01, -9.2035898673019811e-02, 4.2765169698406408e-02,
+-2.0673580894964429e-02, 9.4225426824512421e-03, -3.8563379950013192e-03,
+ 1.3634742159642453e-03, -3.9874150714431009e-04, 9.0586723632664806e-05,
+-1.4285617244076783e-05, 1.1834642946400529e-06,
+};
+
+static VALIGN const sample_t half_fir_coefs_12[] = {
+ 3.1373928463345568e-01, -9.3118180335301962e-02, 4.4205005881659098e-02,
+-2.2103860986973051e-02, 1.0574689371162864e-02, -4.6276428065385065e-03,
+ 1.7936153397572132e-03, -5.9617527051353237e-04, 1.6314517495669067e-04,
+-3.4555126770115446e-05, 5.0617615610782593e-06, -3.8768958592971409e-07,
+};
+
+static VALIGN const sample_t half_fir_coefs_13[] = {
+ 3.1408224847888910e-01, -9.4045836332667387e-02, 4.5459878763259978e-02,
+-2.3383369012219993e-02, 1.1644273044890753e-02, -5.3806714579057013e-03,
+ 2.2429072878264022e-03, -8.2204347506606424e-04, 2.5724946477840893e-04,
+-6.6072709864248668e-05, 1.3099163296288644e-05, -1.7907147069136000e-06,
+ 1.2750825595240592e-07,
+};
+#endif
+
+#undef VALIGN
+
+#if defined __SUNPRO_C
+ #pragma enable_warn
+#elif defined _MSC_VER
+ #pragma warning(pop)
+#endif
diff --git a/soxr/src/half-fir.h b/soxr/src/half-fir.h
index 0a8ee97..782be1b 100644
--- a/soxr/src/half-fir.h
+++ b/soxr/src/half-fir.h
@@ -1,25 +1,61 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-/* Down-sample by a factor of 2 using a FIR with odd length (LEN).*/
+/* Decimate by 2 using a FIR with odd length (LEN). */
/* Input must be preceded and followed by LEN >> 1 samples. */
-#define _ sum += (input[-(2*j +1)] + input[(2*j +1)]) * COEFS[j], ++j;
-static void FUNCTION(stage_t * p, fifo_t * output_fifo)
+#define COEFS ((sample_t const *)p->coefs)
+
+#if SIMD_SSE
+ #define BEGINNING v4_t sum, q1, q2, t
+ #define ____ \
+ q1 = _mm_shuffle_ps(t=vLdu(input+2*j),vLdu(input+2*j+4),_MM_SHUFFLE(3,1,3,1)); \
+ q2 = _mm_shuffle_ps(vLdu(input-2*j-4),vLdu(input-2*j-8),_MM_SHUFFLE(1,3,1,3)); \
+ sum = vAdd(j? sum : vMul(vSet1(.5), t), vMul(vAdd(q1, q2), vLd(COEFS+j))); \
+ j += 4;
+ #define __ \
+ q1 = _mm_shuffle_ps(vLdu(input+2*j), vLdu(input-2*j-4), _MM_SHUFFLE(1,3,3,1)); \
+ q2 = _mm_loadl_pi(q2, (__m64*)(COEFS+j)), q2 = _mm_movelh_ps(q2, q2); \
+ sum = vAdd(sum, vMul(q1, q2)); \
+ j += 2;
+ #define _ \
+ q1 = _mm_add_ss(_mm_load_ss(input+2*j+1), _mm_load_ss(input-2*j-1)); \
+ sum = _mm_add_ss(sum, _mm_mul_ss(q1, _mm_load_ss(COEFS+j))); \
+ ++j;
+ #define END vStorSum(output+i, sum)
+/* #elif SIMD_AVX; No good solution found. */
+/* #elif SIMD_NEON; No need: gcc -O3 does a good job by itself. */
+#else
+ #define BEGINNING sample_t sum = input[0] * .5f
+ #define ____ __ __
+ #define __ _ _
+ #define _ sum += (input[-(2*j +1)] + input[(2*j +1)]) * COEFS[j], ++j;
+ #define END output[i] = sum
+#endif
+
+
+
+static void FUNCTION_H(stage_t * p, fifo_t * output_fifo)
{
- sample_t const * input = stage_read_p(p);
- int i, num_out = (stage_occupancy(p) + 1) / 2;
- sample_t * output = fifo_reserve(output_fifo, num_out);
+ sample_t const * __restrict input = stage_read_p(p);
+ int num_in = min(stage_occupancy(p), p->input_size);
+ int i, num_out = (num_in + 1) >> 1;
+ sample_t * __restrict output = fifo_reserve(output_fifo, num_out);
for (i = 0; i < num_out; ++i, input += 2) {
int j = 0;
- sample_t sum = input[0] * .5f;
- CONVOLVE
- output[i] = sum;
+ BEGINNING; CONVOLVE; END;
}
fifo_read(&p->fifo, 2 * num_out, NULL);
}
+
+
+
#undef _
+#undef __
+#undef ____
+#undef BEGINNING
+#undef END
#undef COEFS
#undef CONVOLVE
-#undef FUNCTION
+#undef FUNCTION_H
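For reference, the scalar decimator that these macros generate, written out
as a plain function (a sketch, not library code).  A half-band FIR has a
centre tap of 1/2 and zero-valued even taps, so only the odd-tap
coefficients are stored, and the input must be padded on each side per the
comment at the top of this file:

static void halve(float * out, float const * in, int n_out,
    float const * coefs, int len)
{
  int i, j;
  for (i = 0; i < n_out; ++i, in += 2) {
    float sum = in[0] * .5f;            /* centre tap */
    for (j = 0; j < len; ++j)           /* the scalar `_' macro, as a loop */
      sum += (in[-(2*j + 1)] + in[2*j + 1]) * coefs[j];
    out[i] = sum;
  }
}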
diff --git a/soxr/src/half_coefs.h b/soxr/src/half_coefs.h
deleted file mode 100644
index aac7769..0000000
--- a/soxr/src/half_coefs.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#if defined __GNUC__
- #pragma GCC system_header
-#elif defined __SUNPRO_C
- #pragma disable_warn
-#elif defined _MSC_VER
- #pragma warning(push, 1)
-#endif
-
-static const sample_t half_fir_coefs_8[] = {
- 0.3115465451887802, -0.08734497241282892, 0.03681452335604365,
- -0.01518925831569441, 0.005454118437408876, -0.001564400922162005,
- 0.0003181701445034203, -3.48001341225749e-5,
-};
-
-static const sample_t half_fir_coefs_9[] = {
- 0.3122703613711853, -0.08922155288172305, 0.03913974805854332,
- -0.01725059723447163, 0.006858970092378141, -0.002304518467568703,
- 0.0006096426006051062, -0.0001132393923815236, 1.119795386287666e-5,
-};
-
-static const sample_t half_fir_coefs_10[] = {
- 0.3128545521327376, -0.09075671986104322, 0.04109637155154835,
- -0.01906629512749895, 0.008184039342054333, -0.0030766775017262,
- 0.0009639607022414314, -0.0002358552746579827, 4.025184282444155e-5,
- -3.629779111541012e-6,
-};
-
-static const sample_t half_fir_coefs_11[] = {
- 0.3133358837508807, -0.09203588680609488, 0.04276515428384758,
- -0.02067356614745591, 0.00942253142371517, -0.003856330993895144,
- 0.001363470684892284, -0.0003987400965541919, 9.058629923971627e-5,
- -1.428553070915318e-5, 1.183455238783835e-6,
-};
-
-static const sample_t half_fir_coefs_12[] = {
- 0.3137392991811407, -0.0931182192961332, 0.0442050575271454,
- -0.02210391200618091, 0.01057473015666001, -0.00462766983973885,
- 0.001793630226239453, -0.0005961819959665878, 0.0001631475979359577,
- -3.45557865639653e-5, 5.06188341942088e-6, -3.877010943315563e-7,
-};
-
-static const sample_t half_fir_coefs_13[] = {
- 0.3140822554324578, -0.0940458550886253, 0.04545990399121566,
- -0.02338339450796002, 0.01164429409071052, -0.005380686021429845,
- 0.002242915773871009, -0.000822047600000082, 0.0002572510962395222,
- -6.607320708956279e-5, 1.309926399120154e-5, -1.790719575255006e-6,
- 1.27504961098836e-7,
-};
-
-#if defined __SUNPRO_C
- #pragma enable_warn
-#elif defined _MSC_VER
- #pragma warning(pop)
-#endif
diff --git a/soxr/src/internal.h b/soxr/src/internal.h
index 5d8d44e..08924d5 100644
--- a/soxr/src/internal.h
+++ b/soxr/src/internal.h
@@ -1,46 +1,84 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#if !defined soxr_internal_included
#define soxr_internal_included
-#include "soxr-config.h"
+#include "std-types.h"
+
+
#undef min
#undef max
#define min(a, b) ((a) <= (b) ? (a) : (b))
#define max(a, b) ((a) >= (b) ? (a) : (b))
+
+
#define range_limit(x, lower, upper) (min(max(x, lower), upper))
#define linear_to_dB(x) (log10(x) * 20)
#define array_length(a) (sizeof(a)/sizeof(a[0]))
+#if !defined AL
#define AL(a) array_length(a)
+#endif
#define iAL(a) (int)AL(a)
#define sqr(a) ((a) * (a))
-#ifdef __GNUC__
+
+
+#if defined __GNUC__
#define UNUSED __attribute__ ((unused))
#else
#define UNUSED
#endif
-#if defined NDEBUG || SOXR_SILENT
+
+
+#if !WITH_DEV_TRACE
#ifdef __GNUC__
void lsx_dummy(char const *, ...);
#else
static __inline void lsx_dummy(char const * x, ...) {}
#endif
#define lsx_debug if(0) lsx_dummy
+ #define lsx_debug_more lsx_debug
#else
- #include <stdarg.h>
- #include <stdio.h>
- UNUSED static void lsx_debug(char const * fmt, ...)
- {
- va_list args;
- va_start(args, fmt);
- vfprintf(stderr, fmt, args);
- fputc('\n', stderr);
- va_end(args);
- }
+ extern int _soxr_trace_level;
+ void _soxr_trace(char const * fmt, ...);
+ #define lsx_debug if (_soxr_trace_level > 0) _soxr_trace
+ #define lsx_debug_more if (_soxr_trace_level > 1) _soxr_trace
#endif
+
+
+
+/* soxr_quality_spec_t.flags: */
+
+#define SOXR_ROLLOFF_LSR2Q 3u /* Reserved for internal use. */
+#define SOXR_ROLLOFF_MASK 3u /* For masking these bits. */
+#define SOXR_MAINTAIN_3DB_PT 4u /* Reserved for internal use. */
+#define SOXR_PROMOTE_TO_LQ 64u /* Reserved for internal use. */
+
+
+
+/* soxr_runtime_spec_t.flags: */
+
+#define SOXR_STRICT_BUFFERING 4u /* Reserved for future use. */
+#define SOXR_NOSMALLINTOPT 8u /* For test purposes only. */
+
+
+
+/* soxr_quality_spec recipe: */
+
+#define SOXR_PRECISIONQ 11 /* Quality specified by the precision parameter. */
+
+#define SOXR_PHASE_MASK 0x30 /* For masking these bits. */
+
+
+
+/* soxr_quality_spec flags: */
+
+#define RESET_ON_CLEAR (1u<<31)
+
+
+
#endif
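linear_to_dB(2.) is ~6.0206 dB, i.e. one bit of dynamic range; _soxr_init in
cr-core.c turns the requested precision into a stop-band attenuation as
att = (bits + 1) * linear_to_dB(2.), the +1 splitting the error budget
between pass-band and stop-band.  Worked numbers for a 20-bit request:

#include <math.h>
#include <stdio.h>

#define linear_to_dB(x) (log10(x) * 20)

int main(void)
{
  double bits = 20;
  printf("att = %.2f dB\n", (bits + 1) * linear_to_dB(2.)); /* ~126.43 */
  return 0;
}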
diff --git a/soxr/src/libsoxr-dev.src.in b/soxr/src/libsoxr-dev.src.in
deleted file mode 100644
index ce879f9..0000000
--- a/soxr/src/libsoxr-dev.src.in
+++ /dev/null
@@ -1,2 +0,0 @@
-set(TARGET_HEADERS "@TARGET_HEADERS@")
-set(TARGET_PCS "@TARGET_PCS@")
diff --git a/soxr/src/libsoxr.src.in b/soxr/src/libsoxr.src.in
deleted file mode 100644
index 1c926ff..0000000
--- a/soxr/src/libsoxr.src.in
+++ /dev/null
@@ -1 +0,0 @@
-set(TARGET_LIBS "@TARGET_LIBS@")
diff --git a/soxr/src/lsr.c b/soxr/src/lsr.c
deleted file mode 100644
index 64b5798..0000000
--- a/soxr/src/lsr.c
+++ /dev/null
@@ -1,114 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-/* Wrapper mostly compatible with `libsamplerate'. */
-
-#include
-#include
-#include "soxr.h"
-
-/* Runtime casts: */
-typedef struct io_t {
- float *in,*out; long ilen,olen,idone,odone; int eoi; double oi_ratio;} io_t;
-#define SRC_DATA io_t
-typedef struct soxr SRC_STATE;
-#define src_callback_t soxr_input_fn_t
-#define SRC_ERROR soxr_error_t
-#define SRC_SRCTYPE unsigned
-
-#include "soxr-lsr.h"
-#include "rint.h"
-
-soxr_error_t src_simple(io_t * p, unsigned id, int channels)
-{
- size_t idone, odone;
- soxr_error_t error;
- soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + id, 0);
- char const * e = getenv("SOXR_LSR_NUM_THREADS");
- soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1));
- assert (channels > 0);
- assert (p->ilen >= 0);
- assert (p->olen >= 0);
- error = soxr_oneshot(1, p->oi_ratio, (unsigned)channels,
- p->in, (size_t)p->ilen, &idone, p->out, (size_t)p->olen, &odone,
- 0, &q_spec, &r_spec);
- p->idone = (long)idone, p->odone = (long)odone;
- return error;
-}
-
-soxr_t src_callback_new(soxr_input_fn_t fn, unsigned id, int channels, SRC_ERROR * error0, void * p)
-{
- soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + id, 0);
- char const * e = getenv("SOXR_LSR_NUM_THREADS");
- soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1));
- soxr_error_t error;
- soxr_t soxr = 0;
- assert (channels > 0);
- /* To minimise latency e.g. for real-time playback:
- if (id == 2)
- r_spec.log2_large_dft_size = r_spec.log2_min_dft_size = 8;
- */
- soxr = soxr_create(0, 0, (unsigned)channels, &error, 0, &q_spec, &r_spec);
- if (soxr)
- error = soxr_set_input_fn(soxr, fn, p, 0);
- if (error0)
- *(int *)error0 = (int)(ptrdiff_t)error;
- return soxr;
-}
-
-soxr_error_t src_process(soxr_t p, io_t * io)
-{
- if (!p || !io) return "null pointer";
- soxr_set_error(p, soxr_set_io_ratio(p, 1/io->oi_ratio, (size_t)io->olen));
-
- { size_t idone , odone;
- soxr_process(p, io->in, (size_t)(io->eoi? ~io->ilen : io->ilen), /* hack */
- &idone, io->out, (size_t)io->olen, &odone);
- io->idone = (long)idone, io->odone = (long)odone;
- return soxr_error(p); }
-}
-
-long src_callback_read(soxr_t p, double oi_ratio, long olen, float * obuf)
-{
- if (!p || olen < 0) return -1;
- soxr_set_error(p, soxr_set_io_ratio(p, 1/oi_ratio, (size_t)olen));
- return (long)soxr_output(p, obuf, (size_t)olen);
-}
-
-void src_float_to_short_array(float const * src, short * dest, int len)
-{
- double d, N = 1. + SHRT_MAX;
- assert (src && dest);
- while (len--) d = src[len] * N, dest[len] = (short)(d > N - 1? (short)(N - 1) : d < -N? (short)-N : rint16(d));
-}
-
-void src_short_to_float_array(short const * src, float * dest, int len)
-{
- assert (src && dest);
- while (len--) dest[len] = (float)(src[len] * (1 / (1. + SHRT_MAX)));
-}
-
-void src_float_to_int_array(float const * src, int * dest, int len)
-{
- double d, N = 32768. * 65536.; /* N.B. int32, not int! (Also next fn.) */
- assert (src && dest);
- while (len--) d = src[len] * N, dest[len] = d >= N - 1? (int)(N - 1) : d < -N? (int)(-N) : rint32(d);
-}
-
-void src_int_to_float_array(int const * src, float * dest, int len)
-{
- assert (src && dest);
- while (len--) dest[len] = (float)(src[len] * (1 / (32768. * 65536.)));
-}
-
-static char const * const names[] = {"LSR best sinc", "LSR medium sinc", "LSR fastest sinc", "LSR ZOH", "LSR linear", "SoX VHQ"};
-char const * src_get_name(unsigned n) {return n < 5u + !getenv("SOXR_LSR_STRICT")? names[n] : 0;}
-char const * src_get_description(unsigned id) {return src_get_name(id);}
-char const * src_get_version(void) {return soxr_version();}
-char const * src_strerror(soxr_error_t error) {return error == (soxr_error_t)1? "Placeholder." : sizeof(int) >= sizeof(char *) || !error ? soxr_strerror(error) : "soxr error";}
-int src_is_valid_ratio(double oi_ratio) {return getenv("SOXR_LSR_STRICT")? oi_ratio >= 1./256 && oi_ratio <= 256 : oi_ratio > 0;}
-soxr_error_t src_error(soxr_t p) {return soxr_error(p);}
-soxr_error_t src_reset(soxr_t p) {return soxr_clear(p);}
-soxr_t src_delete(soxr_t p) {soxr_delete(p); return 0;}
-soxr_error_t src_set_ratio(soxr_t p, double oi_ratio) {return soxr_set_io_ratio(p, 1/oi_ratio, 0);}
-soxr_t src_new(unsigned id, int channels, SRC_ERROR * error) {return src_callback_new(0, id, channels, error, 0);}
diff --git a/soxr/src/math-wrap.h b/soxr/src/math-wrap.h
new file mode 100644
index 0000000..8a526f1
--- /dev/null
+++ b/soxr/src/math-wrap.h
@@ -0,0 +1,31 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_math_wrap_included
+#define soxr_math_wrap_included
+
+#include <math.h>
+
+#if defined __STRICT_ANSI__
+ #define sinf(x) (float)sin ((double)(x))
+ #define cosf(x) (float)cos ((double)(x))
+ #define atanf(x) (float)atan((double)(x))
+#endif
+
+#if !defined M_PI
+ #define M_PI 3.141592653589793238462643383279502884
+#endif
+
+#if !defined M_LN10
+ #define M_LN10 2.302585092994045684017991454684364208
+#endif
+
+#if !defined M_SQRT2
+ #define M_SQRT2 1.414213562373095048801688724209698079
+#endif
+
+#if !defined M_LN2
+ #define M_LN2 0.693147180559945309417232121458176568
+#endif
+
+#endif
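This header exists because a strict C89 compiler (__STRICT_ANSI__) does not
declare the float variants sinf/cosf/atanf, and some libms omit the M_*
constants; the macros above substitute double-precision calls and literal
values, so a sketch like the following compiles either way:

    #include "math-wrap.h"

    /* Under __STRICT_ANSI__, sinf() here expands to
       (float)sin((double)(x)); otherwise the C99 sinf() is called. */
    static float quarter_wave(float x) { return sinf(x * (float)(M_PI / 2)); }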
diff --git a/soxr/src/pffft-avx.h b/soxr/src/pffft-avx.h
new file mode 100644
index 0000000..ace19b5
--- /dev/null
+++ b/soxr/src/pffft-avx.h
@@ -0,0 +1,40 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+/* AVX support macros */
+
+#if !defined soxr_avx_included
+#define soxr_avx_included
+
+#include <immintrin.h>
+
+typedef __m256d v4sf;
+#define VZERO() _mm256_setzero_pd()
+#define VMUL(a,b) _mm256_mul_pd(a,b)
+#define VADD(a,b) _mm256_add_pd(a,b)
+#define VMADD(a,b,c) VADD(VMUL(a,b),c) /* Note: gcc -mfma will `fuse' these */
+#define VSUB(a,b) _mm256_sub_pd(a,b)
+#define LD_PS1(p) _mm256_set1_pd(p)
+#define INTERLEAVE2(in1, in2, out1, out2) {v4sf \
+ t1 = _mm256_unpacklo_pd(in1, in2), \
+ t2 = _mm256_unpackhi_pd(in1, in2); \
+ out1 = _mm256_permute2f128_pd(t1,t2,0x20); \
+ out2 = _mm256_permute2f128_pd(t1,t2,0x31); }
+#define UNINTERLEAVE2(in1, in2, out1, out2) {v4sf \
+ t1 = _mm256_permute2f128_pd(in1,in2,0x20), \
+ t2 = _mm256_permute2f128_pd(in1,in2,0x31); \
+ out1 = _mm256_unpacklo_pd(t1, t2); \
+ out2 = _mm256_unpackhi_pd(t1, t2);}
+#define VTRANSPOSE4(x0,x1,x2,x3) {v4sf \
+ t0 = _mm256_shuffle_pd(x0,x1, 0x0), \
+ t2 = _mm256_shuffle_pd(x0,x1, 0xf), \
+ t1 = _mm256_shuffle_pd(x2,x3, 0x0), \
+ t3 = _mm256_shuffle_pd(x2,x3, 0xf); \
+ x0 = _mm256_permute2f128_pd(t0,t1, 0x20); \
+ x1 = _mm256_permute2f128_pd(t2,t3, 0x20); \
+ x2 = _mm256_permute2f128_pd(t0,t1, 0x31); \
+ x3 = _mm256_permute2f128_pd(t2,t3, 0x31);}
+#define VSWAPHL(a,b) _mm256_permute2f128_pd(b, a, 0x30)
+#define VALIGNED(ptr) ((((long)(ptr)) & 0x1F) == 0)
+
+#endif
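For reference, the lane semantics these AVX macros implement (the same
contract the SSE and NEON variants provide, and which validate_pffft_simd()
in pffft.c asserts) can be written out in scalar form. A sketch for
illustration only; the real macros shuffle __m256d lanes:

    typedef struct { double f[4]; } v4ref;

    /* INTERLEAVE2: out1 = {in1[0],in2[0],in1[1],in2[1]},
                    out2 = {in1[2],in2[2],in1[3],in2[3]} */
    static void interleave2_ref(v4ref in1, v4ref in2, v4ref *out1, v4ref *out2)
    {
      int i;
      for (i = 0; i < 2; ++i) {
        out1->f[2*i] = in1.f[i];   out1->f[2*i+1] = in2.f[i];
        out2->f[2*i] = in1.f[i+2]; out2->f[2*i+1] = in2.f[i+2];
      }
    }

    /* VTRANSPOSE4: treat x0..x3 as rows of a 4x4 matrix and transpose. */
    static void vtranspose4_ref(v4ref x[4])
    {
      v4ref t[4]; int r, c;
      for (r = 0; r < 4; ++r) for (c = 0; c < 4; ++c) t[c].f[r] = x[r].f[c];
      for (r = 0; r < 4; ++r) x[r] = t[r];
    }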
diff --git a/soxr/src/pffft-wrap.c b/soxr/src/pffft-wrap.c
new file mode 100644
index 0000000..c920f06
--- /dev/null
+++ b/soxr/src/pffft-wrap.c
@@ -0,0 +1,110 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined PFFT_MACROS_ONLY
+
+#include "math-wrap.h"
+
+#if PFFFT_DOUBLE
+ #include "util64s.h"
+#else
+ #include "util32s.h"
+ #define sin(x) sinf(x)
+ #define cos(x) cosf(x)
+#endif
+
+#define pffft_aligned_free SIMD_ALIGNED_FREE
+#define pffft_aligned_malloc SIMD_ALIGNED_MALLOC
+#define pffft_aligned_calloc SIMD_ALIGNED_CALLOC
+
+#undef inline
+#define inline __inline
+
+#endif
+
+
+
+#include "pffft.c"
+
+
+
+#if !defined PFFT_MACROS_ONLY
+
+#if !defined PFFFT_SIMD_DISABLE
+
+static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
+ int i, Ncvec = s->Ncvec;
+ const v4sf * /*RESTRICT*/ va = (const v4sf*)a;
+ const v4sf * RESTRICT vb = (const v4sf*)b;
+ v4sf * /*RESTRICT*/ vab = (v4sf*)ab;
+
+ float ar, ai, br, bi;
+
+#ifdef __arm__
+ __builtin_prefetch(va);
+ __builtin_prefetch(vb);
+ __builtin_prefetch(va+2);
+ __builtin_prefetch(vb+2);
+ __builtin_prefetch(va+4);
+ __builtin_prefetch(vb+4);
+ __builtin_prefetch(va+6);
+ __builtin_prefetch(vb+6);
+#endif
+
+ assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
+ ar = ((v4sf_union*)va)[0].f[0];
+ ai = ((v4sf_union*)va)[1].f[0];
+ br = ((v4sf_union*)vb)[0].f[0];
+ bi = ((v4sf_union*)vb)[1].f[0];
+
+ for (i=0; i < Ncvec; i += 2) {
+ v4sf ar, ai, br, bi;
+ ar = va[2*i+0]; ai = va[2*i+1];
+ br = vb[2*i+0]; bi = vb[2*i+1];
+ VCPLXMUL(ar, ai, br, bi);
+ vab[2*i+0] = ar;
+ vab[2*i+1] = ai;
+ ar = va[2*i+2]; ai = va[2*i+3];
+ br = vb[2*i+2]; bi = vb[2*i+3];
+ VCPLXMUL(ar, ai, br, bi);
+ vab[2*i+2] = ar;
+ vab[2*i+3] = ai;
+ }
+ if (s->transform == PFFFT_REAL) {
+ ((v4sf_union*)vab)[0].f[0] = ar*br;
+ ((v4sf_union*)vab)[1].f[0] = ai*bi;
+ }
+}
+
+#else
+
+static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
+ int i, Ncvec = s->Ncvec;
+
+ if (s->transform == PFFFT_REAL) {
+ /* take care of the fftpack ordering */
+ ab[0] = a[0]*b[0];
+ ab[2*Ncvec-1] = a[2*Ncvec-1]*b[2*Ncvec-1];
+ ++ab; ++a; ++b; --Ncvec;
+ }
+ for (i=0; i < Ncvec; ++i) {
+ float ar, ai, br, bi;
+ ar = a[2*i+0]; ai = a[2*i+1];
+ br = b[2*i+0]; bi = b[2*i+1];
+ VCPLXMUL(ar, ai, br, bi);
+ ab[2*i+0] = ar;
+ ab[2*i+1] = ai;
+ }
+}
+
+#endif
+
+#include <string.h>
+
+static void pffft_reorder_back(int length, void * setup, float * data, float * work)
+{
+ memcpy(work, data, (unsigned)length * sizeof(*work));
+ pffft_zreorder(setup, work, data, PFFFT_BACKWARD);
+}
+
+#endif
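Taken together, the pieces in this wrapper give fast convolution: forward
transform, pointwise complex multiply via pffft_zconvolve(), inverse
transform. A sketch of how the static entry points combine within one
translation unit (alignment, buffer sizing and the 1/N scaling noted in
pffft.h are the caller's responsibility):

    /* Sketch only: x := x (*) h for one block, where h_freq already holds
       the forward transform of the filter; all buffers SIMD-aligned. */
    static void fft_convolve_block(PFFFT_Setup *s, float *x,
                                   float const *h_freq, float *scratch)
    {
      /* forward DFT in pffft's internal (unordered) order */
      pffft_transform(s, x, x, scratch, PFFFT_FORWARD);
      /* frequency-domain multiply; the SIMD path above special-cases the
         packed DC/Nyquist terms of PFFFT_REAL transforms */
      pffft_zconvolve(s, x, h_freq, x);
      /* back to the time domain; result is scaled by the FFT length */
      pffft_transform(s, x, x, scratch, PFFFT_BACKWARD);
    }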
diff --git a/soxr/src/pffft.c b/soxr/src/pffft.c
index 957e604..46c841e 100644
--- a/soxr/src/pffft.c
+++ b/soxr/src/pffft.c
@@ -1,4 +1,7 @@
-/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com )
+/* https://bitbucket.org/jpommier/pffft/raw/483453d8f7661058e74aa4e7cf5c27bcd7887e7a/pffft.c
+ * with minor changes for libsoxr. */
+
+/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB
(http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
@@ -57,29 +60,12 @@
- 2011/10/02, version 1: This is the very first release of this file.
*/
-#if !defined PFFT_MACROS_ONLY
#include "pffft.h"
-#include "simd.h"
-#include <stdio.h>
#include <stdlib.h>
+#include <stdio.h>
#include <math.h>
#include <assert.h>
-#define pffft_aligned_free _soxr_simd_aligned_free
-#define pffft_aligned_malloc _soxr_simd_aligned_malloc
-#define pffft_aligned_calloc _soxr_simd_aligned_calloc
-#endif
-
-/*
- vector support macros: the rest of the code is independant of
- SSE/Altivec/NEON -- adding support for other platforms with 4-element
- vectors should be limited to these macros
-*/
-
-
-/* define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code */
-/*#define PFFFT_SIMD_DISABLE */
-
/* detect compiler flavour */
#if defined(_MSC_VER)
# define COMPILER_MSVC
@@ -91,14 +77,25 @@
# define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
# define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
# define RESTRICT __restrict
-/*# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; */
+# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__];
#elif defined(COMPILER_MSVC)
# define ALWAYS_INLINE(return_type) __forceinline return_type
# define NEVER_INLINE(return_type) __declspec(noinline) return_type
# define RESTRICT __restrict
-/*# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (v4sf*)_alloca(size__ * sizeof(type__)) */
+# define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__))
#endif
+
+/*
+ vector support macros: the rest of the code is independant of
+ SSE/Altivec/NEON -- adding support for other platforms with 4-element
+ vectors should be limited to these macros
+*/
+
+
+/* define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code */
+/*#define PFFFT_SIMD_DISABLE */
+
/*
Altivec support macros
*/
@@ -136,9 +133,11 @@ inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_p
*/
#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+# define SIMD_SZ 4 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
+
+#if !PFFFT_DOUBLE
#include <xmmintrin.h>
typedef __m128 v4sf;
-# define SIMD_SZ 4 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
# define VZERO() _mm_setzero_ps()
# define VMUL(a,b) _mm_mul_ps(a,b)
# define VADD(a,b) _mm_add_ps(a,b)
@@ -151,10 +150,14 @@ typedef __m128 v4sf;
# define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
# define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0)
+#else
+#include "pffft-avx.h"
+#endif
+
/*
ARM NEON support macros
*/
-#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__arm64__) || defined(__aarch64__))
+#elif !defined(PFFFT_SIMD_DISABLE) && defined(__arm__)
# include <arm_neon.h>
typedef float32x4_t v4sf;
# define SIMD_SZ 4
@@ -166,7 +169,7 @@ typedef float32x4_t v4sf;
# define LD_PS1(p) vld1q_dup_f32(&(p))
# define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
# define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
-# define VTRANSPOSE4_(x0,x1,x2,x3) { \
+# define VTRANSPOSE4(x0,x1,x2,x3) { \
float32x4x2_t t0_ = vzipq_f32(x0, x2); \
float32x4x2_t t1_ = vzipq_f32(x1, x3); \
float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \
@@ -174,7 +177,7 @@ typedef float32x4_t v4sf;
x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
}
/* marginally faster version */
-# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); }
+/*# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } */
# define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
# define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0)
#else
@@ -184,6 +187,10 @@ typedef float32x4_t v4sf;
# endif
#endif
+#if PFFFT_DOUBLE
+#define float double
+#endif
+
/* fallback mode for situations where SSE/Altivec are not available, use scalar mode instead */
#ifdef PFFFT_SIMD_DISABLE
typedef float v4sf;
@@ -200,6 +207,12 @@ typedef float v4sf;
/* shortcuts for complex multiplcations */
#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); }
+#ifndef SVMUL
+/* multiply a scalar with a vector */
+#define SVMUL(f,v) VMUL(LD_PS1(f),v)
+#endif
+
+#if !defined PFFT_MACROS_ONLY
#if !defined(PFFFT_SIMD_DISABLE)
typedef union v4sf_union {
@@ -213,7 +226,8 @@ typedef union v4sf_union {
#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
/* detect bugs with the vector support macros */
-void validate_pffft_simd() {
+void validate_pffft_simd(void);
+void validate_pffft_simd(void) {
float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
v4sf_union a0, a1, a2, a3, t, u;
memcpy(a0.f, f, 4*sizeof(float));
@@ -229,7 +243,6 @@ void validate_pffft_simd() {
printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
t.v = VMADD(a1.v, a2.v,a0.v);
printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
-
INTERLEAVE2(a1.v,a2.v,t.v,u.v);
printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
@@ -252,20 +265,23 @@ void validate_pffft_simd() {
#endif
#endif /*!PFFFT_SIMD_DISABLE */
-#if !defined PFFT_MACROS_ONLY
+#if 0
+/* SSE and co like 16-bytes aligned pointers */
+#define MALLOC_V4SF_ALIGNMENT 64 /* with a 64-byte alignment, we are even aligned on L2 cache lines... */
+void *pffft_aligned_malloc(size_t nb_bytes) {
+ void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT);
+ if (!p0) return (void *) 0;
+ p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1))));
+ *((void **) p - 1) = p0;
+ return p;
+}
+void pffft_aligned_free(void *p) {
+ if (p) free(*((void **) p - 1));
+}
-#if defined (COMPILER_MSVC)
- #define sin (float)sin
- #define cos (float)cos
-#else
- #define sin sinf
- #define cos cosf
-#endif
-
-/*
int pffft_simd_size() { return SIMD_SZ; }
-*/
+#endif
/*
passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2
@@ -299,6 +315,7 @@ static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, c
/*
passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3
*/
+#if 0
static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
const float *wa1, const float *wa2, float fsign) {
static const float taur = -0.5f;
@@ -311,13 +328,13 @@ static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
    for (i=0; i<ido-1; i+=2) {
+  assert(ido > 2);
+ for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
+ for (i = 0; i < ido-1; i += 2) {
+ ti5 = VSUB(cc_ref(i , 2), cc_ref(i , 5));
+ ti2 = VADD(cc_ref(i , 2), cc_ref(i , 5));
+ ti4 = VSUB(cc_ref(i , 3), cc_ref(i , 4));
+ ti3 = VADD(cc_ref(i , 3), cc_ref(i , 4));
+ tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
+ tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
+ tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
+ tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
+ ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
+ ch_ref(i , 1) = VADD(cc_ref(i , 1), VADD(ti2, ti3));
+ cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
+ ci2 = VADD(cc_ref(i , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
+ cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
+ ci3 = VADD(cc_ref(i , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
+ cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
+ ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
+ cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
+ ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
+ dr3 = VSUB(cr3, ci4);
+ dr4 = VADD(cr3, ci4);
+ di3 = VADD(ci3, cr4);
+ di4 = VSUB(ci3, cr4);
+ dr5 = VADD(cr2, ci5);
+ dr2 = VSUB(cr2, ci5);
+ di5 = VSUB(ci2, cr5);
+ di2 = VADD(ci2, cr5);
+ wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
+ wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
+ VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+ ch_ref(i - 1, 2) = dr2;
+ ch_ref(i, 2) = di2;
+ VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+ ch_ref(i - 1, 3) = dr3;
+ ch_ref(i, 3) = di3;
+ VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
+ ch_ref(i - 1, 4) = dr4;
+ ch_ref(i, 4) = di4;
+ VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
+ ch_ref(i - 1, 5) = dr5;
+ ch_ref(i, 5) = di5;
+ }
+ }
+#undef ch_ref
+#undef cc_ref
+}
+#endif
+
static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
static const float minus_one = -1.f;
int i, k, l1ido = l1*ido;
@@ -425,7 +515,7 @@ static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4
if (ido % 2 == 1) return;
}
for (k=0; k < l1ido; k += ido) {
- ch[2*k + ido] = VMUL(LD_PS1(minus_one), cc[ido-1 + k + l1ido]);
+ ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
ch[2*k + ido-1] = cc[k + ido-1];
}
} /* radf2 */
@@ -460,10 +550,11 @@ static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, co
for (k = 0; k < l1ido; k += ido) {
a = cc[2*k + ido-1]; b = cc[2*k + ido];
ch[k + ido-1] = VADD(a,a);
- ch[k + ido-1 + l1ido] = VMUL(LD_PS1(minus_two), b);
+ ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
}
} /* radb2 */
+#if 0
static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
const float *wa1, const float *wa2) {
static const float taur = -0.5f;
@@ -473,8 +564,8 @@ static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT
  for (k=0; k<l1; k++) {
-  if (transform == PFFFT_REAL) { assert(N >= 32); }
- if (transform == PFFFT_COMPLEX) { assert(N >= 16); }
+ int k, m;
+ if (!s) return s;
+ /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
+ and 32 for real FFTs -- a lot of stuff would need to be rewritten to
+ handle other cases (or maybe just switch to a scalar fft, I don't know..) */
+ if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
+ if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
/*assert((N % 32) == 0); */
s->N = N;
s->transform = transform;
/* nb of complex simd vectors */
s->Ncvec = (transform == PFFFT_REAL ? N/2 : N)/SIMD_SZ;
s->data = (v4sf*)pffft_aligned_malloc(2*(size_t)s->Ncvec * sizeof(v4sf));
- if (!s->data) {
- free(s);
- return 0;
- }
+ if (!s->data) {free(s); return 0;}
s->e = (float*)s->data;
s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
@@ -988,15 +1288,22 @@ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
}
cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
}
+
+ /* check that N is decomposable with allowed prime factors */
+ for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
+ if (m != N/SIMD_SZ) {
+ pffft_destroy_setup(s); s = 0;
+ }
+
return s;
}
-static void pffft_destroy_setup(PFFFT_Setup *s) {
- if(s){
- pffft_aligned_free(s->data);
- free(s);
- }
+static
+void pffft_destroy_setup(PFFFT_Setup *s) {
+ if (!s) return;
+ pffft_aligned_free(s->data);
+ free(s);
}
#if !defined(PFFFT_SIMD_DISABLE)
@@ -1035,7 +1342,8 @@ static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) {
UNINTERLEAVE2(h0, g1, out[0], out[1]);
}
-static void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
+static
+void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
int k, N = setup->N, Ncvec = setup->Ncvec;
const v4sf *vin = (const v4sf*)in;
v4sf *vout = (v4sf*)out;
@@ -1072,7 +1380,8 @@ static void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pfff
}
}
-static void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+static
+void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
@@ -1116,7 +1425,8 @@ static void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf
}
}
-static void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+static
+void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
v4sf r0, i0, r1, i1, r2, i2, r3, i3;
v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
@@ -1342,22 +1652,23 @@ static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf
}
-static void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
+static
+void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
pffft_direction_t direction, int ordered) {
int k, Ncvec = setup->Ncvec;
int nf_odd = (setup->ifac[1] & 1);
+#if 0
/* temporary buffer is allocated on the stack if the scratch pointer is NULL */
- /*int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); */
- /*VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); */
+ int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
+ VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
+#endif
- int ib = (nf_odd ^ ordered ? 1 : 0);
const v4sf *vinput = (const v4sf*)finput;
v4sf *voutput = (v4sf*)foutput;
v4sf *buff[2];
- buff[0] = voutput, buff[1] = scratch /*? scratch : scratch_on_stack*/;
-
- /*if (scratch == 0) scratch = scratch_on_stack; */
+ int ib = (nf_odd ^ ordered ? 1 : 0);
+ buff[0] = voutput; buff[1] = scratch;
assert(VALIGNED(finput) && VALIGNED(foutput));
@@ -1415,8 +1726,8 @@ static void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, fl
}
#if 0
-static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
- int i, Ncvec = s->Ncvec;
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) {
+ int Ncvec = s->Ncvec;
const v4sf * RESTRICT va = (const v4sf*)a;
const v4sf * RESTRICT vb = (const v4sf*)b;
v4sf * RESTRICT vab = (v4sf*)ab;
@@ -1434,10 +1745,16 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo
__builtin_prefetch(va+6);
__builtin_prefetch(vb+6);
__builtin_prefetch(vab+6);
+# ifndef __clang__
+# define ZCONVOLVE_USING_INLINE_NEON_ASM
+# endif
#endif
float ar, ai, br, bi, abr, abi;
+#ifndef ZCONVOLVE_USING_INLINE_ASM
v4sf vscal = LD_PS1(scaling);
+ int i;
+#endif
assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
ar = ((v4sf_union*)va)[0].f[0];
@@ -1447,8 +1764,7 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo
abr = ((v4sf_union*)vab)[0].f[0];
abi = ((v4sf_union*)vab)[1].f[0];
-#ifdef __arm__
-# if 1 /* inline asm version */
+#ifdef ZCONVOLVE_USING_INLINE_ASM /* inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc */
const float *a_ = a, *b_ = b; float *ab_ = ab;
int N = Ncvec;
asm volatile("mov r8, %2 \n"
@@ -1484,49 +1800,7 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo
"subs %3, #2 \n"
"bne 1b \n"
: "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
-
-# else /* neon instrinsics version, 30% slower that the asm one with gcc 4.6 */
- v4sf a1r, a1i, b1r, b1i;
- v4sf a2r, a2i, b2r, b2i;
- v4sf ab1r, ab1i, ab2r, ab2i;
- for (i=0; i < Ncvec; i += 2) {
- __builtin_prefetch(va+8);
- __builtin_prefetch(va+10);
-
- a1r = *va++; a1i = *va++;
- a2r = *va++; a2i = *va++;
- b1r = *vb++; b1i = *vb++;
- b2r = *vb++; b2i = *vb++;
- ab1r = vab[0]; ab1i = vab[1];
- ab2r = vab[2]; ab2i = vab[3];
-
- v4sf z1r = VMUL(a1r, b1r);
- v4sf z2r = VMUL(a2r, b2r);
- v4sf z1i = VMUL(a1r, b1i);
- v4sf z2i = VMUL(a2r, b2i);
-
- __builtin_prefetch(vb+4);
- __builtin_prefetch(vb+6);
-
- z1r = vmlsq_f32(z1r, a1i, b1i);
- z2r = vmlsq_f32(z2r, a2i, b2i);
- z1i = vmlaq_f32(z1i, a1i, b1r);
- z2i = vmlaq_f32(z2i, a2i, b2r);
-
- __builtin_prefetch(vab+4);
- __builtin_prefetch(vab+6);
-
- ab1r = vmlaq_f32(ab1r, z1r, vscal);
- ab2r = vmlaq_f32(ab2r, z2r, vscal);
- ab1i = vmlaq_f32(ab1i, z1i, vscal);
- ab2i = vmlaq_f32(ab2i, z2i, vscal);
-
- *vab++ = ab1r; *vab++ = ab1i;
- *vab++ = ab2r; *vab++ = ab2i;
- }
-# endif
-
-#else /* not ARM, no need to use a special routine */
+#else /* default routine, works fine for non-arm cpus with current compilers */
for (i=0; i < Ncvec; i += 2) {
v4sf ar, ai, br, bi;
ar = va[2*i+0]; ai = va[2*i+1];
@@ -1548,50 +1822,14 @@ static void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const flo
}
#endif
-static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
- int i, Ncvec = s->Ncvec;
- const v4sf * /*RESTRICT*/ va = (const v4sf*)a;
- const v4sf * RESTRICT vb = (const v4sf*)b;
- v4sf * /*RESTRICT*/ vab = (v4sf*)ab;
-
- float ar, ai, br, bi;
-
-#ifdef __arm__
-#error
-#endif
- assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
- ar = ((v4sf_union*)va)[0].f[0];
- ai = ((v4sf_union*)va)[1].f[0];
- br = ((v4sf_union*)vb)[0].f[0];
- bi = ((v4sf_union*)vb)[1].f[0];
-
- for (i=0; i < Ncvec; i += 2) {
- v4sf ar, ai, br, bi;
- ar = va[2*i+0]; ai = va[2*i+1];
- br = vb[2*i+0]; bi = vb[2*i+1];
- VCPLXMUL(ar, ai, br, bi);
- vab[2*i+0] = ar;
- vab[2*i+1] = ai;
- ar = va[2*i+2]; ai = va[2*i+3];
- br = vb[2*i+2]; bi = vb[2*i+3];
- VCPLXMUL(ar, ai, br, bi);
- vab[2*i+2] = ar;
- vab[2*i+3] = ai;
- }
- if (s->transform == PFFFT_REAL) {
- ((v4sf_union*)vab)[0].f[0] = ar*br;
- ((v4sf_union*)vab)[1].f[0] = ai*bi;
- }
-}
-
-
#else /* defined(PFFFT_SIMD_DISABLE) */
/* standard routine using scalar floats, without SIMD stuff. */
#define pffft_zreorder_nosimd pffft_zreorder
-static void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
+static
+void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
int k, N = setup->N;
if (setup->transform == PFFFT_COMPLEX) {
for (k=0; k < 2*N; ++k) out[k] = in[k];
@@ -1611,19 +1849,22 @@ static void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *ou
}
#define pffft_transform_internal_nosimd pffft_transform_internal
-static void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
+static
+void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
pffft_direction_t direction, int ordered) {
int Ncvec = setup->Ncvec;
int nf_odd = (setup->ifac[1] & 1);
+#if 0
/* temporary buffer is allocated on the stack if the scratch pointer is NULL */
- /*int stack_allocate = (scratch == 0 ? Ncvec*2 : 1); */
- /*VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); */
- /*if (scratch == 0) scratch = scratch_on_stack; */
-
- int ib;
+ int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
+ VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
+#endif
float *buff[2];
- buff[0] = output, buff[1] = scratch;
+ int ib;
+ /* if (scratch == 0) scratch = scratch_on_stack; */
+ buff[0] = output; buff[1] = scratch;
+
if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */
ib = (nf_odd ^ ordered ? 1 : 0);
@@ -1669,7 +1910,7 @@ static void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *inp
#if 0
#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
-static void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
+void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
float *ab, float scaling) {
int i, Ncvec = s->Ncvec;
@@ -1690,40 +1931,16 @@ static void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, co
}
#endif
-#define pffft_zconvolve_nosimd pffft_zconvolve
-static void pffft_zconvolve_nosimd(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
- int i, Ncvec = s->Ncvec;
-
- if (s->transform == PFFFT_REAL) {
- /* take care of the fftpack ordering */
- ab[0] = a[0]*b[0];
- ab[2*Ncvec-1] = a[2*Ncvec-1]*b[2*Ncvec-1];
- ++ab; ++a; ++b; --Ncvec;
- }
- for (i=0; i < Ncvec; ++i) {
- float ar, ai, br, bi;
- ar = a[2*i+0]; ai = a[2*i+1];
- br = b[2*i+0]; bi = b[2*i+1];
- VCPLXMUL(ar, ai, br, bi);
- ab[2*i+0] = ar;
- ab[2*i+1] = ai;
- }
-}
-
#endif /* defined(PFFFT_SIMD_DISABLE) */
-static void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
+static
+void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0);
}
-static void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
+static
+void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1);
}
-
-static void pffft_reorder_back(int length, void * setup, float * data, float * work)
-{
- memcpy(work, data, (unsigned)length * sizeof(*work));
- pffft_zreorder(setup, work, data, PFFFT_BACKWARD);
-}
#endif
diff --git a/soxr/src/pffft.h b/soxr/src/pffft.h
index 78d936b..63522ca 100644
--- a/soxr/src/pffft.h
+++ b/soxr/src/pffft.h
@@ -1,4 +1,9 @@
-/* Copyright (c) 2011 Julien Pommier ( pommier@modartt.com )
+/* https://bitbucket.org/jpommier/pffft/raw/483453d8f7661058e74aa4e7cf5c27bcd7887e7a/pffft.h
+ * with minor changes for libsoxr. */
+
+#if !defined PFFT_MACROS_ONLY
+
+/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
authored by Dr Paul Swarztrauber of NCAR, in 1985.
@@ -60,8 +65,9 @@
- 1D transforms only, with 32-bit single precision.
- supports only transforms for inputs of length N of the form
- N=(2^a)*(3^b), a >= 5 and b >=0 (32, 48, 64, 96, 128, 144 etc
- are all acceptable lengths). Performance is best for 128<=N<=8192.
+ N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
+ 144, 160, etc are all acceptable lengths). Performance is best for
+ 128<=N<=8192.
- all (float*) pointers in the functions below are expected to
have an "simd-compatible" alignment, that is 16 bytes on x86 and
@@ -80,6 +86,10 @@
#ifdef __cplusplus
extern "C" {
+#endif
+
+#if PFFFT_DOUBLE
+#define float double
#endif
/* opaque struct holding internal stuff (precomputed twiddle factors)
@@ -99,8 +109,10 @@ extern "C" {
PFFFT_Setup structure is read-only so it can safely be shared by
multiple concurrent threads.
*/
- static PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
- static void pffft_destroy_setup(PFFFT_Setup *);
+ static
+ PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
+ static
+ void pffft_destroy_setup(PFFFT_Setup *);
/*
Perform a Fourier transform , The z-domain data is stored in the
most efficient order for transforming it back, or using it for
@@ -113,13 +125,14 @@ extern "C" {
Typically you will want to scale the backward transform by 1/N.
The 'work' pointer should point to an area of N (2*N for complex
- fft) floats, properly aligned. [del]If 'work' is NULL, then stack will
- be used instead (this is probably the beest strategy for small
- FFTs, say for N < 16384).[/del]
+ fft) floats, properly aligned. If 'work' is NULL, then stack will
+ be used instead (this is probably the best strategy for small
+ FFTs, say for N < 16384).
input and output may alias.
*/
- static void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+ static
+ void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/*
Similar to pffft_transform, but makes sure that the output is
@@ -128,7 +141,8 @@ extern "C" {
input and output may alias.
*/
- static void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+ static
+ void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/*
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
@@ -142,7 +156,8 @@ extern "C" {
input and output should not alias.
*/
- static void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
+ static
+ void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
/*
Perform a multiplication of the frequency components of dft_a and
@@ -155,23 +170,28 @@ extern "C" {
the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
- void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
*/
+ void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
/*
- the operation performed is: dft_ab = (dft_a * fdt_b)
-
- The dft_a, dft_b and dft_ab pointers may alias.
+ the float buffers must have the correct alignment (16-byte boundary
+ on intel and powerpc). This function may be used to obtain such
+ correctly aligned buffers.
*/
- static void pffft_zconvolve(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab);
+#if 0
+ void *pffft_aligned_malloc(size_t nb_bytes);
+ void pffft_aligned_free(void *);
/* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
- int pffft_simd_size(void);
+ int pffft_simd_size();
+#endif
- static void pffft_reorder_back(int length, void * setup, float * data, float * work);
+#undef float
#ifdef __cplusplus
}
#endif
#endif
+
+#endif
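A quick way to see which lengths satisfy the constraints documented above:
with SIMD enabled (SIMD_SZ == 4), pffft_new_setup() asserts that N is a
multiple of 32 for real transforms (16 for complex) and then verifies that
N/SIMD_SZ factors completely into 2s, 3s and 5s. A hypothetical checker, not
part of the library:

    static int pffft_length_is_usable(int n, int is_real)
    {
      int m;
      if (n <= 0 || n % (is_real ? 32 : 16)) return 0;
      for (m = n / 4; m % 2 == 0; m /= 2) ;  /* strip factors of 2 */
      while (m % 3 == 0) m /= 3;             /* ...of 3 */
      while (m % 5 == 0) m /= 5;             /* ...of 5 */
      return m == 1;
    }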
diff --git a/soxr/src/pffft32.c b/soxr/src/pffft32.c
index 21bd845..c4c8e0a 100644
--- a/soxr/src/pffft32.c
+++ b/soxr/src/pffft32.c
@@ -1,11 +1,14 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-#define _soxr_simd_aligned_free free
-#define _soxr_simd_aligned_malloc malloc
+#define SIMD_ALIGNED_FREE free
+#define SIMD_ALIGNED_MALLOC malloc
#define PFFFT_SIMD_DISABLE
-#include "pffft.c"
+#define PFFFT_DOUBLE 0
+#include "pffft-wrap.c"
+
#include "filter.h"
+#include "rdft_t.h"
static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);}
static void delete_setup(void * setup) {pffft_destroy_setup(setup);}
@@ -15,18 +18,22 @@ static void backward (int length, void * setup, float * H, float * scratch) {pff
static void obackward(int length, void * setup, float * H, float * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
static void convolve(int length, void * setup, float * H, float const * with) { pffft_zconvolve(setup, H, with, H); (void)length;}
static int multiplier(void) {return 1;}
+static int flags(void) {return RDFT_NEEDS_SCRATCH;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32_cb[] = {
- (fn_t)setup,
- (fn_t)setup,
- (fn_t)delete_setup,
- (fn_t)forward,
- (fn_t)oforward,
- (fn_t)backward,
- (fn_t)obackward,
- (fn_t)convolve,
- (fn_t)_soxr_ordered_partial_convolve_f,
- (fn_t)multiplier,
- (fn_t)pffft_reorder_back,
+rdft_cb_table _soxr_rdft32_cb = {
+ setup,
+ setup,
+ delete_setup,
+ forward,
+ oforward,
+ backward,
+ obackward,
+ convolve,
+ _soxr_ordered_partial_convolve_f,
+ multiplier,
+ pffft_reorder_back,
+ malloc,
+ calloc,
+ free,
+ flags,
};
diff --git a/soxr/src/pffft32s.c b/soxr/src/pffft32s.c
index d049990..06f8fd5 100644
--- a/soxr/src/pffft32s.c
+++ b/soxr/src/pffft32s.c
@@ -1,27 +1,34 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-#include "pffft.c"
+#define PFFFT_DOUBLE 0
+#include "pffft-wrap.c"
+
+#include "rdft_t.h"
static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);}
static void forward (int length, void * setup, float * h, float * scratch) {pffft_transform (setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
static void oforward (int length, void * setup, float * h, float * scratch) {pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
static void backward (int length, void * setup, float * H, float * scratch) {pffft_transform (setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
static void obackward(int length, void * setup, float * H, float * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
-static void convolve(int length, void * setup, float * H, float const * with) { pffft_zconvolve(setup, H, with, H); (void)length;}
+static void convolve(int length, void * setup, float * H, float const * with) {pffft_zconvolve(setup, H, with, H); (void)length;}
static int multiplier(void) {return 1;}
+static int flags(void) {return RDFT_IS_SIMD | RDFT_NEEDS_SCRATCH;}
-typedef void (* fn_t)(void);
-fn_t _soxr_rdft32s_cb[] = {
- (fn_t)setup,
- (fn_t)setup,
- (fn_t)pffft_destroy_setup,
- (fn_t)forward,
- (fn_t)oforward,
- (fn_t)backward,
- (fn_t)obackward,
- (fn_t)convolve,
- (fn_t)_soxr_ordered_partial_convolve_simd,
- (fn_t)multiplier,
- (fn_t)pffft_reorder_back,
+rdft_cb_table _soxr_rdft32s_cb = {
+ setup,
+ setup,
+ pffft_destroy_setup,
+ forward,
+ oforward,
+ backward,
+ obackward,
+ convolve,
+ ORDERED_PARTIAL_CONVOLVE_SIMD,
+ multiplier,
+ pffft_reorder_back,
+ SIMD_ALIGNED_MALLOC,
+ SIMD_ALIGNED_CALLOC,
+ SIMD_ALIGNED_FREE,
+ flags,
};
diff --git a/soxr/src/pffft64s.c b/soxr/src/pffft64s.c
new file mode 100644
index 0000000..82f6504
--- /dev/null
+++ b/soxr/src/pffft64s.c
@@ -0,0 +1,34 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define PFFFT_DOUBLE 1
+#include "pffft-wrap.c"
+
+#include "rdft_t.h"
+
+static void * setup(int len) {return pffft_new_setup(len, PFFFT_REAL);}
+static void forward (int length, void * setup, double * h, double * scratch) {pffft_transform (setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
+static void oforward (int length, void * setup, double * h, double * scratch) {pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD); (void)length;}
+static void backward (int length, void * setup, double * H, double * scratch) {pffft_transform (setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
+static void obackward(int length, void * setup, double * H, double * scratch) {pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);(void)length;}
+static void convolve(int length, void * setup, double * H, double const * with) {pffft_zconvolve(setup, H, with, H); (void)length;}
+static int multiplier(void) {return 1;}
+static int flags(void) {return RDFT_IS_SIMD | RDFT_NEEDS_SCRATCH;}
+
+rdft_cb_table _soxr_rdft64s_cb = {
+ setup,
+ setup,
+ pffft_destroy_setup,
+ forward,
+ oforward,
+ backward,
+ obackward,
+ convolve,
+ ORDERED_PARTIAL_CONVOLVE_SIMD,
+ multiplier,
+ pffft_reorder_back,
+ SIMD_ALIGNED_MALLOC,
+ SIMD_ALIGNED_CALLOC,
+ SIMD_ALIGNED_FREE,
+ flags,
+};
diff --git a/soxr/src/poly-fir.h b/soxr/src/poly-fir.h
index f7b4261..d138e03 100644
--- a/soxr/src/poly-fir.h
+++ b/soxr/src/poly-fir.h
@@ -1,97 +1,149 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-/* Resample using an interpolated poly-phase FIR with length LEN.*/
-/* Input must be followed by LEN-1 samples. */
+/* Resample using an interpolated poly-phase FIR with length LEN. */
+/* Input must be followed by FIR_LENGTH-1 samples. */
-#define a (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 0,j))
-#define b (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 1,j))
-#define c (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 2,j))
-#define d (coef(p->shared->poly_fir_coefs, COEF_INTERP, FIR_LENGTH, phase, 3,j))
-#if COEF_INTERP == 0
- #define _ sum += a *in[j], ++j;
-#elif COEF_INTERP == 1
- #define _ sum += (b *x + a)*in[j], ++j;
-#elif COEF_INTERP == 2
- #define _ sum += ((c *x + b)*x + a)*in[j], ++j;
-#elif COEF_INTERP == 3
- #define _ sum += (((d*x + c)*x + b)*x + a)*in[j], ++j;
-#else
+#if COEF_INTERP != 1 && COEF_INTERP != 2 && COEF_INTERP != 3
#error COEF_INTERP
#endif
+#if SIMD_AVX || SIMD_SSE || SIMD_NEON
+ #define N (FIR_LENGTH>>2)
+
+ #if COEF_INTERP == 1
+ #define _ sum=vMac(vMac(b,X,a),vLdu(in+j*4),sum), ++j;
+ #elif COEF_INTERP == 2
+ #define _ sum=vMac(vMac(vMac(c,X,b),X,a),vLdu(in+j*4),sum), ++j;
+ #else
+ #define _ sum=vMac(vMac(vMac(vMac(d,X,c),X,b),X,a),vLdu(in+j*4),sum), ++j;
+ #endif
+
+ #define a coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-0)]
+ #define b coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-1)]
+ #define c coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-2)]
+ #define d coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-3)]
+
+ #define BEGINNING v4_t X = vLds(x), sum = vZero(); \
+ v4_t const * const __restrict coefs = (v4_t *)COEFS
+ #define END vStorSum(output+i, sum)
+ #define cc(n) case n: core(n); break
+ #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);}
+#else
+ #define N FIR_LENGTH
+
+ #if COEF_INTERP == 1
+ #define _ sum += (b*x + a)*in[j], ++j;
+ #elif COEF_INTERP == 2
+ #define _ sum += ((c*x + b)*x + a)*in[j], ++j;
+ #else
+ #define _ sum += (((d*x + c)*x + b)*x + a)*in[j], ++j;
+ #endif
+
+ #define a (coef(COEFS, COEF_INTERP, N, phase, 0,j))
+ #define b (coef(COEFS, COEF_INTERP, N, phase, 1,j))
+ #define c (coef(COEFS, COEF_INTERP, N, phase, 2,j))
+ #define d (coef(COEFS, COEF_INTERP, N, phase, 3,j))
+
+ #define BEGINNING sample_t sum = 0
+ #define END output[i] = sum
+ #define CORE(n) core(n)
+#endif
+
+
+
+#define floatPrecCore(n) { \
+ float_step_t at = p->at.flt; \
+ for (i = 0; (int)at < num_in; ++i, at += p->step.flt) { \
+ sample_t const * const __restrict in = input + (int)at; \
+ float_step_t frac = at - (int)at; \
+ int phase = (int)(frac * (1 << PHASE_BITS)); \
+ sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase); \
+ int j = 0; \
+ BEGINNING; CONVOLVE(n); END; \
+ } \
+ fifo_read(&p->fifo, (int)at, NULL); \
+ p->at.flt = at - (int)at; } /* Could round to 1 in some cirmcumstances. */
+
+
+
+#define highPrecCore(n) { \
+ step_t at; at.fix = p->at.fix; \
+ for (i = 0; at.integer < num_in; ++i, \
+ at.fix.ls.all += p->step.fix.ls.all, \
+ at.whole += p->step.whole + (at.fix.ls.all < p->step.fix.ls.all)) { \
+ sample_t const * const __restrict in = input + at.integer; \
+ uint32_t frac = at.fraction; \
+ int phase = (int)(frac >> (32 - PHASE_BITS)); /* High-order bits */ \
+ /* Low-order bits, scaled to [0,1): */ \
+ sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \
+ int j = 0; \
+ BEGINNING; CONVOLVE(n); END; \
+ } \
+ fifo_read(&p->fifo, at.integer, NULL); \
+ p->at.whole = at.fraction; \
+ p->at.fix.ls = at.fix.ls; }
+
+
+
+#define stdPrecCore(n) { \
+ int64p_t at; at.all = p->at.whole; \
+ for (i = 0; at.parts.ms < num_in; ++i, at.all += p->step.whole) { \
+ sample_t const * const __restrict in = input + at.parts.ms; \
+ uint32_t const frac = at.parts.ls; \
+ int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ \
+ /* Low-order bits, scaled to [0,1): */ \
+ sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \
+ int j = 0; \
+ BEGINNING; CONVOLVE(n); END; \
+ } \
+ fifo_read(&p->fifo, at.parts.ms, NULL); \
+ p->at.whole = at.parts.ls; }
+
+
+
+#if WITH_FLOAT_STD_PREC_CLOCK
+ #define SPCORE floatPrecCore
+#else
+ #define SPCORE stdPrecCore
+#endif
+
+
+
+#if WITH_HI_PREC_CLOCK
+ #define core(n) if (p->use_hi_prec_clock) highPrecCore(n) else SPCORE(n)
+#else
+ #define core(n) SPCORE(n)
+#endif
+
+
+
static void FUNCTION(stage_t * p, fifo_t * output_fifo)
{
sample_t const * input = stage_read_p(p);
- int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
- sample_t * output = fifo_reserve(output_fifo, max_num_out);
+ int num_in = min(stage_occupancy(p), p->input_size);
+ int i, max_num_out = 1 + (int)(num_in * p->out_in_ratio);
+ sample_t * const __restrict output = fifo_reserve(output_fifo, max_num_out);
-#if defined HI_PREC_CLOCK
-#if FLOAT_HI_PREC_CLOCK
- if (p->use_hi_prec_clock) {
- float_step_t at = p->at.flt;
- for (i = 0; (int)at < num_in; ++i, at += p->step.flt) {
- sample_t const * in = input + (int)at;
- float_step_t frac = at - (int)at;
- int phase = (int)(frac * (1 << PHASE_BITS));
-#if COEF_INTERP > 0
- sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase);
-#endif
- sample_t sum = 0;
- int j = 0;
- CONVOLVE
- output[i] = sum;
- }
- fifo_read(&p->fifo, (int)at, NULL);
- p->at.flt = at - (int)at;
- } else
-#else
- if (p->use_hi_prec_clock) {
- for (i = 0; p->at.integer < num_in; ++i,
- p->at.fix.ls.all += p->step.fix.ls.all,
- p->at.whole += p->step.whole + (p->at.fix.ls.all < p->step.fix.ls.all)) {
- sample_t const * in = input + p->at.integer;
- uint32_t frac = p->at.fraction;
- int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */
-#if COEF_INTERP > 0 /* low-order bits, scaled to [0,1) */
- sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32));
-#endif
- sample_t sum = 0;
- int j = 0;
- CONVOLVE
- output[i] = sum;
- }
- fifo_read(&p->fifo, p->at.integer, NULL);
- p->at.integer = 0;
- } else
-#endif
-#endif
- {
- for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
- sample_t const * in = input + p->at.integer;
- uint32_t frac = p->at.fraction;
- int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */
-#if COEF_INTERP > 0 /* low-order bits, scaled to [0,1) */
- sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32));
-#endif
- sample_t sum = 0;
- int j = 0;
- CONVOLVE
- output[i] = sum;
- }
- fifo_read(&p->fifo, p->at.integer, NULL);
- p->at.integer = 0;
- }
+ CORE(N);
assert(max_num_out - i >= 0);
fifo_trim_by(output_fifo, max_num_out - i);
}
+
+
#undef _
#undef a
#undef b
#undef c
#undef d
+#undef CORE
+#undef cc
+#undef core
#undef COEF_INTERP
+#undef N
+#undef BEGINNING
+#undef END
#undef CONVOLVE
#undef FIR_LENGTH
#undef FUNCTION
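The three *PrecCore variants above differ only in how they advance the input
position and split it into an integer sample index, a polyphase index and an
interpolation fraction. A sketch of the split stdPrecCore performs on the
32-bit fractional part (the PHASE_BITS value here is illustrative; MULT32 is
2^32, as defined in rate.h):

    #include <stdint.h>

    #define PHASE_BITS 10            /* illustrative; chosen per filter */
    #define MULT32 (65536. * 65536.) /* 2^32 */

    /* Split a Q32 fraction into FIR phase + sub-phase x in [0,1). */
    static void split_position(uint32_t frac, int *phase, double *x)
    {
      *phase = (int)(frac >> (32 - PHASE_BITS));        /* high-order bits */
      *x = (double)(frac << PHASE_BITS) * (1 / MULT32); /* remaining bits  */
    }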
diff --git a/soxr/src/poly-fir0.h b/soxr/src/poly-fir0.h
index 52d85b3..76fca2d 100644
--- a/soxr/src/poly-fir0.h
+++ b/soxr/src/poly-fir0.h
@@ -1,32 +1,56 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-/* Resample using a non-interpolated poly-phase FIR with length LEN.*/
-/* Input must be followed by LEN-1 samples. */
+/* Resample using a non-interpolated poly-phase FIR with length LEN. */
+/* Input must be followed by FIR_LENGTH-1 samples. */
-#define _ sum += (coef(p->shared->poly_fir_coefs, 0, FIR_LENGTH, rem, 0, j)) *at[j], ++j;
+#if SIMD_AVX || SIMD_SSE || SIMD_NEON
+ #define N (FIR_LENGTH>>2)
+ #define BEGINNING v4_t sum = vZero(); \
+ v4_t const * const __restrict coefs = (v4_t *)COEFS + N * rem;
+ #define _ sum = vMac(vLdu(at+j*4), coefs[j], sum), ++j;
+ #define END vStorSum(output+i, sum)
+ #define cc(n) case n: core(n); break
+ #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);}
+#else
+ #define N FIR_LENGTH
+ #define BEGINNING sample_t sum = 0; \
+ sample_t const * const __restrict coefs = (sample_t *)COEFS + N * rem;
+ #define _ sum += coefs[j]*at[j], ++j;
+ #define END output[i] = sum
+ #define CORE(n) core(n)
+#endif
+
+#define core(n) \
+ for (i = 0; at < num_in * p->L; ++i, at += step) { \
+ int const div = at / p->L, rem = at % p->L; \
+ sample_t const * const __restrict at = input + div; \
+ int j = 0; BEGINNING; CONVOLVE(n); END;}
static void FUNCTION(stage_t * p, fifo_t * output_fifo)
{
- sample_t const * input = stage_read_p(p);
- int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
- sample_t * output = fifo_reserve(output_fifo, max_num_out);
+ int num_in = min(stage_occupancy(p), p->input_size);
+ if (num_in) {
+ sample_t const * input = stage_read_p(p);
+ int at = p->at.integer, step = p->step.integer;
+ int i, num_out = (num_in * p->L - at + step - 1) / step;
+ sample_t * __restrict output = fifo_reserve(output_fifo, num_out);
- for (i = 0; p->at.integer < num_in * p->L; ++i, p->at.integer += p->step.integer) {
- int div = p->at.integer / p->L, rem = p->at.integer % p->L;
- sample_t const * at = input + div;
- sample_t sum = 0;
- int j = 0;
- CONVOLVE
- output[i] = sum;
+ CORE(N);
+ assert(i == num_out);
+ fifo_read(&p->fifo, at / p->L, NULL);
+ p->at.integer = at % p->L;
}
- assert(max_num_out - i >= 0);
- fifo_trim_by(output_fifo, max_num_out - i);
- fifo_read(&p->fifo, p->at.integer / p->L, NULL);
- p->at.integer = p->at.integer % p->L;
}
#undef _
+#undef CORE
+#undef cc
+#undef core
+#undef N
+#undef BEGINNING
+#undef MIDDLE
+#undef END
#undef CONVOLVE
#undef FIR_LENGTH
#undef FUNCTION
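In the rewritten core() above, the position at runs on the L-times upsampled
grid, so integer division and remainder recover the input sample index and
the coefficient phase. A small worked illustration with made-up values:

    /* L = 3, step = M = 2 (a 3:2 rate change), illustration only:
       output i taps input from at/L using coefficient phase at%L. */
    int L = 3, step = 2, at, i;
    for (i = 0, at = 0; i < 5; ++i, at += step) {
      int div = at / L, rem = at % L;
      /* i=0 -> in[0] phase 0; i=1 -> in[0] phase 2; i=2 -> in[1] phase 1 */
      (void)div; (void)rem;
    }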
diff --git a/soxr/src/rate.h b/soxr/src/rate.h
deleted file mode 100644
index f6d055a..0000000
--- a/soxr/src/rate.h
+++ /dev/null
@@ -1,726 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-14 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#include <assert.h>
-#include <math.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "filter.h"
-
-#if defined SOXR_LIB
-#include "internal.h"
-
-typedef void (* fn_t)(void);
-extern fn_t RDFT_CB[11];
-
-#define rdft_forward_setup (*(void * (*)(int))RDFT_CB[0])
-#define rdft_backward_setup (*(void * (*)(int))RDFT_CB[1])
-#define rdft_delete_setup (*(void (*)(void *))RDFT_CB[2])
-#define rdft_forward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[3])
-#define rdft_oforward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[4])
-#define rdft_backward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[5])
-#define rdft_obackward (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[6])
-#define rdft_convolve (*(void (*)(int, void *, sample_t *, sample_t const *))RDFT_CB[7])
-#define rdft_convolve_portion (*(void (*)(int, sample_t *, sample_t const *))RDFT_CB[8])
-#define rdft_multiplier (*(int (*)(void))RDFT_CB[9])
-#define rdft_reorder_back (*(void (*)(int, void *, sample_t *, sample_t *))RDFT_CB[10])
-
-#endif
-
-#if RATE_SIMD /* Align for SIMD: */
- #include "simd.h"
-#if 0 /* Not using this yet. */
- #define RATE_SIMD_POLY 1
- #define num_coefs4 ((num_coefs + 3) & ~3)
- #define coefs4_check(i) ((i) < num_coefs)
-#else
- #define RATE_SIMD_POLY 0
- #define num_coefs4 num_coefs
- #define coefs4_check(i) 1
-#endif
-
- #define aligned_free _soxr_simd_aligned_free
- #define aligned_malloc _soxr_simd_aligned_malloc
- #define aligned_calloc _soxr_simd_aligned_calloc
-#if 0
- #define FIFO_REALLOC aligned_realloc
- #define FIFO_MALLOC aligned_malloc
- #define FIFO_FREE aligned_free
-
- static void * aligned_realloc(void * q, size_t nb_bytes, size_t copy_bytes) {
- void * p = aligned_malloc(nb_bytes);
- if (p) memcpy(p, q, copy_bytes);
- aligned_free(q);
- return p;
- }
-#endif
-#else
- #define RATE_SIMD_POLY 0
- #define num_coefs4 num_coefs
- #define coefs4_check(i) 1
-
- #define aligned_free free
- #define aligned_malloc malloc
- #define aligned_calloc calloc
-#endif
-
-#define FIFO_SIZE_T int
-#include "fifo.h"
-
-typedef union { /* Int64 in parts */
- #if WORDS_BIGENDIAN
- struct {int32_t ms; uint32_t ls;} parts;
- #else
- struct {uint32_t ls; int32_t ms;} parts;
- #endif
- int64_t all;
-} int64p_t;
-
-typedef union { /* Uint64 in parts */
- #if WORDS_BIGENDIAN
- struct {uint32_t ms, ls;} parts;
- #else
- struct {uint32_t ls, ms;} parts;
- #endif
- uint64_t all;
-} uint64p_t;
-
-#define FLOAT_HI_PREC_CLOCK 0 /* Non-float hi-prec has ~96 bits. */
-#define float_step_t long double /* __float128 is also a (slow) option */
-
-#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) coef_p[(fir_len) * ((interp_order) + 1) * (phase_num) + ((interp_order) + 1) * (fir_coef_num) + (interp_order - coef_interp_num)]
-
-#define raw_coef_t double
-
-static sample_t * prepare_coefs(raw_coef_t const * coefs, int num_coefs,
- int num_phases, int interp_order, double multiplier)
-{
- int i, j, length = num_coefs4 * num_phases;
- sample_t * result = malloc((size_t)(length * (interp_order + 1)) * sizeof(*result));
- double fm1 = coefs[0], f1 = 0, f2 = 0;
-
- for (i = num_coefs4 - 1; i >= 0; --i)
- for (j = num_phases - 1; j >= 0; --j) {
- double f0 = fm1, b = 0, c = 0, d = 0; /* = 0 to kill compiler warning */
- int pos = i * num_phases + j - 1;
- fm1 = coefs4_check(i) && pos > 0 ? coefs[pos - 1] * multiplier : 0;
- switch (interp_order) {
- case 1: b = f1 - f0; break;
- case 2: b = f1 - (.5 * (f2+f0) - f1) - f0; c = .5 * (f2+f0) - f1; break;
- case 3: c=.5*(f1+fm1)-f0;d=(1/6.)*(f2-f1+fm1-f0-4*c);b=f1-f0-d-c; break;
- default: if (interp_order) assert(0);
- }
- #define coef_coef(x) \
- coef(result, interp_order, num_coefs4, j, x, num_coefs4 - 1 - i)
- coef_coef(0) = (sample_t)f0;
- if (interp_order > 0) coef_coef(1) = (sample_t)b;
- if (interp_order > 1) coef_coef(2) = (sample_t)c;
- if (interp_order > 2) coef_coef(3) = (sample_t)d;
- #undef coef_coef
- f2 = f1, f1 = f0;
- }
- return result;
-}
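The coef() macro above addresses a flat array laid out per phase, then per
FIR tap, with the interp_order+1 polynomial terms of each tap stored
highest-order first; prepare_coefs() fills it in exactly that order. For
example, with interp_order == 3:

    /* coef(p, 3, fir_len, phase, term, tap)
         == p[fir_len*4*phase + 4*tap + (3 - term)]
       so one tap's cubic terms land as {d, c, b, a}: term 3 (d) at
       offset 0 and term 0 (a, the constant) at offset 3, matching the
       coef_coef(0..3) stores in prepare_coefs(). */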
-
-typedef struct {
- int dft_length, num_taps, post_peak;
- void * dft_forward_setup, * dft_backward_setup;
- sample_t * coefs;
-} dft_filter_t;
-
-typedef struct { /* So generated filter coefs may be shared between channels */
- sample_t * poly_fir_coefs;
- dft_filter_t dft_filter[2];
-} rate_shared_t;
-
-typedef enum {
- irrational_stage = 1,
- cubic_stage,
- dft_stage,
- half_stage,
- rational_stage
-} stage_type_t;
-
-struct stage;
-typedef void (* stage_fn_t)(struct stage * input, fifo_t * output);
-#define MULT32 (65536. * 65536.)
-
-typedef union { /* Fixed point arithmetic */
- struct {uint64p_t ls; int64p_t ms;} fix;
- float_step_t flt;
-} step_t;
-
-typedef struct stage {
- /* Common to all stage types: */
- stage_type_t type;
- stage_fn_t fn;
- fifo_t fifo;
- int pre; /* Number of past samples to store */
- int pre_post; /* pre + number of future samples to store */
- int preload; /* Number of zero samples to pre-load the fifo */
- double out_in_ratio; /* For buffer management. */
-
- /* For a stage with variable (run-time generated) filter coefs: */
- rate_shared_t * shared;
- unsigned dft_filter_num; /* Which, if any, of the 2 DFT filters to use */
- sample_t * dft_scratch, * dft_out;
-
- /* For a stage with variable L/M: */
- step_t at, step;
- bool use_hi_prec_clock;
- int L, remM;
- int n, phase_bits, block_len;
- double mult, phase0;
-} stage_t;
-
-#define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
-#define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
-
-static void cubic_stage_fn(stage_t * p, fifo_t * output_fifo)
-{
- int i, num_in = stage_occupancy(p), max_num_out = 1 + (int)(num_in*p->out_in_ratio);
- sample_t const * input = stage_read_p(p);
- sample_t * output = fifo_reserve(output_fifo, max_num_out);
-
-#define integer fix.ms.parts.ms
-#define fraction fix.ms.parts.ls
-#define whole fix.ms.all
- for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
- sample_t const * s = input + p->at.integer;
- double x = p->at.fraction * (1 / MULT32);
- double b = .5*(s[1]+s[-1])-*s, a = (1/6.)*(s[2]-s[1]+s[-1]-*s-4*b);
- double c = s[1]-*s-a-b;
- output[i] = (sample_t)(p->mult * (((a*x + b)*x + c)*x + *s));
- }
- assert(max_num_out - i >= 0);
- fifo_trim_by(output_fifo, max_num_out - i);
- fifo_read(&p->fifo, p->at.integer, NULL);
- p->at.integer = 0;
-}
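A quick check that the cubic above really interpolates: with
c = s[1] - s[0] - a - b, the polynomial P(x) = ((a*x + b)*x + c)*x + s[0]
gives P(0) = s[0] and P(1) = a + b + c + s[0] = s[1], so the curve passes
through the two samples it straddles. Sketch of the per-sample arithmetic,
for illustration only:

    static double cubic_ref(double const *s, double x) /* s points at s[0] */
    {
      double b = .5*(s[1] + s[-1]) - s[0];
      double a = (1/6.)*(s[2] - s[1] + s[-1] - s[0] - 4*b);
      double c = s[1] - s[0] - a - b;
      return ((a*x + b)*x + c)*x + s[0];
    }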
-
-#if RATE_SIMD
- #define dft_out p->dft_out
-#else
- #define dft_out output
-#endif
-
-static void dft_stage_fn(stage_t * p, fifo_t * output_fifo)
-{
- sample_t * output;
- int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
- rate_shared_t const * s = p->shared;
- dft_filter_t const * f = &s->dft_filter[p->dft_filter_num];
- int const overlap = f->num_taps - 1;
-
- while (p->at.integer + p->L * num_in >= f->dft_length) {
- div_t divd = div(f->dft_length - overlap - p->at.integer + p->L - 1, p->L);
- sample_t const * input = fifo_read_ptr(&p->fifo);
- fifo_read(&p->fifo, divd.quot, NULL);
- num_in -= divd.quot;
-
- output = fifo_reserve(output_fifo, f->dft_length);
-
- if (lsx_is_power_of_2(p->L)) { /* F-domain */
- int portion = f->dft_length / p->L;
- memcpy(dft_out, input, (unsigned)portion * sizeof(*dft_out));
- rdft_oforward(portion, f->dft_forward_setup, dft_out, p->dft_scratch);
- for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
- dft_out[i] = dft_out[(portion << 1) - i],
- dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
- dft_out[portion] = dft_out[1];
- dft_out[portion + 1] = 0;
- dft_out[1] = dft_out[0];
-
- for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) {
- memcpy(dft_out + i, dft_out, (size_t)portion * sizeof(*dft_out));
- dft_out[i + 1] = 0;
- }
- if (p->step.integer > 0)
- rdft_reorder_back(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
- } else {
- if (p->L == 1)
- memcpy(dft_out, input, (size_t)f->dft_length * sizeof(*dft_out));
- else {
- memset(dft_out, 0, (size_t)f->dft_length * sizeof(*dft_out));
- for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
- dft_out[i] = input[j];
- p->at.integer = p->L - 1 - divd.rem;
- }
- if (p->step.integer > 0)
- rdft_forward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
- else
- rdft_oforward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
- }
-
- if (p->step.integer > 0) {
- rdft_convolve(f->dft_length, f->dft_backward_setup, dft_out, f->coefs);
- rdft_backward(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
-#if RATE_SIMD
- if (p->step.integer == 1)
- memcpy(output, dft_out, (size_t)f->dft_length * sizeof(sample_t));
-#endif
- if (p->step.integer != 1) {
- for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
- i += p->step.integer)
- output[j] = dft_out[i];
- p->remM = i - (f->dft_length - overlap);
- fifo_trim_by(output_fifo, f->dft_length - j);
- }
- else fifo_trim_by(output_fifo, overlap);
- }
- else { /* F-domain */
- int m = -p->step.integer;
- rdft_convolve_portion(f->dft_length >> m, dft_out, f->coefs);
- rdft_obackward(f->dft_length >> m, f->dft_backward_setup, dft_out, p->dft_scratch);
-#if RATE_SIMD
- memcpy(output, dft_out, (size_t)(f->dft_length >> m) * sizeof(sample_t));
-#endif
- fifo_trim_by(output_fifo, (((1 << m) - 1) * f->dft_length + overlap) >>m);
- }
- }
-}
-
-#undef dft_out
-
-/* Set to 4 x nearest power of 2 */
-/* or half of that if danger of causing too many cache misses. */
-static int set_dft_length(int num_taps, int min, int large)
-{
- double d = log((double)num_taps) / log(2.);
- return 1 << range_limit((int)(d + 2.77), min, max((int)(d + 1.77), large));
-}
-
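A worked example of the sizing rule in set_dft_length(): for num_taps = 1000,
d = log2(1000) is about 9.97, so the unclamped result is
1 << (int)(9.97 + 2.77) = 1 << 12 = 4096, roughly 4 x 1024 (the nearest
power of 2); the 'large' argument can lower the exponent floor to
(int)(9.97 + 1.77) = 11, giving 2048 when the larger size would cause too
many cache misses. (Illustrative arithmetic only.)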
-static void dft_stage_init(
- unsigned instance, double Fp, double Fs, double Fn, double att,
- double phase, stage_t * p, int L, int M, double * multiplier,
- int min_dft_size, int large_dft_size)
-{
- dft_filter_t * f = &p->shared->dft_filter[instance];
- int num_taps = 0, dft_length = f->dft_length, i;
- bool f_domain_m = abs(3-M) == 1 && Fs <= 1;
-
- if (!dft_length) {
- int k = phase == 50 && lsx_is_power_of_2(L) && Fn == L? L << 1 : 4;
- double * h = lsx_design_lpf(Fp, Fs, Fn, att, &num_taps, -k, -1.);
-
- if (phase != 50)
- lsx_fir_to_phase(&h, &num_taps, &f->post_peak, phase);
- else f->post_peak = num_taps / 2;
-
- dft_length = set_dft_length(num_taps, min_dft_size, large_dft_size);
- f->coefs = aligned_calloc((size_t)dft_length, sizeof(*f->coefs));
- for (i = 0; i < num_taps; ++i)
- f->coefs[(i + dft_length - num_taps + 1) & (dft_length - 1)]
- = (sample_t)(h[i] * ((1. / dft_length) * rdft_multiplier() * L * *multiplier));
- free(h);
- }
-
-#if RATE_SIMD
- p->dft_out = aligned_malloc(sizeof(sample_t) * (size_t)dft_length);
-#endif
-#if 1 /* In fact, currently, only pffft needs this. */
- p->dft_scratch = aligned_malloc(2 * sizeof(sample_t) * (size_t)dft_length);
-#endif
-
- if (!f->dft_length) {
- void * coef_setup = rdft_forward_setup(dft_length);
- int Lp = lsx_is_power_of_2(L)? L : 1;
- int Mp = f_domain_m? M : 1;
- f->dft_forward_setup = rdft_forward_setup(dft_length / Lp);
- f->dft_backward_setup = rdft_backward_setup(dft_length / Mp);
- if (Mp == 1)
- rdft_forward(dft_length, coef_setup, f->coefs, p->dft_scratch);
- else
- rdft_oforward(dft_length, coef_setup, f->coefs, p->dft_scratch);
- rdft_delete_setup(coef_setup);
- f->num_taps = num_taps;
- f->dft_length = dft_length;
- lsx_debug("fir_len=%i dft_length=%i Fp=%g Fs=%g Fn=%g att=%g %i/%i",
- num_taps, dft_length, Fp, Fs, Fn, att, L, M);
- }
- *multiplier = 1;
- p->out_in_ratio = (double)L / M;
- p->type = dft_stage;
- p->fn = dft_stage_fn;
- p->preload = f->post_peak / L;
- p->at.integer = f->post_peak % L;
- p->L = L;
- p->step.integer = f_domain_m? -M/2 : M;
- p->dft_filter_num = instance;
- p->block_len = f->dft_length - (f->num_taps - 1);
- p->phase0 = p->at.integer / p->L;
-}
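
[editor's note] dft_stage_fn above is an overlap-save FFT convolver: of every dft_length-point transform, num_taps - 1 samples are overlap carried between blocks, and only block_len samples are fresh output (this is the p->block_len set in dft_stage_init). A quick illustration of the bookkeeping, reusing the 1000-tap / 4096-point figures from the previous sketch:

    /* Illustrative only: overlap-save throughput per FFT block. */
    #include <stdio.h>

    int main(void)
    {
      int num_taps = 1000, dft_length = 4096;
      int overlap = num_taps - 1;           /* samples re-used next block */
      int block_len = dft_length - overlap; /* fresh output per transform */
      printf("overlap=%i fresh=%i efficiency=%.0f%%\n",
          overlap, block_len, 100. * block_len / dft_length);
      return 0;
    }
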
-
-#include "filters.h"
-
-typedef struct {
- double factor;
- uint64_t samples_in, samples_out;
- int num_stages;
- stage_t * stages;
-} rate_t;
-
-#define pre_stage p->stages[shift]
-#define arb_stage p->stages[shift + have_pre_stage]
-#define post_stage p->stages[shift + have_pre_stage + have_arb_stage]
-#define have_pre_stage (preM * preL != 1)
-#define have_arb_stage (arbM * arbL != 1)
-#define have_post_stage (postM * postL != 1)
-
-#define TO_3dB(a) ((1.6e-6*a-7.5e-4)*a+.646)
-#define LOW_Q_BW0 (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
-
-typedef enum {
- rolloff_none, rolloff_small /* <= 0.01 dB */, rolloff_medium /* <= 0.35 dB */
-} rolloff_t;
-
-
-static char const * rate_init(
- /* Private work areas (to be supplied by the client): */
- rate_t * p, /* Per audio channel. */
- rate_shared_t * shared, /* Between channels (undergoing same rate change)*/
-
- /* Public parameters: Typically */
- double factor, /* Input rate divided by output rate. */
- double bits, /* Required bit-accuracy (pass + stop) 16|20|28 */
- double phase, /* Linear/minimum etc. filter phase. 50 */
- double passband_end, /* 0dB pt. bandwidth to preserve; nyquist=1 0.913*/
- double stopband_begin, /* Aliasing/imaging control; > passband_end 1 */
- rolloff_t rolloff, /* Pass-band roll-off small */
- bool maintain_3dB_pt, /* true */
- double multiplier, /* Linear gain to apply during conversion. 1 */
-
- /* Primarily for test/development purposes: */
- bool use_hi_prec_clock, /* Increase irrational ratio accuracy. false */
- int interpolator, /* Force a particular coef interpolator. -1 */
- size_t max_coefs_size, /* k bytes of coefs to try to keep below. 400 */
- bool noSmallIntOpt, /* Disable small integer optimisations. false */
- int log2_min_dft_size,
- int log2_large_dft_size)
-{
- double att = (bits + 1) * linear_to_dB(2.), attArb = att; /* pass + stop */
- double tbw0 = 1 - passband_end, Fs_a = stopband_begin;
- double arbM = factor, tbw_tighten = 1;
- int n = 0, i, preL = 1, preM = 1, shift = 0, arbL = 1, postL = 1, postM = 1;
- bool upsample = false, rational = false, iOpt = !noSmallIntOpt;
- int mode = rolloff > rolloff_small? factor > 1 || passband_end > LOW_Q_BW0:
- (int)ceil(2 + (bits - 17) / 4);
- stage_t * s;
-
- assert(factor > 0);
- assert(!bits || (15 <= bits && bits <= 33));
- assert(0 <= phase && phase <= 100);
- assert(.53 <= passband_end);
- assert(stopband_begin <= 1.2);
- assert(passband_end + .005 < stopband_begin);
-
- p->factor = factor;
- if (bits) while (!n++) { /* Determine stages: */
- int try, L, M, x, maxL = interpolator > 0? 1 : mode? 2048 :
- (int)ceil((double)max_coefs_size * 1000. / (U100_l * sizeof(sample_t)));
- double d, epsilon = 0, frac;
- upsample = arbM < 1;
- for (i = (int)(arbM * .5), shift = 0; i >>= 1; arbM *= .5, ++shift);
- preM = upsample || (arbM > 1.5 && arbM < 2);
- postM = 1 + (arbM > 1 && preM), arbM /= postM;
- preL = 1 + (!preM && arbM < 2) + (upsample && mode), arbM *= preL;
- if ((frac = arbM - (int)arbM))
- epsilon = fabs((uint32_t)(frac * MULT32 + .5) / (frac * MULT32) - 1);
- for (i = 1, rational = !frac; i <= maxL && !rational; ++i) {
- d = frac * i, try = (int)(d + .5);
- if ((rational = fabs(try / d - 1) <= epsilon)) { /* No long doubles! */
- if (try == i)
- arbM = ceil(arbM), shift += arbM > 2, arbM /= 1 + (arbM > 2);
- else arbM = i * (int)arbM + try, arbL = i;
- }
- }
- L = preL * arbL, M = (int)(arbM * postM), x = (L|M)&1, L >>= !x, M >>= !x;
- if (iOpt && postL == 1 && (d = preL * arbL / arbM) > 4 && d != 5) {
- for (postL = 4, i = (int)(d / 16); (i >>= 1) && postL < 256; postL <<= 1);
- arbM = arbM * postL / arbL / preL, arbL = 1, n = 0;
- } else if (rational && (max(L, M) < 3 + 2 * iOpt || L * M < 6 * iOpt))
- preL = L, preM = M, arbM = arbL = postM = 1;
- if (!mode && (!rational || !n))
- ++mode, n = 0;
- }
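
[editor's note] The stage-determination loop above searches for a rational approximation L/M to the conversion ratio so that cheap integer-ratio stages can do most of the work. A simplified standalone version of that search (hypothetical; the real code also weighs the epsilon tolerance and coefficient-table size): for 44100 Hz to 48000 Hz, factor = 44100/48000 and the exact fraction 147/160 is recovered.

    /* Simplified rational-ratio search (cf. the loop above). */
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
      double factor = 44100. / 48000;        /* input rate / output rate */
      int L, M = 0;
      for (L = 1; L <= 2048; ++L) {
        double d = factor * L;
        M = (int)(d + .5);
        if (fabs(M / d - 1) < 1e-12)         /* close enough: rational   */
          break;
      }
      printf("factor=%.6f ~= %i/%i\n", factor, M, L);
      return 0;
    }
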
-
- p->num_stages = shift + have_pre_stage + have_arb_stage + have_post_stage;
- if (!p->num_stages && multiplier != 1) {
- bits = arbL = 0; /* Use cubic_stage in this case. */
- ++p->num_stages;
- }
- p->stages = calloc((size_t)p->num_stages + 1, sizeof(*p->stages));
- for (i = 0; i < p->num_stages; ++i)
- p->stages[i].shared = shared;
-
- if ((n = p->num_stages) > 1) { /* Att. budget: */
- if (have_arb_stage)
- att += linear_to_dB(2.), attArb = att, --n;
- att += linear_to_dB((double)n);
- }
-
- for (n = 0; (size_t)n + 1 < array_length(half_firs) && att > half_firs[n].att; ++n);
- for (i = 0, s = p->stages; i < shift; ++i, ++s) {
- s->type = half_stage;
- s->fn = half_firs[n].fn;
- s->pre_post = 4 * half_firs[n].num_coefs;
- s->preload = s->pre = s->pre_post >> 1;
- }
-
- if (have_pre_stage) {
- if (maintain_3dB_pt && have_post_stage) { /* Trans. bands overlapping. */
- double tbw3 = tbw0 * TO_3dB(att); /* FFS: consider Fs_a. */
- double x = ((2.1429e-4 - 5.2083e-7 * att) * att - .015863) * att + 3.95;
- x = att * pow((tbw0 - tbw3) / (postM / (factor * postL) - 1 + tbw0), x);
- if (x > .035) {
- tbw_tighten = ((4.3074e-3 - 3.9121e-4 * x) * x - .040009) * x + 1.0014;
- lsx_debug("x=%g tbw_tighten=%g", x, tbw_tighten);
- }
- }
- dft_stage_init(0, 1 - tbw0 * tbw_tighten, Fs_a, preM? max(preL, preM) :
- arbM / arbL, att, phase, &pre_stage, preL, max(preM, 1), &multiplier,
- log2_min_dft_size, log2_large_dft_size);
- }
-
- if (!bits && have_arb_stage) { /* `Quick' cubic arb stage: */
- arb_stage.type = cubic_stage;
- arb_stage.fn = cubic_stage_fn;
- arb_stage.mult = multiplier, multiplier = 1;
- arb_stage.step.whole = (int64_t)(arbM * MULT32 + .5);
- arb_stage.pre_post = max(3, arb_stage.step.integer);
- arb_stage.preload = arb_stage.pre = 1;
- arb_stage.out_in_ratio = MULT32 / (double)arb_stage.step.whole;
- }
- else if (have_arb_stage) { /* Higher quality arb stage: */
- poly_fir_t const * f = &poly_firs[6*(upsample + !!preM) + mode - !upsample];
- int order, num_coefs = (int)f->interp[0].scalar, phase_bits, phases;
- size_t coefs_size;
- double x = .5, at, Fp, Fs, Fn, mult = upsample? 1 : arbL / arbM;
- poly_fir1_t const * f1;
-
- Fn = !upsample && preM? x = arbM / arbL : 1;
- Fp = !preM? mult : mode? .5 : 1;
- Fs = 2 - Fp; /* Ignore Fs_a; it would have little benefit here. */
- Fp *= 1 - tbw0;
- if (rolloff > rolloff_small && mode)
- Fp = !preM? mult * .5 - .125 : mult * .05 + .1;
- else if (rolloff == rolloff_small)
- Fp = Fs - (Fs - .148 * x - Fp * .852) * (.00813 * bits + .973);
-
- i = (interpolator < 0? !rational : max(interpolator, !rational)) - 1;
- do {
- f1 = &f->interp[++i];
- assert(f1->fn);
- if (i)
- arbM /= arbL, arbL = 1, rational = false;
- phase_bits = (int)ceil(f1->scalar + log(mult)/log(2.));
- phases = !rational? (1 << phase_bits) : arbL;
- if (!f->interp[0].scalar) {
- int phases0 = max(phases, 19), n0 = 0;
- lsx_design_lpf(Fp, Fs, -Fn, attArb, &n0, phases0, f->beta);
- num_coefs = n0 / phases0 + 1, num_coefs += num_coefs & !preM;
- }
- if ((num_coefs & 1) && rational && (arbL & 1))
- phases <<= 1, arbL <<= 1, arbM *= 2;
- at = arbL * (arb_stage.phase0 = .5 * (num_coefs & 1));
- order = i + (i && mode > 4);
- coefs_size = (size_t)(num_coefs4 * phases * (order + 1)) * sizeof(sample_t);
- } while (interpolator < 0 && i < 2 && f->interp[i+1].fn &&
- coefs_size / 1000 > max_coefs_size);
-
- if (!arb_stage.shared->poly_fir_coefs) {
- int num_taps = num_coefs * phases - 1;
- raw_coef_t * coefs = lsx_design_lpf(
- Fp, Fs, Fn, attArb, &num_taps, phases, f->beta);
- arb_stage.shared->poly_fir_coefs = prepare_coefs(
- coefs, num_coefs, phases, order, multiplier);
- lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%.3gk",
- num_coefs, phases, order, (double)coefs_size / 1000.);
- free(coefs);
- }
- multiplier = 1;
- arb_stage.type = rational? rational_stage : irrational_stage;
- arb_stage.fn = f1->fn;
- arb_stage.pre_post = num_coefs4 - 1;
- arb_stage.preload = ((num_coefs - 1) >> 1) + (num_coefs4 - num_coefs);
- arb_stage.n = num_coefs4;
- arb_stage.phase_bits = phase_bits;
- arb_stage.L = arbL;
- arb_stage.use_hi_prec_clock = mode > 1 && use_hi_prec_clock && !rational;
-#if FLOAT_HI_PREC_CLOCK
- if (arb_stage.use_hi_prec_clock) {
- arb_stage.at.flt = at;
- arb_stage.step.flt = arbM;
- arb_stage.out_in_ratio = (double)(arbL / arb_stage.step.flt);
- } else
-#endif
- {
- arb_stage.at.whole = (int64_t)(at * MULT32 + .5);
-#if !FLOAT_HI_PREC_CLOCK
- if (arb_stage.use_hi_prec_clock) {
- arb_stage.at.fix.ls.parts.ms = 0x80000000ul;
- arbM *= MULT32;
- arb_stage.step.whole = (int64_t)arbM;
- arbM -= (double)arb_stage.step.whole;
- arbM *= MULT32 * MULT32;
- arb_stage.step.fix.ls.all = (uint64_t)arbM;
- } else
-#endif
- arb_stage.step.whole = (int64_t)(arbM * MULT32 + .5);
- arb_stage.out_in_ratio = MULT32 * arbL / (double)arb_stage.step.whole;
- }
- }
-
- if (have_post_stage)
- dft_stage_init(1, 1 - (1 - (1 - tbw0) *
- (upsample? factor * postL / postM : 1)) * tbw_tighten, Fs_a,
- (double)max(postL, postM), att, phase, &post_stage, postL, postM,
- &multiplier, log2_min_dft_size, log2_large_dft_size);
-
-
- lsx_debug("%g: »%i⋅%i/%i⋅%i/%g⋅%i/%i",
- 1/factor, shift, preL, preM, arbL, arbM, postL, postM);
- for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) {
- fifo_create(&s->fifo, (int)sizeof(sample_t));
- memset(fifo_reserve(&s->fifo, s->preload), 0, sizeof(sample_t) * (size_t)s->preload);
- lsx_debug("%5i|%-5i preload=%i remL=%i o/i=%g",
- s->pre, s->pre_post - s->pre, s->preload, s->at.integer, s->out_in_ratio);
- }
- fifo_create(&s->fifo, (int)sizeof(sample_t));
- return 0;
-}
-
-static void rate_process(rate_t * p)
-{
- stage_t * stage = p->stages;
- int i;
- for (i = 0; i < p->num_stages; ++i, ++stage)
- stage->fn(stage, &(stage+1)->fifo);
-}
-
-static sample_t * rate_input(rate_t * p, sample_t const * samples, size_t n)
-{
- p->samples_in += n;
- return fifo_write(&p->stages[0].fifo, (int)n, samples);
-}
-
-static sample_t const * rate_output(rate_t * p, sample_t * samples, size_t * n)
-{
- fifo_t * fifo = &p->stages[p->num_stages].fifo;
- p->samples_out += *n = min(*n, (size_t)fifo_occupancy(fifo));
- return fifo_read(fifo, (int)*n, samples);
-}
-
-static void rate_flush(rate_t * p)
-{
- fifo_t * fifo = &p->stages[p->num_stages].fifo;
-#if defined _MSC_VER && _MSC_VER == 1200
- uint64_t samples_out = (uint64_t)(int64_t)((double)(int64_t)p->samples_in / p->factor + .5);
-#else
- uint64_t samples_out = (uint64_t)((double)p->samples_in / p->factor + .5);
-#endif
- size_t remaining = (size_t)(samples_out - p->samples_out);
-
- if ((size_t)fifo_occupancy(fifo) < remaining) {
- uint64_t samples_in = p->samples_in;
- sample_t * buff = calloc(1024, sizeof(*buff));
-
- while ((size_t)fifo_occupancy(fifo) < remaining) {
- rate_input(p, buff, 1024);
- rate_process(p);
- }
- fifo_trim_to(fifo, (int)remaining);
- p->samples_in = samples_in;
- free(buff);
- }
-}
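
[editor's note] rate_input, rate_process, rate_output and rate_flush together form the per-channel pipeline: input is queued into stage 0's fifo, each stage then pumps into its successor's fifo, and flushing zero-pads the input until the expected samples_in / factor outputs have emerged. A minimal driver sketch (hypothetical; assumes a rate_t already set up by rate_init):

    /* Hypothetical one-shot driver for a single channel. */
    static size_t resample_all(rate_t * p, sample_t const * in, size_t ilen,
        sample_t * out, size_t max_olen)
    {
      size_t odone = max_olen;
      rate_input(p, in, ilen);      /* queue into stage 0's fifo          */
      rate_process(p);              /* pump every stage once              */
      rate_flush(p);                /* zero-pad until all output is ready */
      rate_output(p, out, &odone);  /* odone <- samples actually read     */
      return odone;
    }
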
-
-static void rate_close(rate_t * p)
-{
- rate_shared_t * shared = p->stages[0].shared;
- int i;
-
- for (i = 0; i <= p->num_stages; ++i) {
- stage_t * s = &p->stages[i];
- aligned_free(s->dft_scratch);
- aligned_free(s->dft_out);
- fifo_delete(&s->fifo);
- }
- if (shared) {
- for (i = 0; i < 2; ++i) {
- dft_filter_t * f= &shared->dft_filter[i];
- aligned_free(f->coefs);
- rdft_delete_setup(f->dft_forward_setup);
- rdft_delete_setup(f->dft_backward_setup);
- }
- free(shared->poly_fir_coefs);
- memset(shared, 0, sizeof(*shared));
- }
- free(p->stages);
-}
-
-#if defined SOXR_LIB
-static double rate_delay(rate_t * p)
-{
-#if defined _MSC_VER && _MSC_VER == 1200
- double samples_out = (double)(int64_t)p->samples_in / p->factor;
- return max(0, samples_out - (double)(int64_t)p->samples_out);
-#else
- double samples_out = (double)p->samples_in / p->factor;
- return max(0, samples_out - (double)p->samples_out);
-#endif
-}
-
-static void rate_sizes(size_t * shared, size_t * channel)
-{
- *shared = sizeof(rate_shared_t);
- *channel = sizeof(rate_t);
-}
-
-#include "soxr.h"
-
-static char const * rate_create(
- void * channel,
- void * shared,
- double io_ratio,
- soxr_quality_spec_t * q_spec,
- soxr_runtime_spec_t * r_spec,
- double scale)
-{
- return rate_init(
- channel, shared,
- io_ratio,
- q_spec->precision,
- q_spec->phase_response,
- q_spec->passband_end,
- q_spec->stopband_begin,
- "\1\2\0"[q_spec->flags & 3],
- !!(q_spec->flags & SOXR_MAINTAIN_3DB_PT),
- scale,
- !!(q_spec->flags & SOXR_HI_PREC_CLOCK),
- (int)(r_spec->flags & 3) - 1,
- r_spec->coef_size_kbytes,
- !!(r_spec->flags & SOXR_NOSMALLINTOPT),
- (int)r_spec->log2_min_dft_size,
- (int)r_spec->log2_large_dft_size);
-}
-
-static char const * id(void)
-{
- return RATE_ID;
-}
-
-fn_t RATE_CB[] = {
- (fn_t)rate_input,
- (fn_t)rate_process,
- (fn_t)rate_output,
- (fn_t)rate_flush,
- (fn_t)rate_close,
- (fn_t)rate_delay,
- (fn_t)rate_sizes,
- (fn_t)rate_create,
- (fn_t)0,
- (fn_t)id,
-};
-#endif
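
[editor's note] RATE_CB publishes the engine's entry points as an ordered array of generic function pointers; the core casts each slot back to its true signature (the resampler_* macros removed from soxr.c later in this patch do exactly that). A sketch of the dispatch, with the slot index matching the table above:

    /* Illustrative dispatch through a control-block table. */
    typedef void (* fn_t)(void);
    extern fn_t _soxr_rate32_cb[];           /* single-precision engine */

    typedef double (* delay_fn_t)(void *);   /* shape of slot 5         */

    static double query_delay(void * channel)
    {
      return ((delay_fn_t)_soxr_rate32_cb[5])(channel);  /* rate_delay */
    }
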
diff --git a/soxr/src/rate32.c b/soxr/src/rate32.c
deleted file mode 100644
index d6dd3b9..0000000
--- a/soxr/src/rate32.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#define sample_t float
-#define RATE_SIMD 0
-#define RDFT_CB _soxr_rdft32_cb
-#define RATE_CB _soxr_rate32_cb
-#define RATE_ID "single-precision"
-#include "rate.h"
diff --git a/soxr/src/rate32s.c b/soxr/src/rate32s.c
deleted file mode 100644
index 26a371a..0000000
--- a/soxr/src/rate32s.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#define sample_t float
-#define RATE_SIMD 1
-#define RDFT_CB _soxr_rdft32s_cb
-#define RATE_CB _soxr_rate32s_cb
-#define RATE_ID "single-precision-SIMD"
-#include "rate.h"
diff --git a/soxr/src/rate64.c b/soxr/src/rate64.c
deleted file mode 100644
index 6289911..0000000
--- a/soxr/src/rate64.c
+++ /dev/null
@@ -1,9 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#define sample_t double
-#define RATE_SIMD 0
-#define RDFT_CB _soxr_rdft64_cb
-#define RATE_CB _soxr_rate64_cb
-#define RATE_ID "double-precision"
-#include "rate.h"
diff --git a/soxr/src/rdft.h b/soxr/src/rdft.h
index 59ba174..4ecd247 100644
--- a/soxr/src/rdft.h
+++ b/soxr/src/rdft.h
@@ -1,9 +1,11 @@
/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
-void ORDERED_CONVOLVE(int n, void * not_used, DFT_FLOAT * a, const DFT_FLOAT * b)
+void ORDERED_CONVOLVE(int n, void * not_used, void * A, const void * B)
{
int i;
+ DFT_FLOAT* a = A;
+ const DFT_FLOAT* b = B;
a[0] *= b[0];
a[1] *= b[1];
for (i = 2; i < n; i += 2) {
@@ -14,9 +16,11 @@ void ORDERED_CONVOLVE(int n, void * not_used, DFT_FLOAT * a, const DFT_FLOAT * b
(void)not_used;
}
-void ORDERED_PARTIAL_CONVOLVE(int n, DFT_FLOAT * a, const DFT_FLOAT * b)
+void ORDERED_PARTIAL_CONVOLVE(int n, void * A, const void * B)
{
int i;
+ DFT_FLOAT* a = A;
+ const DFT_FLOAT* b = B;
a[0] *= b[0];
for (i = 2; i < n; i += 2) {
DFT_FLOAT tmp = a[i];
diff --git a/soxr/src/rdft_t.h b/soxr/src/rdft_t.h
new file mode 100644
index 0000000..7e44134
--- /dev/null
+++ b/soxr/src/rdft_t.h
@@ -0,0 +1,40 @@
+/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+typedef struct {
+ void * (* forward_setup)(int);
+ void * (* backward_setup)(int);
+ void (* delete_setup)(void *);
+ void (* forward)(int, void *, void *, void *);
+ void (* oforward)(int, void *, void *, void *);
+ void (* backward)(int, void *, void *, void *);
+ void (* obackward)(int, void *, void *, void *);
+ void (* convolve)(int, void *, void *, void const *);
+ void (* convolve_portion)(int, void *, void const *);
+ int (* multiplier)(void);
+ void (* reorder_back)(int, void *, void *, void *);
+ void * (* malloc)(size_t);
+ void * (* calloc)(size_t, size_t);
+ void (* free)(void *);
+ int (* flags)(void);
+} rdft_cb_table;
+
+#define rdft_forward_setup RDFT_CB->forward_setup
+#define rdft_backward_setup RDFT_CB->backward_setup
+#define rdft_delete_setup RDFT_CB->delete_setup
+#define rdft_forward RDFT_CB->forward
+#define rdft_oforward RDFT_CB->oforward
+#define rdft_backward RDFT_CB->backward
+#define rdft_obackward RDFT_CB->obackward
+#define rdft_convolve RDFT_CB->convolve
+#define rdft_convolve_portion RDFT_CB->convolve_portion
+#define rdft_multiplier RDFT_CB->multiplier
+#define rdft_reorder_back RDFT_CB->reorder_back
+#define rdft_malloc RDFT_CB->malloc
+#define rdft_calloc RDFT_CB->calloc
+#define rdft_free RDFT_CB->free
+#define rdft_flags RDFT_CB->flags
+
+/* Flag templates: */
+#define RDFT_IS_SIMD 1
+#define RDFT_NEEDS_SCRATCH 2
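
[editor's note] The new rdft_t.h decouples the rate code from any particular FFT backend: call sites use the rdft_* macros, which resolve through whatever rdft_cb_table the including translation unit names as RDFT_CB. Registering a backend is then just filling the table; a stub sketch (names hypothetical, unfilled slots left 0 for brevity, whereas a real table such as _soxr_rdft32_cb populates them all):

    /* Stub backend wired into the vtable above (illustrative only). */
    #include <stdlib.h>

    static void * stub_setup(int n) {(void)n; return 0;}
    static void stub_fft(int n, void * setup, void * buf, void * scratch)
      {(void)n; (void)setup; (void)buf; (void)scratch;}
    static int stub_multiplier(void) {return 2;}

    rdft_cb_table const stub_rdft_cb = {
      stub_setup, stub_setup, 0,               /* setups, delete_setup    */
      stub_fft, stub_fft, stub_fft, stub_fft,  /* (o)forward, (o)backward */
      0, 0, stub_multiplier, 0,                /* convolves, mult, reorder*/
      malloc, calloc, free, 0,                 /* memory hooks, flags     */
    };
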
diff --git a/soxr/src/rint-clip.h b/soxr/src/rint-clip.h
index 06764a8..bfb6458 100644
--- a/soxr/src/rint-clip.h
+++ b/soxr/src/rint-clip.h
@@ -1,9 +1,9 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#if defined DITHER
-#define DITHERING (1./32)*(int)(((ran1>>=3)&31)-((ran2>>=3)&31))
+#define DITHERING + (1./32)*(int)(((ran1>>=3)&31)-((ran2>>=3)&31))
#define DITHER_RAND (seed = 1664525UL * seed + 1013904223UL) >> 3
#define DITHER_VARS unsigned long ran1 = DITHER_RAND, ran2 = DITHER_RAND
#define SEED_ARG , unsigned long * seed0
@@ -12,10 +12,11 @@
#define COPY_SEED1 unsigned long seed1 = seed
#define PASS_SEED1 , &seed1
#define PASS_SEED , &seed
+#define FLOATD double
#else
-#define DITHERING 0
+#define DITHERING
#define DITHER_VARS
#define SEED_ARG
#define SAVE_SEED
@@ -23,9 +24,12 @@
#define COPY_SEED1
#define PASS_SEED1
#define PASS_SEED
+#define FLOATD FLOATX
#endif
+#define DO_16 _;_;_;_;_;_;_;_;_;_;_;_;_;_;_;_
+
#if defined FE_INVALID && defined FPU_RINT
@@ -35,8 +39,8 @@ static void RINT_CLIP(RINT_T * const dest, FLOATX const * const src,
COPY_SEED
DITHER_VARS;
for (; i < n; ++i) {
- double d = src[i] + DITHERING;
- dest[stride * i] = RINT(d);
+ FLOATD const d = src[i] DITHERING;
+ RINT(dest[stride * i], d);
if (fe_test_invalid()) {
fe_clear_invalid();
dest[stride * i] = d > 0? RINT_MAX : -RINT_MAX - 1;
@@ -56,29 +60,29 @@ static size_t LSX_RINT_CLIP(void * * const dest0, FLOATX const * const src,
RINT_T * dest = *dest0;
COPY_SEED
#if defined FE_INVALID && defined FPU_RINT
-#define _ dest[i] = RINT(src[i] + DITHERING), ++i,
- fe_clear_invalid();
- for (i = 0; i < (n & ~7u);) {
+#define _ RINT(dest[i], src[i] DITHERING); ++i
+ for (i = 0; i < (n & ~15u);) {
COPY_SEED1;
DITHER_VARS;
- _ _ _ _ _ _ _ _ (void)0;
+ DO_16;
if (fe_test_invalid()) {
fe_clear_invalid();
- RINT_CLIP(dest, src, 1, i - 8, i, &clips PASS_SEED1);
+ RINT_CLIP(dest, src, 1, i - 16, i, &clips PASS_SEED1);
}
}
RINT_CLIP(dest, src, 1, i, n, &clips PASS_SEED);
#else
-#define _ d = src[i] + DITHERING, dest[i++] = (RINT_T)(d > 0? d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5),
+#define _ d = src[i] DITHERING, dest[i++] = (RINT_T)(d > 0? \
+ d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5)
const double N = 1. + RINT_MAX;
double d;
- for (i = 0; i < (n & ~7u);) {
+ for (i = 0; i < (n & ~15u);) {
DITHER_VARS;
- _ _ _ _ _ _ _ _ (void)0;
+ DO_16;
}
{
DITHER_VARS;
- for (; i < n; _ (void)0);
+ for (; i < n; _);
}
#endif
SAVE_SEED;
@@ -97,34 +101,34 @@ static size_t LSX_RINT_CLIP_2(void * * dest0, FLOATX const * const * srcs,
RINT_T * dest = *dest0;
COPY_SEED
#if defined FE_INVALID && defined FPU_RINT
-#define _ dest[stride * i] = RINT(src[i] + DITHERING), ++i,
- fe_clear_invalid();
+#define _ RINT(dest[stride * i], src[i] DITHERING); ++i
for (j = 0; j < stride; ++j, ++dest) {
FLOATX const * const src = srcs[j];
- for (i = 0; i < (n & ~7u);) {
+ for (i = 0; i < (n & ~15u);) {
COPY_SEED1;
DITHER_VARS;
- _ _ _ _ _ _ _ _ (void)0;
+ DO_16;
if (fe_test_invalid()) {
fe_clear_invalid();
- RINT_CLIP(dest, src, stride, i - 8, i, &clips PASS_SEED1);
+ RINT_CLIP(dest, src, stride, i - 16, i, &clips PASS_SEED1);
}
}
RINT_CLIP(dest, src, stride, i, n, &clips PASS_SEED);
}
#else
-#define _ d = src[i] + DITHERING, dest[stride * i++] = (RINT_T)(d > 0? d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5),
+#define _ d = src[i] DITHERING, dest[stride * i++] = (RINT_T)(d > 0? \
+ d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5)
const double N = 1. + RINT_MAX;
double d;
for (j = 0; j < stride; ++j, ++dest) {
FLOATX const * const src = srcs[j];
- for (i = 0; i < (n & ~7u);) {
+ for (i = 0; i < (n & ~15u);) {
DITHER_VARS;
- _ _ _ _ _ _ _ _ (void)0;
+ DO_16;
}
{
DITHER_VARS;
- for (; i < n; _ (void)0);
+ for (; i < n; _);
}
}
#endif
@@ -134,6 +138,7 @@ static size_t LSX_RINT_CLIP_2(void * * dest0, FLOATX const * const * srcs,
}
#undef _
+#undef FLOATD
#undef PASS_SEED
#undef PASS_SEED1
#undef COPY_SEED1
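
[editor's note] The changes above widen the unrolled conversion loop from 8 to 16 samples (the DO_16 macro) and keep using the FPU "invalid" flag as a cheap clip detector: the fast path converts blindly, and only if the flag fires is the last block of 16 re-run through RINT_CLIP (hence i - 16). The idiom in portable C99 (the library itself prefers the inline-asm equivalents in rint.h where available):

    /* Standalone illustration of flag-based overflow detection. */
    #include <fenv.h>
    #include <math.h>
    #include <stdio.h>

    int main(void)
    {
      double big = 1e30;                /* far outside long's range      */
      long y = 0;
      feclearexcept(FE_INVALID);
      y = lrint(big);                   /* raises FE_INVALID on overflow */
      if (fetestexcept(FE_INVALID))
        puts("overflow -> take the slow, clipping path");
      else
        printf("ok: %ld\n", y);
      return 0;
    }
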
diff --git a/soxr/src/rint.h b/soxr/src/rint.h
index 529e4bb..2f1dfbe 100644
--- a/soxr/src/rint.h
+++ b/soxr/src/rint.h
@@ -1,68 +1,102 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
#if !defined soxr_rint_included
#define soxr_rint_included
-#include "soxr-config.h"
-
-
-
-#if HAVE_LRINT && LONG_MAX == 2147483647L
- #include <math.h>
- #define FPU_RINT32
- #define rint32 lrint
-#elif defined __GNUC__ && (defined __i386__ || defined __x86_64__)
- #define FPU_RINT32
- static __inline int32_t rint32(double input) {
- int32_t result;
- __asm__ __volatile__("fistpl %0": "=m"(result): "t"(input): "st");
- return result;
- }
-#elif defined __GNUC__ && defined __arm__
- #define FPU_RINT32
- static __inline int32_t rint32(double input) {
- register int32_t result;
- __asm__ __volatile__ ("ftosid %0, %P1": "=w"(result): "w"(input));
- return result;
- }
-#elif defined _MSC_VER && defined _M_IX86 /* FIXME need solution for MSVC x64 */
- #define FPU_RINT32
- static __inline int32_t rint32(double input) {
- int32_t result;
- _asm {
- fld input
- fistp result
- }
- return result;
- }
-#else
- #define rint32(x) (int32_t)((x) < 0? x - .5 : x + .5)
-#endif
-
+#include "std-types.h"
+/* For x86, compiler-supplied versions of these functions (where available)
+ * can have poor performance (e.g. mingw32), so prefer these asm versions: */
#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+ #define FPU_RINT32
#define FPU_RINT16
- static __inline int16_t rint16(double input) {
- int16_t result;
- __asm__ __volatile__("fistps %0": "=m"(result): "t"(input): "st");
- return result;
+ #define rint32D(a,b) __asm__ __volatile__("fistpl %0": "=m"(a): "t"(b): "st")
+ #define rint16D(a,b) __asm__ __volatile__("fistps %0": "=m"(a): "t"(b): "st")
+ #define rint32F rint32D
+ #define rint16F rint16D
+ #define FE_INVALID 1
+ static __inline int fe_test_invalid(void) {
+ int status_word;
+ __asm__ __volatile__("fnstsw %%ax": "=a"(status_word));
+ return status_word & FE_INVALID;
}
-#elif defined _MSC_VER && defined _M_IX86 /* FIXME need solution for MSVC x64 */
+ static __inline int fe_clear_invalid(void) {
+ int32_t status[7];
+ __asm__ __volatile__("fnstenv %0": "=m"(status));
+ status[1] &= ~FE_INVALID;
+ __asm__ __volatile__("fldenv %0": : "m"(*status));
+ return 0;
+ }
+#elif defined _MSC_VER && defined _M_IX86
+ #define FPU_RINT32
#define FPU_RINT16
- static __inline int16_t rint16(double input) {
- int16_t result;
- _asm {
- fld input
- fistp result
- }
- return result;
+ #define rint_fn(N,Y,X) \
+ static __inline void N(Y *y, X x) {Y t; {__asm fld x __asm fistp t} *y=t;}
+ rint_fn(rint32d, int32_t, double)
+ rint_fn(rint32f, int32_t, float )
+ rint_fn(rint16d, int16_t, double)
+ rint_fn(rint16f, int16_t, float )
+ #define rint32D(y,x) rint32d(&(y),x)
+ #define rint32F(y,x) rint32f(&(y),x)
+ #define rint16D(y,x) rint16d(&(y),x)
+ #define rint16F(y,x) rint16f(&(y),x)
+ #define FE_INVALID 1
+ static __inline int fe_test_invalid(void) {
+ short status_word;
+ __asm fnstsw status_word
+ return status_word & FE_INVALID;
}
-#else
- #define rint16(x) (int16_t)((x) < 0? x - .5 : x + .5)
+ static __inline int fe_clear_invalid(void) {
+ int32_t status[7];
+ __asm fnstenv status
+ status[1] &= ~FE_INVALID;
+ __asm fldenv status
+ return 0;
+ }
+#elif defined _MSC_VER && defined _M_X64
+ #include <emmintrin.h>
+ #include <float.h>
+ #define FPU_RINT32
+ #define FPU_RINT16
+ static __inline void rint32d(int32_t *y, double x) {
+ *y = _mm_cvtsd_si32(_mm_load_sd(&x));}
+ static __inline void rint32f(int32_t *y, float x) {
+ *y = _mm_cvtss_si32(_mm_load_ss(&x));}
+ static __inline void rint16d(int16_t *y, double x) {
+ x = x*65536+32768; *y = (int16_t)(_mm_cvtsd_si32(_mm_load_sd(&x)) >> 16);}
+ #define rint32D(y,x) rint32d(&(y),x)
+ #define rint32F(y,x) rint32f(&(y),x)
+ #define rint16D(y,x) rint16d(&(y),x)
+ #define rint16F(y,x) rint16d(&(y),(double)(x))
+ #define FE_INVALID 1
+ #define fe_test_invalid() (_statusfp() & _SW_INVALID)
+ #define fe_clear_invalid _clearfp /* Note: clears all. */
+#elif HAVE_LRINT && LONG_MAX == 2147483647L && HAVE_FENV_H
+ #include <math.h>
+ #include <fenv.h>
+ #define FPU_RINT32
+ #define rint32D(y,x) ((y)=lrint(x))
+ #define rint32F(y,x) ((y)=lrintf(x))
+ #define fe_test_invalid() fetestexcept(FE_INVALID)
+ #define fe_clear_invalid() feclearexcept(FE_INVALID)
#endif
+#if !defined FPU_RINT32
+ #define rint32D(y,x) ((y)=(int32_t)((x) < 0? x - .5 : x + .5))
+ #define rint32F(y,x) rint32D(y,(double)(x))
+#endif
+#if !defined FPU_RINT16
+ #define rint16D(y,x) ((y)=(int16_t)((x) < 0? x - .5 : x + .5))
+ #define rint16F(y,x) rint16D(y,(double)(x))
+#endif
+
+static __inline int32_t rint32(double input) {
+ int32_t result; rint32D(result, input); return result;}
+
+static __inline int16_t rint16(double input) {
+ int16_t result; rint16D(result, input); return result;}
#endif
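
[editor's note] Post-patch, the rounders are statement-like macros that write through their first argument (rint32D/rint16D for double input, the F variants for float), so one call site works across the x87-asm, SSE2 and C99-lrint paths; the old value-returning rint32/rint16 survive as thin wrappers. A usage sketch (rounding at exactly .5 is to-nearest-even on the FPU paths, half-away-from-zero on the plain-C fallback):

    /* Usage sketch for the macros above; compile within soxr/src. */
    #include <stdio.h>
    #include "rint.h"

    int main(void)
    {
      int32_t a; int16_t b;
      rint32D(a, 2.5);                   /* FPU paths: a == 2  */
      rint16D(b, -3.5);                  /* FPU paths: b == -4 */
      printf("%d %d %d\n", (int)a, (int)b, (int)rint16(1234.4));
      return 0;
    }
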
diff --git a/soxr/src/simd-dev.h b/soxr/src/simd-dev.h
deleted file mode 100644
index 019325c..0000000
--- a/soxr/src/simd-dev.h
+++ /dev/null
@@ -1,5 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#define PFFT_MACROS_ONLY
-#include "pffft.c"
diff --git a/soxr/src/simd.h b/soxr/src/simd.h
deleted file mode 100644
index 71eefc6..0000000
--- a/soxr/src/simd.h
+++ /dev/null
@@ -1,16 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-#if !defined simd_included
-#define simd_included
-
-#include <stddef.h>
-
-void * _soxr_simd_aligned_malloc(size_t);
-void * _soxr_simd_aligned_calloc(size_t, size_t);
-void _soxr_simd_aligned_free(void *);
-
-void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float * b);
-void _soxr_ordered_partial_convolve_simd(int n, float * a, const float * b);
-
-#endif
diff --git a/soxr/src/soxr-lsr.c b/soxr/src/soxr-lsr.c
new file mode 100644
index 0000000..58ab50a
--- /dev/null
+++ b/soxr/src/soxr-lsr.c
@@ -0,0 +1,198 @@
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+/* Wrapper mostly compatible with `libsamplerate'. */
+
+#include <assert.h>
+#include <stdlib.h>
+#include "soxr.h"
+#include "soxr-lsr.h"
+#include "rint.h"
+
+
+
+SRC_STATE *src_new(SRC_SRCTYPE id, int channels, SRC_ERROR * error)
+{
+ return src_callback_new(0, id, channels, error, 0);
+}
+
+
+
+SRC_ERROR src_process(SRC_STATE *p, SRC_DATA * io)
+{
+ size_t idone, odone;
+
+ if (!p || !io) return -1;
+
+ soxr_set_error(
+ p, soxr_set_io_ratio(p, 1/io->src_ratio, (size_t)io->output_frames));
+
+ soxr_process(p, io->data_in, /* hack: */
+ (size_t)(io->end_of_input? ~io->input_frames : io->input_frames),
+ &idone, io->data_out, (size_t)io->output_frames, &odone);
+
+ io->input_frames_used = (long)idone, io->output_frames_gen = (long)odone;
+ return -!!soxr_error(p);
+}
+
+
+
+SRC_ERROR src_set_ratio(SRC_STATE * p, double oi_ratio)
+{
+ return -!!soxr_set_io_ratio(p, 1/oi_ratio, 0);
+}
+
+
+
+SRC_ERROR src_reset(SRC_STATE * p)
+{
+ return -!!soxr_clear(p);
+}
+
+
+
+SRC_ERROR src_error(SRC_STATE * p)
+{
+ return -!!soxr_error(p);
+}
+
+
+
+SRC_STATE * src_delete(SRC_STATE * p)
+{
+ soxr_delete(p);
+ return 0;
+}
+
+
+
+SRC_STATE *src_callback_new(src_callback_t fn,
+ SRC_SRCTYPE id, int channels, SRC_ERROR * error0, void * p)
+{
+ soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + (unsigned)id, 0);
+ char const * e = getenv("SOXR_LSR_NUM_THREADS");
+ soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1));
+ soxr_error_t error;
+ soxr_t soxr = 0;
+
+ assert (channels > 0);
+ soxr = soxr_create(0, 0, (unsigned)channels, &error, 0, &q_spec, &r_spec);
+
+ if (soxr)
+ error = soxr_set_input_fn(soxr, (soxr_input_fn_t)fn, p, 0);
+
+ if (error0)
+ *error0 = -!!error;
+
+ return soxr;
+}
+
+
+
+long src_callback_read(SRC_STATE *p, double oi_ratio, long olen, float * obuf)
+{
+ if (!p || olen < 0) return -1;
+
+ soxr_set_error(p, soxr_set_io_ratio(p, 1/oi_ratio, (size_t)olen));
+ return (long)soxr_output(p, obuf, (size_t)olen);
+}
+
+
+
+SRC_ERROR src_simple(SRC_DATA * io, SRC_SRCTYPE id, int channels)
+{
+ size_t idone, odone;
+ soxr_error_t error;
+ soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + (unsigned)id, 0);
+ char const * e = getenv("SOXR_LSR_NUM_THREADS");
+ soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1));
+
+ if (!io || channels<=0 || io->input_frames<0 || io->output_frames<0) return -1;
+
+ error = soxr_oneshot(1, io->src_ratio, (unsigned)channels, io->data_in,
+ (size_t)io->input_frames, &idone, io->data_out, (size_t)io->output_frames,
+ &odone, 0, &q_spec, &r_spec);
+
+ io->input_frames_used = (long)idone, io->output_frames_gen = (long)odone;
+
+ return -!!error;
+}
+
+
+
+char const * src_get_name(SRC_SRCTYPE id)
+{
+ static char const * const names[] = {
+ "LSR best sinc", "LSR medium sinc", "LSR fastest sinc",
+ "LSR ZOH", "LSR linear", "SoX VHQ"};
+
+ return (unsigned)id < 5u + !getenv("SOXR_LSR_STRICT")? names[id] : 0;
+}
+
+
+
+char const * src_get_description(SRC_SRCTYPE id)
+{
+ return src_get_name(id);
+}
+
+
+
+char const * src_get_version(void)
+{
+ return soxr_version();
+}
+
+
+
+char const * src_strerror(SRC_ERROR error)
+{
+ return error == 1? "Placeholder." : error ? "soxr error" : soxr_strerror(0);
+}
+
+
+
+int src_is_valid_ratio(double oi_ratio)
+{
+ return getenv("SOXR_LSR_STRICT")?
+ oi_ratio >= 1./256 && oi_ratio <= 256 : oi_ratio > 0;
+}
+
+
+
+void src_short_to_float_array(short const * src, float * dest, int len)
+{
+ assert (src && dest);
+
+ while (len--) dest[len] = (float)(src[len] * (1 / (1. + SHRT_MAX)));
+}
+
+
+
+void src_float_to_short_array(float const * src, short * dest, int len)
+{
+ double d, N = 1. + SHRT_MAX;
+ assert (src && dest);
+
+ while (len--) d = src[len] * N, dest[len] =
+ (short)(d > N - 1? (short)(N - 1) : d < -N? (short)-N : rint16(d));
+}
+
+
+
+void src_int_to_float_array(int const * src, float * dest, int len)
+{
+ assert (src && dest);
+ while (len--) dest[len] = (float)(src[len] * (1 / (32768. * 65536.)));
+}
+
+
+
+void src_float_to_int_array(float const * src, int * dest, int len)
+{
+ double d, N = 32768. * 65536.; /* N.B. int32, not int! (Also above fn.) */
+ assert (src && dest);
+
+ while (len--) d = src[len] * N, dest[len] =
+ d >= N - 1? (int)(N - 1) : d < -N? (int)(-N) : rint32(d);
+}
diff --git a/soxr/src/soxr-lsr.h b/soxr/src/soxr-lsr.h
index c0923aa..b1cc247 100644
--- a/soxr/src/soxr-lsr.h
+++ b/soxr/src/soxr-lsr.h
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
@@ -37,13 +37,12 @@
#endif
typedef float SRC_SAMPLE;
-#if !defined SOXR_LIB
enum SRC_SRCTYPE_e {SRC_SINC_BEST_QUALITY, SRC_SINC_MEDIUM_QUALITY,
SRC_SINC_FASTEST, SRC_ZERO_ORDER_HOLD, SRC_LINEAR};
typedef int SRC_SRCTYPE;
typedef int SRC_ERROR;
typedef long (* src_callback_t)(void *, SRC_SAMPLE * *);
-typedef struct SRC_STATE SRC_STATE;
+typedef struct soxr SRC_STATE;
typedef struct SRC_DATA {
SRC_SAMPLE * data_in, * data_out;
long input_frames, output_frames;
@@ -51,7 +50,6 @@ typedef struct SRC_DATA {
int end_of_input;
double src_ratio;
} SRC_DATA;
-#endif
SOXR SRC_STATE * src_new(SRC_SRCTYPE, int num_channels, SRC_ERROR *);
SOXR SRC_ERROR src_process (SRC_STATE *, SRC_DATA *);
SOXR SRC_ERROR src_set_ratio(SRC_STATE *, double);
diff --git a/soxr/src/soxr.c b/soxr/src/soxr.c
index 5acace1..cdbfb9a 100644
--- a/soxr/src/soxr.c
+++ b/soxr/src/soxr.c
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
 #include <math.h>
@@ -10,6 +10,30 @@
#include "data-io.h"
#include "internal.h"
+#if AVUTIL_FOUND
+ #include <libavutil/cpu.h>
+#endif
+
+
+
+#if WITH_DEV_TRACE
+
+#include <stdarg.h>
+#include <stdio.h>
+
+int _soxr_trace_level;
+
+void _soxr_trace(char const * fmt, ...)
+{
+ va_list args;
+ va_start(args, fmt);
+ vfprintf(stderr, fmt, args);
+ fputc('\n', stderr);
+ va_end(args);
+}
+
+#endif
+
char const * soxr_version(void)
@@ -19,21 +43,9 @@ char const * soxr_version(void)
+#include "cb_t.h"
+
typedef void sample_t; /* float or double */
-typedef void (* fn_t)(void);
-typedef fn_t control_block_t[10];
-
-#define resampler_input (*(sample_t * (*)(void *, sample_t * samples, size_t n))p->control_block[0])
-#define resampler_process (*(void (*)(void *, size_t))p->control_block[1])
-#define resampler_output (*(sample_t const * (*)(void *, sample_t * samples, size_t * n))p->control_block[2])
-#define resampler_flush (*(void (*)(void *))p->control_block[3])
-#define resampler_close (*(void (*)(void *))p->control_block[4])
-#define resampler_delay (*(double (*)(void *))p->control_block[5])
-#define resampler_sizes (*(void (*)(size_t * shared, size_t * channel))p->control_block[6])
-#define resampler_create (*(char const * (*)(void * channel, void * shared, double io_ratio, soxr_quality_spec_t * q_spec, soxr_runtime_spec_t * r_spec, double scale))p->control_block[7])
-#define resampler_set_io_ratio (*(void (*)(void *, double io_ratio, size_t len))p->control_block[8])
-#define resampler_id (*(char const * (*)(void))p->control_block[9])
-
typedef void * resampler_t; /* For one channel. */
typedef void * resampler_shared_t; /* Between channels. */
typedef void (* deinterleave_t)(sample_t * * dest,
@@ -67,45 +79,52 @@ struct soxr {
-#define RESET_ON_CLEAR (1u<<31)
+#if WITH_CR32 || WITH_CR32S || WITH_CR64 || WITH_CR64S
+ #include "filter.h"
+#else
+ #define lsx_to_3dB(x) ((x)/(x))
+#endif
+
-/* TODO: these should not be here. */
-#define TO_3dB(a) ((1.6e-6*a-7.5e-4)*a+.646)
-#define LOW_Q_BW0 (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
soxr_quality_spec_t soxr_quality_spec(unsigned long recipe, unsigned long flags)
{
soxr_quality_spec_t spec, * p = &spec;
- unsigned quality = recipe & 0xf;
+ unsigned q = recipe & 0xf; /* TODO: move to soxr-lsr.c: */
+ unsigned quality = q > SOXR_LSR2Q+2? SOXR_VHQ : q > SOXR_LSR2Q? SOXR_QQ : q;
double rej;
memset(p, 0, sizeof(*p));
- if (quality > 13) {
+ if (quality > SOXR_PRECISIONQ) {
p->e = "invalid quality type";
return spec;
}
- flags |= quality < SOXR_LSR0Q? RESET_ON_CLEAR : 0;
- if (quality == 13)
- quality = 6;
- else if (quality > 10)
- quality = 0;
- p->phase_response = "\62\31\144"[(recipe & 0x30) >> 4];
+ flags |= quality < SOXR_LSR0Q ? RESET_ON_CLEAR : 0;
+ p->phase_response = "\62\31\144"[(recipe & 0x30)>>4];
p->stopband_begin = 1;
- p->precision = !quality? 0: quality < 3? 16 : quality < 8? 4 + quality * 4 : 55 - quality * 4;
+ p->precision =
+ quality == SOXR_QQ ? 0 :
+ quality <= SOXR_16_BITQ ? 16 :
+ quality <= SOXR_32_BITQ ? 4 + quality * 4 :
+ quality <= SOXR_LSR2Q ? 55 - quality * 4 : /* TODO: move to soxr-lsr.c */
+ 0;
rej = p->precision * linear_to_dB(2.);
p->flags = flags;
- if (quality < 8) {
- p->passband_end = quality == 1? LOW_Q_BW0 : 1 - .05 / TO_3dB(rej);
+ if (quality <= SOXR_32_BITQ || quality == SOXR_PRECISIONQ) {
+ #define LOW_Q_BW0 (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
+ p->passband_end = quality == 1? LOW_Q_BW0 : 1 - .05 / lsx_to_3dB(rej);
if (quality <= 2)
p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM;
}
- else {
+ else { /* TODO: move to soxr-lsr.c */
static float const bw[] = {.931f, .832f, .663f};
- p->passband_end = bw[quality - 8];
- if (quality - 8 == 2)
- p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM;
+ p->passband_end = bw[quality - SOXR_LSR0Q];
+ if (quality == SOXR_LSR2Q) {
+ p->flags &= ~SOXR_ROLLOFF_NONE;
+ p->flags |= SOXR_ROLLOFF_LSR2Q | SOXR_PROMOTE_TO_LQ;
+ }
}
if (recipe & SOXR_STEEP_FILTER)
- p->passband_end = 1 - .01 / TO_3dB(rej);
+ p->passband_end = 1 - .01 / lsx_to_3dB(rej);
return spec;
}
@@ -163,39 +182,165 @@ soxr_io_spec_t soxr_io_spec(
-#if HAVE_SIMD
-static bool cpu_has_simd(void)
-{
-#if defined __x86_64__ || defined _M_X64
- return true;
-#elif defined __GNUC__ && defined i386
- uint32_t eax, ebx, ecx, edx;
- __asm__ __volatile__ (
- "pushl %%ebx \n\t"
- "cpuid \n\t"
- "movl %%ebx, %1\n\t"
- "popl %%ebx \n\t"
- : "=a"(eax), "=r"(ebx), "=c"(ecx), "=d"(edx)
- : "a"(1)
- : "cc" );
- return !!(edx & 0x06000000);
-#elif defined _MSC_VER && defined _M_IX86
- uint32_t d;
- __asm {
- xor eax, eax
- inc eax
- push ebx
- cpuid
- pop ebx
- mov d, edx
- }
- return !!(d & 0x06000000);
-#endif
- return false;
-}
+#if (WITH_CR32S && WITH_CR32) || (WITH_CR64S && WITH_CR64)
+ #if defined __GNUC__ && defined __x86_64__
+ #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+ __asm__ __volatile__ ( \
+ "cpuid \n\t" \
+ : "=a" (eax_), "=b" (ebx_), "=c" (ecx_), "=d" (edx_) \
+ : "a" (type), "c" (0));
+ #elif defined __GNUC__ && defined __i386__
+ #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+ __asm__ __volatile__ ( \
+ "mov %%ebx, %%edi \n\t" \
+ "cpuid \n\t" \
+ "xchg %%edi, %%ebx \n\t" \
+ : "=a" (eax_), "=D" (ebx_), "=c" (ecx_), "=d" (edx_) \
+ : "a" (type), "c" (0));
+ #elif defined _M_X64 && defined _MSC_VER && _MSC_VER > 1500
+ void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue);
+ #pragma intrinsic(__cpuidex)
+ #define CPUID(type, eax_, ebx_, ecx_, edx_) do { \
+ int regs[4]; \
+ __cpuidex(regs, type, 0); \
+ eax_ = regs[0], ebx_ = regs[1], ecx_ = regs[2], edx_ = regs[3]; \
+ } while(0)
+ #elif defined _M_X64 && defined _MSC_VER
+ void __cpuidex(int CPUInfo[4], int info_type);
+ #pragma intrinsic(__cpuidex)
+ #define CPUID(type, eax_, ebx_, ecx_, edx_) do { \
+ int regs[4]; \
+ __cpuidex(regs, type); \
+ eax_ = regs[0], ebx_ = regs[1], ecx_ = regs[2], edx_ = regs[3]; \
+ } while(0)
+ #elif defined _M_IX86 && defined _MSC_VER
+ #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+ __asm pushad \
+ __asm mov eax, type \
+ __asm xor ecx, ecx \
+ __asm cpuid \
+ __asm mov eax_, eax \
+ __asm mov ebx_, ebx \
+ __asm mov ecx_, ecx \
+ __asm mov edx_, edx \
+ __asm popad
+ #endif
#endif
-extern control_block_t _soxr_rate32s_cb, _soxr_rate32_cb, _soxr_rate64_cb, _soxr_vr32_cb;
+
+
+#if WITH_CR32S && WITH_CR32
+ static bool cpu_has_simd32(void)
+ {
+ #if defined __x86_64__ || defined _M_X64
+ return true;
+ #elif defined __i386__ || defined _M_IX86
+ enum {SSE = 1 << 25, SSE2 = 1 << 26};
+ unsigned eax_, ebx_, ecx_, edx_;
+ CPUID(1, eax_, ebx_, ecx_, edx_);
+ return (edx_ & (SSE|SSE2)) != 0;
+ #elif defined AV_CPU_FLAG_NEON
+ return !!(av_get_cpu_flags() & AV_CPU_FLAG_NEON);
+ #else
+ return false;
+ #endif
+ }
+
+ static bool should_use_simd32(void)
+ {
+ char const * e;
+ return ((e = getenv("SOXR_USE_SIMD" )))? !!atoi(e) :
+ ((e = getenv("SOXR_USE_SIMD32")))? !!atoi(e) : cpu_has_simd32();
+ }
+#else
+ #define should_use_simd32() true
+#endif
+
+
+
+#if WITH_CR64S && WITH_CR64
+ #if defined __GNUC__
+ #define XGETBV(type, eax_, edx_) \
+ __asm__ __volatile__ ( \
+ ".byte 0x0f, 0x01, 0xd0\n" \
+ : "=a"(eax_), "=d"(edx_) : "c" (type));
+ #elif defined _M_X64 && defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219
+ #include <immintrin.h>
+ #define XGETBV(type, eax_, edx_) do { \
+ union {uint64_t x; uint32_t y[2];} a = {_xgetbv(0)}; \
+ eax_ = a.y[0], edx_ = a.y[1]; \
+ } while(0)
+ #elif defined _M_IX86 && defined _MSC_VER
+ #define XGETBV(type, eax_, edx_) \
+ __asm pushad \
+ __asm mov ecx, type \
+ __asm _emit 0x0f \
+ __asm _emit 0x01 \
+ __asm _emit 0xd0 \
+ __asm mov eax_, eax \
+ __asm mov edx_, edx \
+ __asm popad
+ #else
+ #define XGETBV(type, eax_, edx_) eax_ = edx_ = 0
+ #endif
+
+ static bool cpu_has_simd64(void)
+ {
+ enum {OSXSAVE = 1 << 27, AVX = 1 << 28};
+ unsigned eax_, ebx_, ecx_, edx_;
+ CPUID(1, eax_, ebx_, ecx_, edx_);
+ if ((ecx_ & (OSXSAVE|AVX)) == (OSXSAVE|AVX)) {
+ XGETBV(0, eax_, edx_);
+ return (eax_ & 6) == 6;
+ }
+ return false;
+ }
+
+ static bool should_use_simd64(void)
+ {
+ char const * e;
+ return ((e = getenv("SOXR_USE_SIMD" )))? !!atoi(e) :
+ ((e = getenv("SOXR_USE_SIMD64")))? !!atoi(e) : cpu_has_simd64();
+ }
+#else
+ #define should_use_simd64() true
+#endif
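
[editor's note] cpu_has_simd64 requires both the CPUID OSXSAVE and AVX bits and, via XGETBV, bits 1 and 2 of XCR0 (mask 6: the OS saves and restores XMM and YMM state) before trusting AVX. On GCC or Clang the same test can be expressed with a builtin; shown for comparison only, this is not what the library uses:

    /* GCC/Clang-only comparison; the builtin is expected to fold in the
     * OS XSAVE/XCR0 check on current compilers. */
    #if defined __GNUC__ && (defined __x86_64__ || defined __i386__)
    static int cpu_has_avx_builtin(void)
    {
      return __builtin_cpu_supports("avx");
    }
    #endif
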
+
+
+
+extern control_block_t
+ _soxr_rate32_cb,
+ _soxr_rate32s_cb,
+ _soxr_rate64_cb,
+ _soxr_rate64s_cb,
+ _soxr_vr32_cb;
+
+
+
+static void runtime_num(char const * env_name,
+ int min, int max, unsigned * field)
+{
+ char const * e = getenv(env_name);
+ if (e) {
+ int i = atoi(e);
+ if (i >= min && i <= max)
+ *field = (unsigned)i;
+ }
+}
+
+
+
+static void runtime_flag(char const * env_name,
+ unsigned n_bits, unsigned n_shift, unsigned long * flags)
+{
+ char const * e = getenv(env_name);
+ if (e) {
+ int i = atoi(e);
+ unsigned long mask = (1UL << n_bits) - 1;
+ if (i >= 0 && i <= (int)mask)
+ *flags &= ~(mask << n_shift), *flags |= ((unsigned long)i << n_shift);
+ }
+}
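
[editor's note] runtime_num and runtime_flag let environment variables override fields of soxr_runtime_spec at soxr_create time (the registrations appear a few hunks below), so tuning experiments need no recompile. A hypothetical harness:

    /* Hypothetical tuning harness; setenv is POSIX. */
    #include <stdlib.h>
    #include "soxr.h"

    static soxr_t make_tuned(double irate, double orate, soxr_error_t * err)
    {
      setenv("SOXR_MIN_DFT_SIZE", "12", 1);  /* log2: DFTs of >= 4096 pts */
      setenv("SOXR_COEFS_SIZE", "200", 1);   /* kbytes of coef tables     */
      return soxr_create(irate, orate, 1, err, 0, 0, 0);
    }
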
@@ -207,11 +352,30 @@ soxr_t soxr_create(
soxr_quality_spec_t const * q_spec,
soxr_runtime_spec_t const * runtime_spec)
{
- double io_ratio = output_rate? input_rate? input_rate / output_rate : -1 : input_rate? -1 : 0;
+ double io_ratio = output_rate!=0? input_rate!=0?
+ input_rate / output_rate : -1 : input_rate!=0? -1 : 0;
static const float datatype_full_scale[] = {1, 1, 65536.*32768, 32768};
soxr_t p = 0;
soxr_error_t error = 0;
+#if WITH_DEV_TRACE
+#define _(x) (char)(sizeof(x)>=10? 'a'+(char)(sizeof(x)-10):'0'+(char)sizeof(x))
+ char const * e = getenv("SOXR_TRACE");
+ _soxr_trace_level = e? atoi(e) : 0;
+ {
+ static char const arch[] = {_(char), _(short), _(int), _(long), _(long long)
+ , ' ', _(float), _(double), _(long double)
+ , ' ', _(int *), _(int (*)(int))
+ , ' ', HAVE_BIGENDIAN ? 'B' : 'L'
+#if defined _OPENMP
+ , ' ', 'O', 'M', 'P'
+#endif
+ , 0};
+#undef _
+ lsx_debug("arch: %s", arch);
+ }
+#endif
+
if (q_spec && q_spec->e) error = q_spec->e;
else if (io_spec && (io_spec->itype | io_spec->otype) >= SOXR_SPLIT * 2)
error = "invalid io datatype(s)";
@@ -219,6 +383,8 @@ soxr_t soxr_create(
if (!error && !(p = calloc(sizeof(*p), 1))) error = "malloc failed";
if (p) {
+ control_block_t * control_block;
+
p->q_spec = q_spec? *q_spec : soxr_quality_spec(SOXR_HQ, 0);
if (q_spec) { /* Backwards compatibility with original API: */
@@ -236,35 +402,59 @@ soxr_t soxr_create(
p->io_spec.scale = 1;
p->runtime_spec = runtime_spec? *runtime_spec : soxr_runtime_spec(1);
+
+ runtime_num("SOXR_MIN_DFT_SIZE", 8, 15, &p->runtime_spec.log2_min_dft_size);
+ runtime_num("SOXR_LARGE_DFT_SIZE", 8, 20, &p->runtime_spec.log2_large_dft_size);
+ runtime_num("SOXR_COEFS_SIZE", 100, 800, &p->runtime_spec.coef_size_kbytes);
+ runtime_num("SOXR_NUM_THREADS", 0, 64, &p->runtime_spec.num_threads);
+ runtime_flag("SOXR_COEF_INTERP", 2, 0, &p->runtime_spec.flags);
+
+ runtime_flag("SOXR_STRICT_BUF", 1, 2, &p->runtime_spec.flags);
+ runtime_flag("SOXR_NOSMALLINTOPT", 1, 3, &p->runtime_spec.flags);
+
p->io_spec.scale *= datatype_full_scale[p->io_spec.otype & 3] /
datatype_full_scale[p->io_spec.itype & 3];
+
p->seed = (unsigned long)time(0) ^ (unsigned long)(size_t)p;
-#if HAVE_SINGLE_PRECISION
- if (!HAVE_DOUBLE_PRECISION || (p->q_spec.precision <= 20 && !(p->q_spec.flags & SOXR_DOUBLE_PRECISION))
- || (p->q_spec.flags & SOXR_VR)) {
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
+ if (0
+#if WITH_VR32
+ || ((!WITH_CR32 && !WITH_CR32S) || (p->q_spec.flags & SOXR_VR))
+#endif
+#if WITH_CR32 || WITH_CR32S
+ || !(WITH_CR64 || WITH_CR64S) || (p->q_spec.precision <= 20 && !(p->q_spec.flags & SOXR_DOUBLE_PRECISION))
+#endif
+ ) {
p->deinterleave = (deinterleave_t)_soxr_deinterleave_f;
p->interleave = (interleave_t)_soxr_interleave_f;
- memcpy(&p->control_block,
- (p->q_spec.flags & SOXR_VR)? &_soxr_vr32_cb :
-#if HAVE_SIMD
- cpu_has_simd()? &_soxr_rate32s_cb :
+ control_block =
+#if WITH_VR32
+ ((!WITH_CR32 && !WITH_CR32S) || (p->q_spec.flags & SOXR_VR))? &_soxr_vr32_cb :
#endif
- &_soxr_rate32_cb, sizeof(p->control_block));
+#if WITH_CR32S
+ !WITH_CR32 || should_use_simd32()? &_soxr_rate32s_cb :
+#endif
+ &_soxr_rate32_cb;
}
-#if HAVE_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
else
#endif
#endif
-#if HAVE_DOUBLE_PRECISION
+#if WITH_CR64 || WITH_CR64S
{
p->deinterleave = (deinterleave_t)_soxr_deinterleave;
p->interleave = (interleave_t)_soxr_interleave;
- memcpy(&p->control_block, &_soxr_rate64_cb, sizeof(p->control_block));
+ control_block =
+#if WITH_CR64S
+ !WITH_CR64 || should_use_simd64()? &_soxr_rate64s_cb :
+#endif
+ &_soxr_rate64_cb;
}
#endif
+ memcpy(&p->control_block, control_block, sizeof(p->control_block));
- if (p->num_channels && io_ratio)
+ if (p->num_channels && io_ratio!=0)
error = soxr_set_io_ratio(p, io_ratio, 0);
}
if (error)
@@ -307,7 +497,8 @@ static void soxr_delete0(soxr_t p)
double soxr_delay(soxr_t p)
{
- return (p && !p->error && p->resamplers)? resampler_delay(p->resamplers[0]) : 0;
+ return
+ (p && !p->error && p->resamplers)? resampler_delay(p->resamplers[0]) : 0;
}
@@ -375,13 +566,13 @@ soxr_error_t soxr_set_io_ratio(soxr_t p, double io_ratio, size_t slew_len)
p->io_ratio = io_ratio;
return initialise(p);
}
- if (p->control_block[8]) {
+ if (resampler_set_io_ratio) {
for (i = 0; !error && i < p->num_channels; ++i)
resampler_set_io_ratio(p->resamplers[i], io_ratio, slew_len);
return error;
}
return fabs(p->io_ratio - io_ratio) < 1e-15? 0 :
- "Varying O/I ratio is not supported with this quality level";
+ "varying O/I ratio is not supported with this quality level";
}
@@ -406,7 +597,7 @@ soxr_error_t soxr_clear(soxr_t p) /* TODO: this, properly. */
p->io_spec = tmp.io_spec;
p->num_channels = tmp.num_channels;
p->input_fn_state = tmp.input_fn_state;
- memcpy(p->control_block, tmp.control_block, sizeof(p->control_block));
+ memcpy(&p->control_block, &tmp.control_block, sizeof(p->control_block));
p->deinterleave = tmp.deinterleave;
p->interleave = tmp.interleave;
return (p->q_spec.flags & RESET_ON_CLEAR)?
@@ -481,13 +672,8 @@ static size_t soxr_output_no_callback(soxr_t p, soxr_buf_t out, size_t len)
done = done1;
} else
#endif
- {
- if (p->num_channels > 1) {
- for (u = 0; u < p->num_channels; ++u)
- done = soxr_output_1ch(p, u, ((soxr_bufs_t)out)[u], len, separated);
- } else
- done = soxr_output_1ch(p, 0, out, len, separated);
- }
+ for (u = 0; u < p->num_channels; ++u)
+ done = soxr_output_1ch(p, u, ((soxr_bufs_t)out)[u], len, separated);
if (!separated)
p->clips += (p->interleave)(p->io_spec.otype, &out, (sample_t const * const *)p->channel_ptrs,
@@ -616,7 +802,7 @@ soxr_error_t soxr_oneshot(
soxr_quality_spec_t const * q_spec,
soxr_runtime_spec_t const * runtime_spec)
{
- soxr_t resampler = NULL;
+ soxr_t resampler;
soxr_error_t error = q_spec? q_spec->e : 0;
if (!error) {
soxr_quality_spec_t q_spec1;
diff --git a/soxr/src/soxr.h b/soxr/src/soxr.h
index 8d9622d..09ec7c4 100644
--- a/soxr/src/soxr.h
+++ b/soxr/src/soxr.h
@@ -1,4 +1,4 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-18 robs@users.sourceforge.net
*
* This library is free software; you can redistribute it and/or modify it
* under the terms of the GNU Lesser General Public License as published by
@@ -65,8 +65,8 @@ input or output (e.g. ilen, olen). */
/* E.g. #if SOXR_THIS_VERSION >= SOXR_VERSION(0,1,1) ... */
#define SOXR_VERSION(x,y,z) (((x)<<16)|((y)<<8)|(z))
-#define SOXR_THIS_VERSION SOXR_VERSION(0,1,2)
-#define SOXR_THIS_VERSION_STR "0.1.2"
+#define SOXR_THIS_VERSION SOXR_VERSION(0,1,3)
+#define SOXR_THIS_VERSION_STR "0.1.3"
@@ -173,7 +173,7 @@ SOXR size_t /*odone*/ soxr_output(/* Resample and output a block of data.*/
SOXR soxr_error_t soxr_error(soxr_t); /* Query error status. */
SOXR size_t * soxr_num_clips(soxr_t); /* Query int. clip counter (for R/W). */
SOXR double soxr_delay(soxr_t); /* Query current delay in output samples.*/
-SOXR char const * soxr_engine(soxr_t p); /* Query resampling engine name. */
+SOXR char const * soxr_engine(soxr_t); /* Query resampling engine name. */
SOXR soxr_error_t soxr_clear(soxr_t); /* Ready for fresh signal, same config. */
SOXR void soxr_delete(soxr_t); /* Free resources. */
@@ -249,7 +249,6 @@ struct soxr_quality_spec { /* Typically */
#define SOXR_ROLLOFF_MEDIUM 1u /* <= 0.35 dB */
#define SOXR_ROLLOFF_NONE 2u /* For Chebyshev bandwidth. */
-#define SOXR_MAINTAIN_3DB_PT 4u /* Reserved for internal use. */
#define SOXR_HI_PREC_CLOCK 8u /* Increase `irrational' ratio accuracy. */
#define SOXR_DOUBLE_PRECISION 16u /* Use D.P. calcs even if precision <= 20. */
#define SOXR_VR 32u /* Variable-rate resampling. */
@@ -257,21 +256,18 @@ struct soxr_quality_spec { /* Typically */
struct soxr_runtime_spec { /* Typically */
- unsigned log2_min_dft_size;/* For DFT efficiency. [8,15] 10 */
- unsigned log2_large_dft_size;/* For DFT efficiency. [16,20] 17 */
- unsigned coef_size_kbytes; /* For SOXR_COEF_INTERP_AUTO (below). 400 */
- unsigned num_threads; /* If built so. 0 means `automatic'. 1 */
- void * e; /* Reserved for internal use. 0 */
- unsigned long flags; /* Per the following #defines. 0 */
+ unsigned log2_min_dft_size; /* For DFT efficiency. [8,15] 10 */
+ unsigned log2_large_dft_size; /* For DFT efficiency. [8,20] 17 */
+ unsigned coef_size_kbytes; /* For SOXR_COEF_INTERP_AUTO (below). 400 */
+ unsigned num_threads; /* 0: per OMP_NUM_THREADS; 1: 1 thread. 1 */
+ void * e; /* Reserved for internal use. 0 */
+ unsigned long flags; /* Per the following #defines. 0 */
};
/* For `irrational' ratios only: */
#define SOXR_COEF_INTERP_AUTO 0u /* Auto select coef. interpolation. */
#define SOXR_COEF_INTERP_LOW 2u /* Man. select: less CPU, more memory. */
#define SOXR_COEF_INTERP_HIGH 3u /* Man. select: more CPU, less memory. */
-#define SOXR_STRICT_BUFFERING 4u /* Reserved for future use. */
-#define SOXR_NOSMALLINTOPT 8u /* For test purposes only. */
-
/* -------------------------- API type constructors ------------------------- */
@@ -296,7 +292,7 @@ SOXR soxr_quality_spec_t soxr_quality_spec(
#define SOXR_24_BITQ 5
#define SOXR_28_BITQ 6
#define SOXR_32_BITQ 7
- /* Libsamplerate equivalent qualities: */
+ /* Reserved for internal use (to be removed): */
#define SOXR_LSR0Q 8 /* 'Best sinc'. */
#define SOXR_LSR1Q 9 /* 'Medium sinc'. */
#define SOXR_LSR2Q 10 /* 'Fast sinc'. */
@@ -304,8 +300,8 @@ SOXR soxr_quality_spec_t soxr_quality_spec(
#define SOXR_LINEAR_PHASE 0x00
#define SOXR_INTERMEDIATE_PHASE 0x10
#define SOXR_MINIMUM_PHASE 0x30
+
#define SOXR_STEEP_FILTER 0x40
-#define SOXR_ALLOW_ALIASING 0x80 /* Reserved for future use. */
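
[editor's note] A recipe is composed by OR-ing one quality preset from the low nibble with a phase constant (bits 4-5) and optionally SOXR_STEEP_FILTER (bit 6); e.g. a one-shot, very-high-quality, linear-phase 2:1 decimation:

    /* One-shot use of the recipe constants above. */
    #include <stdio.h>
    #include "soxr.h"

    int main(void)
    {
      float in[100] = {0}, out[50];
      size_t idone, odone;
      soxr_quality_spec_t q = soxr_quality_spec(SOXR_VHQ | SOXR_LINEAR_PHASE, 0);
      soxr_error_t e = soxr_oneshot(2, 1, 1,      /* 2:1 ratio, 1 channel */
          in, 100, &idone, out, 50, &odone, 0, &q, 0);
      printf("%s: %zu in, %zu out\n", e? e : "ok", idone, odone);
      return 0;
    }
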
diff --git a/soxr/src/sse2neon.h b/soxr/src/sse2neon.h
deleted file mode 100644
index 65efed3..0000000
--- a/soxr/src/sse2neon.h
+++ /dev/null
@@ -1,6292 +0,0 @@
-#ifndef SSE2NEON_H
-#define SSE2NEON_H
-
-// This header file provides a simple API translation layer
-// between SSE intrinsics and their corresponding Arm/Aarch64 NEON versions
-//
-// This header file does not yet translate all of the SSE intrinsics.
-//
-// Contributors to this work are:
-// John W. Ratcliff
-// Brandon Rowlett
-// Ken Fast
-// Eric van Beurden
-// Alexander Potylitsin
-// Hasindu Gamaarachchi
-// Jim Huang
-// Mark Cheng
-// Malcolm James MacLeod
-// Devin Hussey (easyaspi314)
-// Sebastian Pop
-// Developer Ecosystem Engineering
-// Danila Kutenin
-// François Turban (JishinMaster)
-// Pei-Hsuan Hung
-// Yang-Hao Yuan
-
-/*
- * sse2neon is freely redistributable under the MIT License.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-/* Tunable configurations */
-
-/* Enable precise implementation of _mm_min_ps and _mm_max_ps
- * This would slow down the computation a bit, but gives consistent result with
- * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result)
- */
-#ifndef SSE2NEON_PRECISE_MINMAX
-#define SSE2NEON_PRECISE_MINMAX (0)
-#endif
-
-#if defined(__GNUC__) || defined(__clang__)
-#pragma push_macro("FORCE_INLINE")
-#pragma push_macro("ALIGN_STRUCT")
-#define FORCE_INLINE static inline __attribute__((always_inline))
-#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
-#else
-#error "Macro name collisions may happen with unsupported compiler."
-#ifdef FORCE_INLINE
-#undef FORCE_INLINE
-#endif
-#define FORCE_INLINE static inline
-#ifndef ALIGN_STRUCT
-#define ALIGN_STRUCT(x) __declspec(align(x))
-#endif
-#endif
-
-#include <stdint.h>
-#include <stdlib.h>
-
-/* Architecture-specific build options */
-/* FIXME: #pragma GCC push_options is only available on GCC */
-#if defined(__GNUC__)
-#if defined(__arm__) && __ARM_ARCH == 7
-/* According to ARM C Language Extensions Architecture specification,
- * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
- * architecture supported.
- */
-#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
-#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
-#endif
-#pragma GCC push_options
-#pragma GCC target("fpu=neon")
-#elif defined(__aarch64__)
-#pragma GCC push_options
-#pragma GCC target("+simd")
-#else
-#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
-#endif
-#endif
-
-#include <arm_neon.h>
-
-/* Rounding functions require either Aarch64 instructions or a libm fallback */
-#if !defined(__aarch64__)
-#include <math.h>
-#endif
-
-/* "__has_builtin" can be used to query support for built-in functions
- * provided by gcc/clang and other compilers that support it.
- */
-#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
-/* Compatibility with gcc <= 9 */
-#if __GNUC__ <= 9
-#define __has_builtin(x) HAS##x
-#define HAS__builtin_popcount 1
-#define HAS__builtin_popcountll 1
-#else
-#define __has_builtin(x) 0
-#endif
-#endif
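-// Editorial note on the shim above: on gcc <= 9,
-// __has_builtin(__builtin_popcount) token-pastes to HAS__builtin_popcount,
-// which is defined to 1; a builtin with no matching HAS* macro expands to an
-// undefined identifier, which the preprocessor evaluates as 0 inside #if.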
-
-/**
- * MACRO for shuffle parameter for _mm_shuffle_ps().
- * Argument fp3 is a digit[0123] that represents the fp from argument "b"
- * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
- * for fp2 in result. fp1 is a digit[0123] that represents the fp from
- * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
- * fp0 is the same for fp0 of result.
- */
-#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
- (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
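-// Editorial worked example (illustrative): _MM_SHUFFLE(3, 2, 1, 0) packs to
-// (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4, the identity selector: each
-// two-bit field picks the lane with its own index.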
-
-/* Rounding mode macros. */
-#define _MM_FROUND_TO_NEAREST_INT 0x00
-#define _MM_FROUND_TO_NEG_INF 0x01
-#define _MM_FROUND_TO_POS_INF 0x02
-#define _MM_FROUND_TO_ZERO 0x03
-#define _MM_FROUND_CUR_DIRECTION 0x04
-#define _MM_FROUND_NO_EXC 0x08
-
-/* indicate immediate constant argument in a given range */
-#define __constrange(a, b) const
-
-/* A few intrinsics accept traditional data types like ints or floats, but
- * most operate on data types that are specific to SSE.
- * If a vector type ends in d, it contains doubles, and if it does not have
- * a suffix, it contains floats. An integer vector type can contain any type
- * of integer, from chars to shorts to unsigned long longs.
- */
-typedef int64x1_t __m64;
-typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
-// On the 32-bit ARM architecture, float64x2_t is not supported, so the
-// __m128d type must be represented differently for the related intrinsic
-// conversions.
-#if defined(__aarch64__)
-typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
-#else
-typedef float32x4_t __m128d;
-#endif
-typedef int64x2_t __m128i; /* 128-bit vector containing integers */
-
-/* type-safe casting between types */
-
-#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
-#define vreinterpretq_m128_f32(x) (x)
-#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)
-
-#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
-#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
-#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
-#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
-#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
-#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)
-
-#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
-#define vreinterpretq_f32_m128(x) (x)
-#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)
-
-#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
-#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
-#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
-#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
-#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
-#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
-#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
-#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
-#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
-#define vreinterpretq_m128i_s64(x) (x)
-
-#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
-#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
-#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
-#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)
-
-#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
-#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
-#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
-#define vreinterpretq_s64_m128i(x) (x)
-
-#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
-#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
-#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
-#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)
-
-#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
-#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
-#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
-#define vreinterpret_m64_s64(x) (x)
-
-#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
-#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
-#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
-#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)
-
-#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
-#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
-#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)
-
-#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
-#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
-#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
-#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)
-
-#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
-#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
-#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
-#define vreinterpret_s64_m64(x) (x)
-
-#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)
-
-#if defined(__aarch64__)
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)
-
-#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
-#define vreinterpretq_m128d_f64(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)
-
-#define vreinterpretq_f64_m128d(x) (x)
-#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
-#else
-#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
-#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)
-#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)
-
-#define vreinterpretq_m128d_f32(x) (x)
-
-#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)
-
-#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)
-
-#define vreinterpretq_f32_m128d(x) (x)
-#endif
-
-// A struct is defined in this header file called 'SIMDVec' which can be used
-// by applications which attempt to access the contents of an __m128 struct
-// directly. Note that Microsoft considers accessing the __m128 struct
-// directly to be bad coding practice; see:
-// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx
-//
-// However, some legacy source code may try to access the contents of an __m128
-// struct directly so the developer can use the SIMDVec as an alias for it. Any
-// casting must be done manually by the developer, as you cannot cast or
-// otherwise alias the base NEON data type for intrinsic operations.
-//
-// This union is intended to allow direct access to an __m128 variable using
-// the names that the MSVC compiler provides. It should really only be used when
-// trying to access the members of the vector as integer values. GCC/clang
-// allow native access to the float members through a simple array access
-// operator (in C since 4.6, in C++ since 4.8).
-//
-// Ideally, direct access to SIMD vectors should be avoided, since it can cause
-// a performance hit. If it really is needed, however, the original __m128
-// variable can be aliased with a pointer to this union and used to access
-// individual components. The use of this union should be hidden behind a macro
-// that is used throughout the codebase to access the members instead of always
-// declaring this type of variable.
-typedef union ALIGN_STRUCT(16) SIMDVec {
- float m128_f32[4]; // as floats - DON'T USE. Added for convenience.
- int8_t m128_i8[16]; // as signed 8-bit integers.
- int16_t m128_i16[8]; // as signed 16-bit integers.
- int32_t m128_i32[4]; // as signed 32-bit integers.
- int64_t m128_i64[2]; // as signed 64-bit integers.
- uint8_t m128_u8[16]; // as unsigned 8-bit integers.
- uint16_t m128_u16[8]; // as unsigned 16-bit integers.
- uint32_t m128_u32[4]; // as unsigned 32-bit integers.
- uint64_t m128_u64[2]; // as unsigned 64-bit integers.
-} SIMDVec;
-
-// casting using SIMDVec
-#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
-#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
-#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
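-// Editorial usage sketch (illustrative only) for the casting macros above,
-// extracting one lane through the union:
-//   __m128i v = _mm_set1_epi32(7);
-//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0); /* lane0 == 7 */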
-
-/* Backwards compatibility for compilers lacking support for specific types */
-
-// Older gcc versions do not define the vld1q_u8_x4 intrinsic
-#if defined(__GNUC__) && !defined(__clang__)
-#if __GNUC__ <= 9
-FORCE_INLINE uint8x16x4_t vld1q_u8_x4(const uint8_t *p)
-{
- uint8x16x4_t ret;
- ret.val[0] = vld1q_u8(p + 0);
- ret.val[1] = vld1q_u8(p + 16);
- ret.val[2] = vld1q_u8(p + 32);
- ret.val[3] = vld1q_u8(p + 48);
- return ret;
-}
-#endif
-#endif
-
-/* Function Naming Conventions
- * The naming convention of SSE intrinsics is straightforward. A generic SSE
- * intrinsic function is given as follows:
- *   _mm_<name>_<data_type>
- *
- * The parts of this format are given as follows:
- * 1. <name> describes the operation performed by the intrinsic
- * 2. <data_type> identifies the data type of the function's primary arguments
- *
- * This last part, <data_type>, is a little complicated. It identifies the
- * content of the input values, and can be set to any of the following values:
- * + ps - vectors contain floats (ps stands for packed single-precision)
- * + pd - vectors contain doubles (pd stands for packed double-precision)
- * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- * signed integers
- * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
- * unsigned integers
- * + si128 - unspecified 128-bit vector or 256-bit vector
- * + m128/m128i/m128d - identifies input vector types when they are different
- * than the type of the returned vector
- *
- * For example, in _mm_setzero_ps the _mm prefix implies that the function
- * returns a 128-bit vector, and the _ps suffix implies that the argument
- * vectors contain floats.
- *
- * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
- * // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
- * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
- * // Set packed 8-bit integers
- * // 128 bits, 16 chars, per 8 bits
- * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
- * 4, 5, 12, 13, 6, 7, 14, 15);
- * // Shuffle packed 8-bit integers
- * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
- *
- * Data (Number, Binary, Byte Index):
- +------+------+------+------+------+------+------+------+
- | 1 | 2 | 3 | 4 | Number
- +------+------+------+------+------+------+------+------+
- | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary
- +------+------+------+------+------+------+------+------+
- | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index
- +------+------+------+------+------+------+------+------+
-
- +------+------+------+------+------+------+------+------+
- | 5 | 6 | 7 | 8 | Number
- +------+------+------+------+------+------+------+------+
- | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary
- +------+------+------+------+------+------+------+------+
- | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index
- +------+------+------+------+------+------+------+------+
- * Index (Byte Index):
- +------+------+------+------+------+------+------+------+
- | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 |
- +------+------+------+------+------+------+------+------+
-
- +------+------+------+------+------+------+------+------+
- | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 |
- +------+------+------+------+------+------+------+------+
- * Result:
- +------+------+------+------+------+------+------+------+
- | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index
- +------+------+------+------+------+------+------+------+
- | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary
- +------+------+------+------+------+------+------+------+
- | 256 | 2 | 5 | 6 | Number
- +------+------+------+------+------+------+------+------+
-
- +------+------+------+------+------+------+------+------+
- | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index
- +------+------+------+------+------+------+------+------+
- | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary
- +------+------+------+------+------+------+------+------+
- | 3 | 7 | 4 | 8 | Number
- +------+------+------+------+------+------+------+------+
- */
-
-/* Set/get methods */
-
-/* Constants for use with _mm_prefetch. */
-enum _mm_hint {
- _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
- _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */
- _MM_HINT_T1 = 2, /* load data to L2 cache only */
- _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */
- _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */
- _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */
- _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */
- _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */
-};
-
-// Loads one cache line of data from address p to a location closer to the
-// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx
-FORCE_INLINE void _mm_prefetch(const void *p, int i)
-{
- (void) i;
- __builtin_prefetch(p);
-}
-
-// Copy the lower single-precision (32-bit) floating-point element of a to dst.
-//
-// dst[31:0] := a[31:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
-FORCE_INLINE float _mm_cvtss_f32(__m128 a)
-{
- return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-//
-// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
-#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer, and store the result in dst.
-//
-// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
-FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
-{
-#if defined(__aarch64__)
- return (int64_t) vgetq_lane_s32(
- vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-#else
- float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- float32_t diff = data - floor(data);
- if (diff > 0.5)
- return (int64_t) ceil(data);
- if (diff == 0.5) {
- int64_t f = (int64_t) floor(data);
- int64_t c = (int64_t) ceil(data);
- return c & 1 ? f : c;
- }
- return (int64_t) floor(data);
-#endif
-}
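-// Editorial note: the libm fallback above implements round-half-to-even
-// ("banker's rounding") to match the default x86 rounding mode, e.g.
-// 2.5 -> 2 but 3.5 -> 4; vcvtnq_s32_f32 rounds the same way in hardware,
-// though converting through int32 limits the result to the int32 range.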
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := 32*j
-// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
-FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
-{
- return vreinterpret_m64_s32(
- vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-//
-// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
-FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
-{
- return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers with truncation, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := 32*j
-// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32
-#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer with truncation, and store the result in dst.
-//
-// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32
-#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-//
-// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64
-FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
-{
- return vgetq_lane_s64(
- vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0);
-}
-
-// Sets the 128-bit value to zero
-// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_setzero_si128(void)
-{
- return vreinterpretq_m128i_s32(vdupq_n_s32(0));
-}
-
-// Clears the four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setzero_ps(void)
-{
- return vreinterpretq_m128_f32(vdupq_n_f32(0));
-}
-
-// Return vector of type __m128d with all elements set to zero.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd
-FORCE_INLINE __m128d _mm_setzero_pd(void)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vdupq_n_f64(0));
-#else
- return vreinterpretq_m128d_f32(vdupq_n_f32(0));
-#endif
-}
-
-// Sets the four single-precision, floating-point values to w.
-//
-// r0 := r1 := r2 := r3 := w
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set1_ps(float _w)
-{
- return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Sets the four single-precision, floating-point values to w.
-// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps1(float _w)
-{
- return vreinterpretq_m128_f32(vdupq_n_f32(_w));
-}
-
-// Sets the four single-precision, floating-point values to the four inputs.
-// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
-{
- float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
- return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Copy single-precision (32-bit) floating-point element a to the lower element
-// of dst, and zero the upper 3 elements.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss
-FORCE_INLINE __m128 _mm_set_ss(float a)
-{
- float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0};
- return vreinterpretq_m128_f32(vld1q_f32(data));
-}
-
-// Sets the four single-precision, floating-point values to the four inputs in
-// reverse order.
-// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
-{
- float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
- return vreinterpretq_m128_f32(vld1q_f32(data));
-}
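-// Editorial lane-order example (illustrative): _mm_set_ps(4, 3, 2, 1) and
-// _mm_setr_ps(1, 2, 3, 4) produce the same register, with lane 0 == 1.0f and
-// lane 3 == 4.0f; "setr" simply takes its arguments in memory order.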
-
-// Sets the 8 signed 16-bit integer values in reverse order.
-//
-// Return Value
-// r0 := w0
-// r1 := w1
-// ...
-// r7 := w7
-FORCE_INLINE __m128i _mm_setr_epi16(short w0,
- short w1,
- short w2,
- short w3,
- short w4,
- short w5,
- short w6,
- short w7)
-{
- int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
- return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
-}
-
-// Sets the 4 signed 32-bit integer values in reverse order
-// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
-{
- int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
- return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Set packed 64-bit integers in dst with the supplied values in reverse order.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64
-FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
-{
- return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
-}
-
-// Sets the 16 signed 8-bit integer values to b.
-//
-// r0 := b
-// r1 := b
-// ...
-// r15 := b
-//
-// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
-{
- return vreinterpretq_m128i_s8(vdupq_n_s8(w));
-}
-
-// Broadcast double-precision (64-bit) floating-point value a to all elements of
-// dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd
-FORCE_INLINE __m128d _mm_set1_pd(double d)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vdupq_n_f64(d));
-#else
- return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
-#endif
-}
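-// Editorial note: on ARMv7 the double is duplicated as a raw 64-bit bit
-// pattern (the *(int64_t *) &d pun) because float64x2_t is unavailable, so
-// the resulting __m128d is only a bit-pattern container, not arithmetic-ready.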
-
-// Sets the 8 signed 16-bit integer values to w.
-//
-// r0 := w
-// r1 := w
-// ...
-// r7 := w
-//
-// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set1_epi16(short w)
-{
- return vreinterpretq_m128i_s16(vdupq_n_s16(w));
-}
-
-// Sets the 16 signed 8-bit integer values.
-// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
- signed char b14,
- signed char b13,
- signed char b12,
- signed char b11,
- signed char b10,
- signed char b9,
- signed char b8,
- signed char b7,
- signed char b6,
- signed char b5,
- signed char b4,
- signed char b3,
- signed char b2,
- signed char b1,
- signed char b0)
-{
- int8_t ALIGN_STRUCT(16)
- data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
- (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
- (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
- (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
- return (__m128i) vld1q_s8(data);
-}
-
-// Sets the 8 signed 16-bit integer values.
-// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_set_epi16(short i7,
- short i6,
- short i5,
- short i4,
- short i3,
- short i2,
- short i1,
- short i0)
-{
- int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
- return vreinterpretq_m128i_s16(vld1q_s16(data));
-}
-
-// Sets the 16 signed 8-bit integer values in reverse order.
-// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
- signed char b1,
- signed char b2,
- signed char b3,
- signed char b4,
- signed char b5,
- signed char b6,
- signed char b7,
- signed char b8,
- signed char b9,
- signed char b10,
- signed char b11,
- signed char b12,
- signed char b13,
- signed char b14,
- signed char b15)
-{
- int8_t ALIGN_STRUCT(16)
- data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
- (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
- (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
- (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
- return (__m128i) vld1q_s8(data);
-}
-
-// Sets the 4 signed 32-bit integer values to i.
-//
-// r0 := i
-// r1 := i
-// r2 := i
-// r3 := i
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set1_epi32(int _i)
-{
- return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
-}
-
-// Sets the 2 signed 64-bit integer values to i.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
-FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
-{
- return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
-}
-
-// Sets the 2 signed 64-bit integer values to i.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
-FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
-{
- return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
-}
-
-// Sets the 4 signed 32-bit integer values.
-// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
-{
- int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
- return vreinterpretq_m128i_s32(vld1q_s32(data));
-}
-
-// Returns the __m128i structure with its two 64-bit integer values
-// initialized to the values of the two 64-bit integers passed in.
-// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
-{
- int64_t ALIGN_STRUCT(16) data[2] = {i2, i1};
- return vreinterpretq_m128i_s64(vld1q_s64(data));
-}
-
-// Returns the __m128i structure with its two 64-bit integer values
-// initialized to the values of the two 64-bit integers passed in.
-// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
-FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
-{
- return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
-}
-
-// Set packed double-precision (64-bit) floating-point elements in dst with the
-// supplied values.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
-FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
-{
- double ALIGN_STRUCT(16) data[2] = {e0, e1};
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
-#else
- return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
-#endif
-}
-
-// Stores four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
-{
- vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
-{
- vst1q_f32(p, vreinterpretq_f32_m128(a));
-}
-
-// Stores four 32-bit integer values (as a __m128i value) at the address p.
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
-{
- vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Stores four 32-bit integer values (as a __m128i value) at the address p.
-// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
-FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
-{
- vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
-}
-
-// Stores the lower single-precision, floating-point value.
-// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
-FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
-{
- vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
-}
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
-// or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
-FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
- vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
-#else
- vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1
-FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
-{
-#if defined(__aarch64__)
- float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
- vst1q_f64((float64_t *) mem_addr,
- vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
-#else
- float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
- vst1q_f32((float32_t *) mem_addr,
- vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
-#endif
-}
-
-// Store the lower double-precision (64-bit) floating-point element from a into
-// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
-// boundary or a general-protection exception may be generated.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd
-#define _mm_store1_pd _mm_store_pd1
-
-// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
-// elements) from a into memory. mem_addr does not need to be aligned on any
-// particular boundary.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd
-FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
-{
- _mm_store_pd(mem_addr, a);
-}
-
-// Reads the lower 64 bits of b and stores them into the lower 64 bits of a.
-// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
-{
- uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a));
- uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b));
- *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi));
-}
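-// Editorial note: unlike the x86 instruction, which writes only 64 bits to
-// memory, this emulation rewrites the full 128-bit destination while
-// preserving its upper 64 bits, so the destination must be a whole __m128i.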
-
-// Stores the lower two single-precision floating point values of a to the
-// address p.
-//
-// *p0 := a0
-// *p1 := a1
-//
-// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx
-FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
-{
- *p = vreinterpret_m64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Stores the upper two single-precision, floating-point values of a to the
-// address p.
-//
-// *p0 := a2
-// *p1 := a3
-//
-// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx
-FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
-{
- *p = vreinterpret_m64_f32(vget_high_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Loads a single single-precision, floating-point value, copying it into all
-// four words
-// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load1_ps(const float *p)
-{
- return vreinterpretq_m128_f32(vld1q_dup_f32(p));
-}
-
-// Load a single-precision (32-bit) floating-point element from memory into all
-// elements of dst.
-//
-// dst[31:0] := MEM[mem_addr+31:mem_addr]
-// dst[63:32] := MEM[mem_addr+31:mem_addr]
-// dst[95:64] := MEM[mem_addr+31:mem_addr]
-// dst[127:96] := MEM[mem_addr+31:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1
-#define _mm_load_ps1 _mm_load1_ps
-
-// Sets the lower two single-precision, floating-point values with 64
-// bits of data loaded from the address p; the upper two values are passed
-// through from a.
-//
-// Return Value
-// r0 := *p0
-// r1 := *p1
-// r2 := a2
-// r3 := a3
-//
-// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
-{
- return vreinterpretq_m128_f32(
- vcombine_f32(vld1_f32((const float32_t *) p),
- vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Load 4 single-precision (32-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-//
-// dst[31:0] := MEM[mem_addr+127:mem_addr+96]
-// dst[63:32] := MEM[mem_addr+95:mem_addr+64]
-// dst[95:64] := MEM[mem_addr+63:mem_addr+32]
-// dst[127:96] := MEM[mem_addr+31:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps
-FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
-{
- float32x4_t v = vrev64q_f32(vld1q_f32(p));
- return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
-}
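-// Editorial step-through (illustrative): for memory {1, 2, 3, 4}, vld1q_f32
-// yields lanes (1, 2, 3, 4); vrev64q_f32 swaps within each 64-bit half,
-// giving (2, 1, 4, 3); vextq_f32(v, v, 2) rotates by two lanes: (4, 3, 2, 1).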
-
-// Sets the upper two single-precision, floating-point values with 64
-// bits of data loaded from the address p; the lower two values are passed
-// through from a.
-//
-// r0 := a0
-// r1 := a1
-// r2 := *p0
-// r3 := *p1
-//
-// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx
-FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
-{
- return vreinterpretq_m128_f32(
- vcombine_f32(vget_low_f32(vreinterpretq_f32_m128(a)),
- vld1_f32((const float32_t *) p)));
-}
-
-// Loads four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_load_ps(const float *p)
-{
- return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Loads four single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
-{
- // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps
- // are equivalent.
- return vreinterpretq_m128_f32(vld1q_f32(p));
-}
-
-// Load unaligned 16-bit integer from memory into the first element of dst.
-//
-// dst[15:0] := MEM[mem_addr+15:mem_addr]
-// dst[MAX:16] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16
-FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
-{
- return vreinterpretq_m128i_s16(
- vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
-}
-
-// Load unaligned 64-bit integer from memory into the first element of dst.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[MAX:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64
-FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
-{
- return vreinterpretq_m128i_s64(
- vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower of dst, and zero the upper element. mem_addr does not need to be
-// aligned on any particular boundary.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
-FORCE_INLINE __m128d _mm_load_sd(const double *p)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
-#else
- const float *fp = (const float *) p;
- float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
- return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Loads two double-precision, floating-point values from 16-byte aligned
-// memory.
-//
-// dst[127:0] := MEM[mem_addr+127:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
-FORCE_INLINE __m128d _mm_load_pd(const double *p)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vld1q_f64(p));
-#else
- const float *fp = (const float *) p;
- float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
- return vreinterpretq_m128d_f32(vld1q_f32(data));
-#endif
-}
-
-// Loads two double-precision, floating-point values from unaligned memory.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
-FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
-{
- return _mm_load_pd(p);
-}
-
-// Loads a single-precision, floating-point value into the low word and
-// clears the upper three words.
-// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_load_ss(const float *p)
-{
- return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
-}
-
-FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
-{
- /* Load the lower 64 bits of the value pointed to by p into the
- * lower 64 bits of the result, zeroing the upper 64 bits of the result.
- */
- return vreinterpretq_m128i_s32(
- vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// lower element of dst, and copy the upper element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := a[127:64]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
-FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(
- vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
-#else
- return vreinterpretq_m128d_f32(
- vcombine_f32(vld1_f32((const float *) p),
- vget_high_f32(vreinterpretq_f32_m128d(a))));
-#endif
-}
-
-// Load 2 double-precision (64-bit) floating-point elements from memory into dst
-// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-//
-// dst[63:0] := MEM[mem_addr+127:mem_addr+64]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd
-FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
-{
-#if defined(__aarch64__)
- float64x2_t v = vld1q_f64(p);
- return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
-#else
- int64x2_t v = vld1q_s64((const int64_t *) p);
- return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
-#endif
-}
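-// Editorial note: vextq_*(v, v, 1) on a two-lane vector rotates it by one
-// lane, i.e. swaps the halves, so memory {e0, e1} loads as the register
-// (e1, e0).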
-
-// Sets the low word to the single-precision, floating-point value of b
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100)
-FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
- vreinterpretq_f32_m128(a), 0));
-}
-
-// Move the lower double-precision (64-bit) floating-point element from b to the
-// lower element of dst, and copy the upper element from a to the upper element
-// of dst.
-//
-// dst[63:0] := b[63:0]
-// dst[127:64] := a[127:64]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd
-FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
-{
- return vreinterpretq_m128d_f32(
- vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
- vget_high_f32(vreinterpretq_f32_m128d(a))));
-}
-
-// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
-// upper element.
-//
-// dst[63:0] := a[63:0]
-// dst[127:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64
-FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
-{
- return vreinterpretq_m128i_s64(
- vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
-}
-
-/* Logic/Binary operations */
-
-// Computes the bitwise AND-NOT of the four single-precision, floating-point
-// values of a and b.
-//
-// r0 := ~a0 & b0
-// r1 := ~a1 & b1
-// r2 := ~a2 & b2
-// r3 := ~a3 & b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_s32(
- vbicq_s32(vreinterpretq_s32_m128(b),
- vreinterpretq_s32_m128(a))); // *NOTE* argument swap
-}
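-// Editorial note: vbicq_s32(x, y) computes x & ~y (it clears in x the bits
-// set in y), so swapping the arguments as above yields the SSE semantics
-// b & ~a.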
-
-// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
-// elements in a and then AND with b, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := j*64
-// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd
-FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
-{
- // *NOTE* argument swap
- return vreinterpretq_m128d_s64(
- vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
-}
-
-// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the
-// 128-bit value in a.
-//
-// r := (~a) & b
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vbicq_s32(vreinterpretq_s32_m128i(b),
- vreinterpretq_s32_m128i(a))); // *NOTE* argument swap
-}
-
-// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in
-// b.
-//
-// r := a & b
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the bitwise AND of the four single-precision, floating-point values
-// of a and b.
-//
-// r0 := a0 & b0
-// r1 := a1 & b1
-// r2 := a2 & b2
-// r3 := a3 & b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_s32(
- vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Compute the bitwise AND of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := j*64
-// dst[i+63:i] := a[i+63:i] AND b[i+63:i]
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd
-FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
-{
- return vreinterpretq_m128d_s64(
- vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Computes the bitwise OR of the four single-precision, floating-point values
-// of a and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_s32(
- vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Computes bitwise EXOR (exclusive-or) of the four single-precision,
-// floating-point values of a and b.
-// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_s32(
- veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
-}
-
-// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := j*64
-// dst[i+63:i] := a[i+63:i] XOR b[i+63:i]
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd
-FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
-{
- return vreinterpretq_m128d_s64(
- veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Compute the bitwise OR of packed double-precision (64-bit) floating-point
-// elements in a and b, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd
-FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
-{
- return vreinterpretq_m128d_s64(
- vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
-}
-
-// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b.
-//
-// r := a | b
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in
-// b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Duplicate odd-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps
-FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
-{
-#if __has_builtin(__builtin_shufflevector)
- return vreinterpretq_m128_f32(__builtin_shufflevector(
- vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
-#else
- float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
- float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
- float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
- return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-// Duplicate even-indexed single-precision (32-bit) floating-point elements
-// from a, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps
-FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
-{
-#if __has_builtin(__builtin_shufflevector)
- return vreinterpretq_m128_f32(__builtin_shufflevector(
- vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
-#else
- float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
- float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
- return vreinterpretq_m128_f32(vld1q_f32(data));
-#endif
-}
-
-// Moves the upper two values of B into the lower two values of A.
-//
-// r3 := a3
-// r2 := a2
-// r1 := b3
-// r0 := b2
-FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B)
-{
- float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A));
- float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B));
- return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
-}
-
-// Moves the lower two values of B into the upper two values of A.
-//
-// r3 := b1
-// r2 := b0
-// r1 := a1
-// r0 := a0
-FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
-{
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
- float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
- return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
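-// Editorial worked example (illustrative): with a = (a0, a1, a2, a3) and
-// b = (b0, b1, b2, b3), _mm_movehl_ps(a, b) yields (b2, b3, a2, a3) and
-// _mm_movelh_ps(a, b) yields (a0, a1, b0, b1).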
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 3
-// i := j*32
-// dst[i+31:i] := ABS(a[i+31:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32
-FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
-{
- return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 7
-// i := j*16
-// dst[i+15:i] := ABS(a[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16
-FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
-{
- return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 15
-// i := j*8
-// dst[i+7:i] := ABS(a[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8
-FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
-{
- return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
-}
-
-// Compute the absolute value of packed signed 32-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 1
-// i := j*32
-// dst[i+31:i] := ABS(a[i+31:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32
-FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
-{
- return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
-}
-
-// Compute the absolute value of packed signed 16-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := ABS(a[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16
-FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
-{
- return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
-}
-
-// Compute the absolute value of packed signed 8-bit integers in a, and store
-// the unsigned results in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := ABS(a[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8
-FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
-{
- return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
-}
-
-// Takes the upper 64 bits of a and places it in the low end of the result;
-// takes the lower 64 bits of b and places it into the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
-{
- float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
- float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
-}
-
-// Takes the lower two 32-bit values from a, swaps them, and places them in
-// the low end of the result; takes the upper two 32-bit values from b, swaps
-// them, and places them in the high end of the result.
-FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
-{
- float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
- float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
- return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
-{
- float32x2_t a21 = vget_high_f32(
- vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
- float32x2_t b03 = vget_low_f32(
- vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
- return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
-{
- float32x2_t a03 = vget_low_f32(
- vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
- float32x2_t b21 = vget_high_f32(
- vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
- return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
-{
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
- float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
-{
- float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
- float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
-{
- float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
- float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
- return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
-}
-
-// Keeps the low 64 bits of a in the low end of the result and puts the high
-// 64 bits of b in the high end.
-FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
-{
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
- float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
-{
- float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
- float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
- return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
-{
- float32x2_t a22 =
- vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
- float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
- return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
-{
- float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
- float32x2_t b22 =
- vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
- return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
-{
- float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- float32x2_t a22 =
- vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
- float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/
- float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
-{
- float32x2_t a33 =
- vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
- float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
- return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
-{
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
- float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
- float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
- float32x2_t b20 = vset_lane_f32(b2, b00, 1);
- return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
-{
- float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
- float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
- float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
- float32x2_t b20 = vset_lane_f32(b2, b00, 1);
- return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
-}
-
-FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
-{
- float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
- float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
- float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
- float32x2_t b20 = vset_lane_f32(b2, b00, 1);
- return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
-}
-
-// NEON does not support a general purpose permute intrinsic
-// Selects four specific single-precision, floating-point values from a and b,
-// based on the mask i.
-//
-// C equivalent:
-// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
-// __constrange(0, 255) int imm) {
-// __m128 ret;
-// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
-// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
-// return ret;
-// }
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx
-#define _mm_shuffle_ps_default(a, b, imm) \
- __extension__({ \
- float32x4_t ret; \
- ret = vmovq_n_f32( \
- vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \
- ret = vsetq_lane_f32( \
- vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
- ret, 1); \
- ret = vsetq_lane_f32( \
- vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \
- ret, 2); \
- ret = vsetq_lane_f32( \
- vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \
- ret, 3); \
- vreinterpretq_m128_f32(ret); \
- })
-
-// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
-// int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_ps(a, b, imm) \
- __extension__({ \
- float32x4_t _input1 = vreinterpretq_f32_m128(a); \
- float32x4_t _input2 = vreinterpretq_f32_m128(b); \
- float32x4_t _shuf = __builtin_shufflevector( \
- _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
- (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
- vreinterpretq_m128_f32(_shuf); \
- })
-#else // generic
-#define _mm_shuffle_ps(a, b, imm) \
- __extension__({ \
- __m128 ret; \
- switch (imm) { \
- case _MM_SHUFFLE(1, 0, 3, 2): \
- ret = _mm_shuffle_ps_1032((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 3, 0, 1): \
- ret = _mm_shuffle_ps_2301((a), (b)); \
- break; \
- case _MM_SHUFFLE(0, 3, 2, 1): \
- ret = _mm_shuffle_ps_0321((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 1, 0, 3): \
- ret = _mm_shuffle_ps_2103((a), (b)); \
- break; \
- case _MM_SHUFFLE(1, 0, 1, 0): \
- ret = _mm_movelh_ps((a), (b)); \
- break; \
- case _MM_SHUFFLE(1, 0, 0, 1): \
- ret = _mm_shuffle_ps_1001((a), (b)); \
- break; \
- case _MM_SHUFFLE(0, 1, 0, 1): \
- ret = _mm_shuffle_ps_0101((a), (b)); \
- break; \
- case _MM_SHUFFLE(3, 2, 1, 0): \
- ret = _mm_shuffle_ps_3210((a), (b)); \
- break; \
- case _MM_SHUFFLE(0, 0, 1, 1): \
- ret = _mm_shuffle_ps_0011((a), (b)); \
- break; \
- case _MM_SHUFFLE(0, 0, 2, 2): \
- ret = _mm_shuffle_ps_0022((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 2, 0, 0): \
- ret = _mm_shuffle_ps_2200((a), (b)); \
- break; \
- case _MM_SHUFFLE(3, 2, 0, 2): \
- ret = _mm_shuffle_ps_3202((a), (b)); \
- break; \
- case _MM_SHUFFLE(3, 2, 3, 2): \
- ret = _mm_movehl_ps((b), (a)); \
- break; \
- case _MM_SHUFFLE(1, 1, 3, 3): \
- ret = _mm_shuffle_ps_1133((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 0, 1, 0): \
- ret = _mm_shuffle_ps_2010((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 0, 0, 1): \
- ret = _mm_shuffle_ps_2001((a), (b)); \
- break; \
- case _MM_SHUFFLE(2, 0, 3, 2): \
- ret = _mm_shuffle_ps_2032((a), (b)); \
- break; \
- default: \
- ret = _mm_shuffle_ps_default((a), (b), (imm)); \
- break; \
- } \
- ret; \
- })
-#endif
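-// Editorial usage sketch (illustrative): reversing a vector with the macro
-// above.
-//   __m128 v = _mm_setr_ps(1, 2, 3, 4);
-//   __m128 r = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 1, 2, 3)); /* (4,3,2,1) */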
-
-// Takes the upper 64 bits of a and places it in the low end of the result;
-// takes the lower 64 bits of a and places it into the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
-{
- int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
- int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
- return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
-}
-
-// Takes the lower two 32-bit values from a, swaps them, and places them in
-// the low end of the result; takes the upper two 32-bit values from a, swaps
-// them, and places them in the high end of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
-{
- int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
- int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
- return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
-}
-
-// Rotates the least significant 32 bits into the most significant 32 bits,
-// and shifts the rest down.
-FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
-{
- return vreinterpretq_m128i_s32(
- vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
-}
-
-// Rotates the most significant 32 bits into the least significant 32 bits,
-// and shifts the rest up.
-FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
-{
- return vreinterpretq_m128i_s32(
- vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
-}
-
-// Duplicates the lower 64 bits of a into both the lower and the upper 64 bits
-// of the result.
-FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
-{
- int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
- return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
-}
-
-// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places the
-// result in the lower 64 bits; places the unswapped lower 64 bits of a in the
-// upper 64 bits.
-FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
-{
- int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
- int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
- return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
-}
-
-// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places the
-// result in both the lower and the upper 64 bits.
-FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
-{
- int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
- return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
-{
- int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
- int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
- return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
-{
- int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
- int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
- return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
-}
-
-FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
-{
- int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
- int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
- return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
-}
-
-// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-// corresponding 8-bit element of b, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8
-FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
-{
- int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a
- uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b
- uint8x16_t idx_masked =
- vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
-#elif defined(__GNUC__)
- int8x16_t ret;
- // %e and %f represent the even and odd D registers
- // respectively.
- __asm__ __volatile__(
- "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
- "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
- : [ret] "=&w"(ret)
- : [tbl] "w"(tbl), [idx] "w"(idx_masked));
- return vreinterpretq_m128i_s8(ret);
-#else
-    // generic ARMv7 fallback: split the table and do two vtbl2 lookups
- int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
- return vreinterpretq_m128i_s8(
- vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
- vtbl2_s8(a_split, vget_high_u8(idx_masked))));
-#endif
-}
-
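-// Illustrative note (not from the original source): each control byte of b
-// selects one source byte of a, and a control byte with its most significant
-// bit set zeroes the destination byte; masking with 0x8F keeps exactly the
-// bits the table lookups need. For example, a control vector of
-// {15, 14, 13, ..., 1, 0} reverses the bytes of a, and a control vector of
-// all 0x80 bytes yields all zeros.
-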
-// C equivalent:
-// __m128i _mm_shuffle_epi32_default(__m128i a,
-// __constrange(0, 255) int imm) {
-// __m128i ret;
-// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3];
-// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
-// return ret;
-// }
-#define _mm_shuffle_epi32_default(a, imm) \
- __extension__({ \
- int32x4_t ret; \
- ret = vmovq_n_s32( \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \
- ret = vsetq_lane_s32( \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \
- ret, 1); \
- ret = vsetq_lane_s32( \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
- ret, 2); \
- ret = vsetq_lane_s32( \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \
- ret, 3); \
- vreinterpretq_m128i_s32(ret); \
- })
-
-// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255)
-// int imm)
-#if defined(__aarch64__)
-#define _mm_shuffle_epi32_splat(a, imm) \
- __extension__({ \
- vreinterpretq_m128i_s32( \
- vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \
- })
-#else
-#define _mm_shuffle_epi32_splat(a, imm) \
- __extension__({ \
- vreinterpretq_m128i_s32( \
- vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \
- })
-#endif
-
-// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
-// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
-// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
-// __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shuffle_epi32(a, imm) \
- __extension__({ \
- int32x4_t _input = vreinterpretq_s32_m128i(a); \
- int32x4_t _shuf = __builtin_shufflevector( \
- _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
- ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
- vreinterpretq_m128i_s32(_shuf); \
- })
-#else // generic
-#define _mm_shuffle_epi32(a, imm) \
- __extension__({ \
- __m128i ret; \
- switch (imm) { \
- case _MM_SHUFFLE(1, 0, 3, 2): \
- ret = _mm_shuffle_epi_1032((a)); \
- break; \
- case _MM_SHUFFLE(2, 3, 0, 1): \
- ret = _mm_shuffle_epi_2301((a)); \
- break; \
- case _MM_SHUFFLE(0, 3, 2, 1): \
- ret = _mm_shuffle_epi_0321((a)); \
- break; \
- case _MM_SHUFFLE(2, 1, 0, 3): \
- ret = _mm_shuffle_epi_2103((a)); \
- break; \
- case _MM_SHUFFLE(1, 0, 1, 0): \
- ret = _mm_shuffle_epi_1010((a)); \
- break; \
- case _MM_SHUFFLE(1, 0, 0, 1): \
- ret = _mm_shuffle_epi_1001((a)); \
- break; \
- case _MM_SHUFFLE(0, 1, 0, 1): \
- ret = _mm_shuffle_epi_0101((a)); \
- break; \
- case _MM_SHUFFLE(2, 2, 1, 1): \
- ret = _mm_shuffle_epi_2211((a)); \
- break; \
- case _MM_SHUFFLE(0, 1, 2, 2): \
- ret = _mm_shuffle_epi_0122((a)); \
- break; \
- case _MM_SHUFFLE(3, 3, 3, 2): \
- ret = _mm_shuffle_epi_3332((a)); \
- break; \
- case _MM_SHUFFLE(0, 0, 0, 0): \
- ret = _mm_shuffle_epi32_splat((a), 0); \
- break; \
- case _MM_SHUFFLE(1, 1, 1, 1): \
- ret = _mm_shuffle_epi32_splat((a), 1); \
- break; \
- case _MM_SHUFFLE(2, 2, 2, 2): \
- ret = _mm_shuffle_epi32_splat((a), 2); \
- break; \
- case _MM_SHUFFLE(3, 3, 3, 3): \
- ret = _mm_shuffle_epi32_splat((a), 3); \
- break; \
- default: \
- ret = _mm_shuffle_epi32_default((a), (imm)); \
- break; \
- } \
- ret; \
- })
-#endif
-
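-// Illustrative example (hypothetical values, assuming the usual
-// _mm_setr_epi32 from this header):
-//
-//   __m128i v = _mm_setr_epi32(10, 11, 12, 13);
-//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3)); // imm = 0x1B
-//
-// r holds {13, 12, 11, 10}; the special-cased imm values above map common
-// shuffles onto cheaper NEON rev/ext/dup/combine sequences.
-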
-// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
-// by imm.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
-// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
-// __constrange(0,255) int
-// imm)
-#define _mm_shufflelo_epi16_function(a, imm) \
- __extension__({ \
- int16x8_t ret = vreinterpretq_s16_m128i(a); \
- int16x4_t lowBits = vget_low_s16(ret); \
- ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
- ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
- 1); \
- ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
- 2); \
- ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
- 3); \
- vreinterpretq_m128i_s16(ret); \
- })
-
-// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
-// __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shufflelo_epi16(a, imm) \
- __extension__({ \
- int16x8_t _input = vreinterpretq_s16_m128i(a); \
- int16x8_t _shuf = __builtin_shufflevector( \
- _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
- (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
- vreinterpretq_m128i_s16(_shuf); \
- })
-#else // generic
-#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
-#endif
-
-// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
-// by imm.
-// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a,
-// __constrange(0,255) int
-// imm)
-#define _mm_shufflehi_epi16_function(a, imm) \
- __extension__({ \
- int16x8_t ret = vreinterpretq_s16_m128i(a); \
- int16x4_t highBits = vget_high_s16(ret); \
- ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \
- ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
- 5); \
- ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
- 6); \
- ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
- 7); \
- vreinterpretq_m128i_s16(ret); \
- })
-
-// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
-// __constrange(0,255) int imm)
-#if __has_builtin(__builtin_shufflevector)
-#define _mm_shufflehi_epi16(a, imm) \
- __extension__({ \
- int16x8_t _input = vreinterpretq_s16_m128i(a); \
- int16x8_t _shuf = __builtin_shufflevector( \
- _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
- (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
- (((imm) >> 6) & 0x3) + 4); \
- vreinterpretq_m128i_s16(_shuf); \
- })
-#else // generic
-#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
-#endif
-
-// Blend packed 16-bit integers from a and b using control mask imm8, and store
-// the results in dst.
-//
-// FOR j := 0 to 7
-// i := j*16
-// IF imm8[j]
-// dst[i+15:i] := b[i+15:i]
-// ELSE
-// dst[i+15:i] := a[i+15:i]
-// FI
-// ENDFOR
-// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
-// __constrange(0,255) int imm)
-#define _mm_blend_epi16(a, b, imm) \
- __extension__({ \
- const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \
- ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \
- uint16x8_t _mask_vec = vld1q_u16(_mask); \
- uint16x8_t _a = vreinterpretq_u16_m128i(a); \
- uint16x8_t _b = vreinterpretq_u16_m128i(b); \
- vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \
- })
-
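-// Illustrative note (hypothetical values): bit j of imm selects lane j of b,
-// so imm = 0x0F takes the low four 16-bit lanes from b and the high four from
-// a. When imm is a compile-time constant the mask table typically folds to a
-// constant, and vbslq_u16 then performs the per-lane select in a single
-// instruction.
-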
-// Blend packed 8-bit integers from a and b using mask, and store the results in
-// dst.
-//
-// FOR j := 0 to 15
-// i := j*8
-// IF mask[i+7]
-// dst[i+7:i] := b[i+7:i]
-// ELSE
-// dst[i+7:i] := a[i+7:i]
-// FI
-// ENDFOR
-FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
-{
- // Use a signed shift right to create a mask with the sign bit
- uint8x16_t mask =
- vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
- uint8x16_t a = vreinterpretq_u8_m128i(_a);
- uint8x16_t b = vreinterpretq_u8_m128i(_b);
- return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
-}
-
-/* Shifts */
-
-// Shift packed 16-bit integers in a right by imm while shifting in sign
-// bits, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
-FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
-{
- const int count = (imm & ~15) ? 15 : imm;
- return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
-}
-
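-// Illustrative example (hypothetical values): an arithmetic shift replicates
-// the sign bit, so a lane holding -4 (0xFFFC) shifted right by 1 yields -2
-// (0xFFFE). Counts of 16 or more are clamped to 15, so they produce 0 or -1
-// depending on the lane's sign, matching PSRAW.
-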
-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
-// shifting in zeros.
-//
-// r0 := a0 << count
-// r1 := a1 << count
-// ...
-// r7 := a7 << count
-//
-// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
-#define _mm_slli_epi16(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) <= 0) { \
- ret = a; \
- } else if ((imm) > 15) { \
- ret = _mm_setzero_si128(); \
- } else { \
- ret = vreinterpretq_m128i_s16( \
- vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
- } \
- ret; \
- })
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
-// shifting in zeros.
-// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
-// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
-FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
-{
- if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
- return a;
- if (imm > 31) /* TODO: add unlikely macro */
- return _mm_setzero_si128();
- return vreinterpretq_m128i_s32(
- vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
-}
-
-// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
-// store the results in dst.
-FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
-{
- if (imm <= 0) /* TODO: add constant range macro: [0, 255] */
- return a;
- if (imm > 63) /* TODO: add unlikely macro */
- return _mm_setzero_si128();
- return vreinterpretq_m128i_s64(
- vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
-}
-
-// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-//
-// FOR j := 0 to 7
-// i := j*16
-// IF imm8[7:0] > 15
-// dst[i+15:i] := 0
-// ELSE
-// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
-#define _mm_srli_epi16(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) == 0) { \
- ret = a; \
- } else if (0 < (imm) && (imm) < 16) { \
- ret = vreinterpretq_m128i_u16( \
- vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
- } else { \
- ret = _mm_setzero_si128(); \
- } \
- ret; \
- })
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-//
-// FOR j := 0 to 3
-// i := j*32
-// IF imm8[7:0] > 31
-// dst[i+31:i] := 0
-// ELSE
-// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
-// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_epi32(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) == 0) { \
- ret = a; \
- } else if (0 < (imm) && (imm) < 32) { \
- ret = vreinterpretq_m128i_u32( \
- vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
- } else { \
- ret = _mm_setzero_si128(); \
- } \
- ret; \
- })
-
-// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
-// store the results in dst.
-//
-// FOR j := 0 to 1
-// i := j*64
-// IF imm8[7:0] > 63
-// dst[i+63:i] := 0
-// ELSE
-// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
-#define _mm_srli_epi64(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) == 0) { \
- ret = a; \
- } else if (0 < (imm) && (imm) < 64) { \
- ret = vreinterpretq_m128i_u64( \
- vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
- } else { \
- ret = _mm_setzero_si128(); \
- } \
- ret; \
- })
-
-// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
-// and store the results in dst.
-//
-// FOR j := 0 to 3
-// i := j*32
-// IF imm8[7:0] > 31
-// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
-// ELSE
-// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
-// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
-#define _mm_srai_epi32(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) == 0) { \
- ret = a; \
- } else if (0 < (imm) && (imm) < 32) { \
- ret = vreinterpretq_m128i_s32( \
- vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
- } else { \
- ret = vreinterpretq_m128i_s32( \
- vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
- } \
- ret; \
- })
-
-// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
-// imm must be an immediate.
-//
-// r := srl(a, imm*8)
-//
-// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
-// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_srli_si128(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) <= 0) { \
- ret = a; \
- } else if ((imm) > 15) { \
- ret = _mm_setzero_si128(); \
- } else { \
- ret = vreinterpretq_m128i_s8( \
- vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
- } \
- ret; \
- })
-
-// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
-// must be an immediate.
-//
-// r := a << (imm * 8)
-//
-// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
-// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
-#define _mm_slli_si128(a, imm) \
- __extension__({ \
- __m128i ret; \
- if ((imm) <= 0) { \
- ret = a; \
- } else if ((imm) > 15) { \
- ret = _mm_setzero_si128(); \
- } else { \
- ret = vreinterpretq_m128i_s8(vextq_s8( \
- vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
- } \
- ret; \
- })
-
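-// Illustrative note (hypothetical values): a byte shift left by 4 moves each
-// 32-bit lane up one position and zero-fills the bottom, so 32-bit lanes
-// {1, 2, 3, 4} become {0, 1, 2, 3}. The vextq_s8 call extracts 16 bytes
-// starting (16 - imm) bytes into the concatenation of a zero vector and a,
-// which yields imm leading zero bytes followed by the low bytes of a.
-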
-// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while
-// shifting in zeros.
-//
-// r0 := a0 << count
-// r1 := a1 << count
-// ...
-// r7 := a7 << count
-//
-// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 15)
- return _mm_setzero_si128();
-
- int16x8_t vc = vdupq_n_s16((int16_t) c);
- return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
-}
-
-// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while
-// shifting in zeros.
-//
-// r0 := a0 << count
-// r1 := a1 << count
-// r2 := a2 << count
-// r3 := a3 << count
-//
-// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 31)
- return _mm_setzero_si128();
-
- int32x4_t vc = vdupq_n_s32((int32_t) c);
- return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
-}
-
-// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while
-// shifting in zeros.
-//
-// r0 := a0 << count
-// r1 := a1 << count
-//
-// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 63)
- return _mm_setzero_si128();
-
- int64x2_t vc = vdupq_n_s64((int64_t) c);
- return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
-}
-
-// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits
-// while shifting in zeros.
-//
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-// ...
-// r7 := srl(a7, count)
-//
-// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 15)
- return _mm_setzero_si128();
-
- int16x8_t vc = vdupq_n_s16(-(int16_t) c);
- return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
-}
-
-// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits
-// while shifting in zeros.
-//
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-// r2 := srl(a2, count)
-// r3 := srl(a3, count)
-//
-// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 31)
- return _mm_setzero_si128();
-
- int32x4_t vc = vdupq_n_s32(-(int32_t) c);
- return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
-}
-
-// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits
-// while shifting in zeros.
-//
-// r0 := srl(a0, count)
-// r1 := srl(a1, count)
-//
-// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
-{
- uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
- if (c > 63)
- return _mm_setzero_si128();
-
- int64x2_t vc = vdupq_n_s64(-(int64_t) c);
- return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
-}
-
-// NEON does not provide a version of this function.
-// Creates a 16-bit mask from the most significant bits of the 16 signed or
-// unsigned 8-bit integers in a and zero extends the upper bits.
-// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_epi8(__m128i a)
-{
-#if defined(__aarch64__)
- uint8x16_t input = vreinterpretq_u8_m128i(a);
- const int8_t ALIGN_STRUCT(16)
- xr[16] = {-7, -6, -5, -4, -3, -2, -1, 0, -7, -6, -5, -4, -3, -2, -1, 0};
- const uint8x16_t mask_and = vdupq_n_u8(0x80);
- const int8x16_t mask_shift = vld1q_s8(xr);
- const uint8x16_t mask_result =
- vshlq_u8(vandq_u8(input, mask_and), mask_shift);
- uint8x8_t lo = vget_low_u8(mask_result);
- uint8x8_t hi = vget_high_u8(mask_result);
-
- return vaddv_u8(lo) + (vaddv_u8(hi) << 8);
-#else
- // Use increasingly wide shifts+adds to collect the sign bits
- // together.
- // Since the widening shifts would be rather confusing to follow in little
- // endian, everything will be illustrated in big endian order instead. This
- // has a different result - the bits would actually be reversed on a big
- // endian machine.
-
- // Starting input (only half the elements are shown):
- // 89 ff 1d c0 00 10 99 33
- uint8x16_t input = vreinterpretq_u8_m128i(a);
-
- // Shift out everything but the sign bits with an unsigned shift right.
- //
- // Bytes of the vector::
- // 89 ff 1d c0 00 10 99 33
- // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
- // | | | | | | | |
- // 01 01 00 01 00 00 01 00
- //
- // Bits of first important lane(s):
- // 10001001 (89)
- // \______
- // |
- // 00000001 (01)
- uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
-
- // Merge the even lanes together with a 16-bit unsigned shift right + add.
- // 'xx' represents garbage data which will be ignored in the final result.
- // In the important bytes, the add functions like a binary OR.
- //
- // 01 01 00 01 00 00 01 00
- // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
- // \| \| \| \|
- // xx 03 xx 01 xx 00 xx 02
- //
- // 00000001 00000001 (01 01)
- // \_______ |
- // \|
- // xxxxxxxx xxxxxx11 (xx 03)
- uint32x4_t paired16 =
- vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
-
- // Repeat with a wider 32-bit shift + add.
- // xx 03 xx 01 xx 00 xx 02
- // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
- // 14))
- // \| \|
- // xx xx xx 0d xx xx xx 02
- //
- // 00000011 00000001 (03 01)
- // \\_____ ||
- // '----.\||
- // xxxxxxxx xxxx1101 (xx 0d)
- uint64x2_t paired32 =
- vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
-
-    // Last, an even wider 64-bit shift + add to get our result in the low
-    // 8-bit lanes.
-    // xx xx xx 0d xx xx xx 02
-    //     \_________ |   paired64 = (uint8x8_t)(paired32 + (paired32 >> 28))
- // \|
- // xx xx xx xx xx xx xx d2
- //
- // 00001101 00000010 (0d 02)
- // \ \___ | |
- // '---. \| |
- // xxxxxxxx 11010010 (xx d2)
- uint8x16_t paired64 =
- vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
-
- // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
- // xx xx xx xx xx xx xx d2
- // || return paired64[0]
- // d2
- // Note: Little endian would return the correct value 4b (01001011) instead.
- return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
-#endif
-}
-
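-// Worked example (illustrative): if only bytes 0 and 15 of a have their sign
-// bit set, the aarch64 path above shifts those two 0x80 bits to positions 0
-// and 7 within their 64-bit halves, the horizontal adds produce 0x01 and
-// 0x80, and the result is 0x01 + (0x80 << 8) = 0x8001, matching PMOVMSKB.
-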
-// Copy the lower 64-bit integer in a to dst.
-//
-// dst[63:0] := a[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
-FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
-{
- return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
-}
-
-// Copy the 64-bit integer a to the lower element of dst, and zero the upper
-// element.
-//
-// dst[63:0] := a[63:0]
-// dst[127:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
-FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
-{
- return vreinterpretq_m128i_s64(
- vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
-}
-
-// NEON does not provide this method
-// Creates a 4-bit mask from the most significant bits of the four
-// single-precision, floating-point values.
-// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
-FORCE_INLINE int _mm_movemask_ps(__m128 a)
-{
- uint32x4_t input = vreinterpretq_u32_m128(a);
-#if defined(__aarch64__)
- static const int32x4_t shift = {0, 1, 2, 3};
- uint32x4_t tmp = vshrq_n_u32(input, 31);
- return vaddvq_u32(vshlq_u32(tmp, shift));
-#else
- // Uses the exact same method as _mm_movemask_epi8, see that for details.
- // Shift out everything but the sign bits with a 32-bit unsigned shift
- // right.
- uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
- // Merge the two pairs together with a 64-bit unsigned shift right + add.
- uint8x16_t paired =
- vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
- // Extract the result.
- return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
-#endif
-}
-
-// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
-// all 1's, and return 1 if the result is zero, otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones
-FORCE_INLINE int _mm_test_all_ones(__m128i a)
-{
- return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
- ~(uint64_t) 0;
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and
-// mask, and return 1 if the result is zero, otherwise return 0.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros
-FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
-{
- int64x2_t a_and_mask =
- vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
- return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0
- : 1;
-}
-
-/* Math operations */
-
-// Subtracts the four single-precision, floating-point values of a and b.
-//
-// r0 := a0 - b0
-// r1 := a1 - b1
-// r2 := a2 - b2
-// r3 := a3 - b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_f32(
- vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Subtract the lower single-precision (32-bit) floating-point element in b from
-// the lower single-precision (32-bit) floating-point element in a, store the
-// result in the lower element of dst, and copy the upper 3 packed elements from
-// a to the upper elements of dst.
-//
-// dst[31:0] := a[31:0] - b[31:0]
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss
-FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_sub_ps(a, b));
-}
-
-// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a,
-// and store the results in dst.
-// r0 := a0 - b0
-// r1 := a1 - b1
-FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s64(
- vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or
-// unsigned 32-bit integers of a.
-//
-// r0 := a0 - b0
-// r1 := a1 - b1
-// r2 := a2 - b2
-// r3 := a3 - b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
-//
-// dst[63:0] := a[63:0] - b[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
-FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s64(
- vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
-// integers of a and saturates.
-// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
-// integers of a and saturates.
-//
-// r0 := UnsignedSaturate(a0 - b0)
-// r1 := UnsignedSaturate(a1 - b1)
-// ...
-// r15 := UnsignedSaturate(a15 - b15)
-//
-// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers
-// of a and saturates.
-//
-// r0 := SignedSaturate(a0 - b0)
-// r1 := SignedSaturate(a1 - b1)
-// ...
-// r15 := SignedSaturate(a15 - b15)
-//
-// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers
-// of a and saturates.
-//
-// r0 := SignedSaturate(a0 - b0)
-// r1 := SignedSaturate(a1 - b1)
-// ...
-// r7 := SignedSaturate(a7 - b7)
-//
-// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
-FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed
-// 8-bit integer in b is negative, and store the results in dst.
-// Elements in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-// for i in 0..15
-// if b[i] < 0
-// r[i] := -a[i]
-// else if b[i] == 0
-// r[i] := 0
-// else
-// r[i] := a[i]
-// fi
-// done
-FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
-{
- int8x16_t a = vreinterpretq_s8_m128i(_a);
- int8x16_t b = vreinterpretq_s8_m128i(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFF : 0
- uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
-
- // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
- int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
-#else
- int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
-    // 'a') based on ltMask
- int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
- // res = masked & (~zeroMask)
- int8x16_t res = vbicq_s8(masked, zeroMask);
-
- return vreinterpretq_m128i_s8(res);
-}
-
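-// Illustrative example (hypothetical values): with a = {5, 5, 5, ...} and
-// b = {-1, 0, 1, ...}, the first result byte is -5 (b < 0), the second is 0
-// (b == 0) and the third is 5 (b > 0); the shift and compare masks above
-// implement this three-way select without branches.
-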
-// Negate packed 16-bit integers in a when the corresponding signed
-// 16-bit integer in b is negative, and store the results in dst.
-// Elements in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-// for i in 0..7
-// if b[i] < 0
-// r[i] := -a[i]
-// else if b[i] == 0
-// r[i] := 0
-// else
-// r[i] := a[i]
-// fi
-// done
-FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
-{
- int16x8_t a = vreinterpretq_s16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFFFF : 0
- uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
- // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
- int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
-#else
- int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s16(a) equals negative
- // 'a') based on ltMask
- int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
- // res = masked & (~zeroMask)
- int16x8_t res = vbicq_s16(masked, zeroMask);
- return vreinterpretq_m128i_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed
-// 32-bit integer in b is negative, and store the results in dst.
-// Elements in dst are zeroed out when the corresponding element
-// in b is zero.
-//
-// for i in 0..3
-// if b[i] < 0
-// r[i] := -a[i]
-// else if b[i] == 0
-// r[i] := 0
-// else
-// r[i] := a[i]
-// fi
-// done
-FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
-{
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFFFFFFFF : 0
- uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
-
- // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
- int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
-#else
- int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vnegq_s32(a) equals negative
- // 'a') based on ltMask
- int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
- // res = masked & (~zeroMask)
- int32x4_t res = vbicq_s32(masked, zeroMask);
- return vreinterpretq_m128i_s32(res);
-}
-
-// Negate packed 16-bit integers in a when the corresponding signed 16-bit
-// integer in b is negative, and store the results in dst. Elements in dst are
-// zeroed out when the corresponding element in b is zero.
-//
-// FOR j := 0 to 3
-// i := j*16
-// IF b[i+15:i] < 0
-// dst[i+15:i] := -(a[i+15:i])
-// ELSE IF b[i+15:i] == 0
-// dst[i+15:i] := 0
-// ELSE
-// dst[i+15:i] := a[i+15:i]
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
-FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
-{
- int16x4_t a = vreinterpret_s16_m64(_a);
- int16x4_t b = vreinterpret_s16_m64(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFFFF : 0
- uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
-
- // (b == 0) ? 0xFFFF : 0
-#if defined(__aarch64__)
- int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
-#else
- int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
-    // 'a') based on ltMask
- int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
- // res = masked & (~zeroMask)
- int16x4_t res = vbic_s16(masked, zeroMask);
-
- return vreinterpret_m64_s16(res);
-}
-
-// Negate packed 32-bit integers in a when the corresponding signed 32-bit
-// integer in b is negative, and store the results in dst. Elements in dst are
-// zeroed out when the corresponding element in b is zero.
-//
-// FOR j := 0 to 1
-// i := j*32
-// IF b[i+31:i] < 0
-// dst[i+31:i] := -(a[i+31:i])
-// ELSE IF b[i+31:i] == 0
-// dst[i+31:i] := 0
-// ELSE
-// dst[i+31:i] := a[i+31:i]
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
-FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
-{
- int32x2_t a = vreinterpret_s32_m64(_a);
- int32x2_t b = vreinterpret_s32_m64(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFFFFFFFF : 0
- uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
-
- // (b == 0) ? 0xFFFFFFFF : 0
-#if defined(__aarch64__)
- int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
-#else
- int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
-    // 'a') based on ltMask
- int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
- // res = masked & (~zeroMask)
- int32x2_t res = vbic_s32(masked, zeroMask);
-
- return vreinterpret_m64_s32(res);
-}
-
-// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
-// in b is negative, and store the results in dst. Elements in dst are zeroed out
-// when the corresponding element in b is zero.
-//
-// FOR j := 0 to 7
-// i := j*8
-// IF b[i+7:i] < 0
-// dst[i+7:i] := -(a[i+7:i])
-// ELSE IF b[i+7:i] == 0
-// dst[i+7:i] := 0
-// ELSE
-// dst[i+7:i] := a[i+7:i]
-// FI
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
-FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
-{
- int8x8_t a = vreinterpret_s8_m64(_a);
- int8x8_t b = vreinterpret_s8_m64(_b);
-
- // signed shift right: faster than vclt
- // (b < 0) ? 0xFF : 0
- uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
-
- // (b == 0) ? 0xFF : 0
-#if defined(__aarch64__)
- int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
-#else
- int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
-#endif
-
-    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
-    // 'a') based on ltMask
- int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
- // res = masked & (~zeroMask)
- int8x8_t res = vbic_s8(masked, zeroMask);
-
- return vreinterpret_m64_s8(res);
-}
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
-FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u16(
- vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
-FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u8(
- vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Average packed unsigned 8-bit integers in a and b, and store the results in
-// dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
-#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
-
-// Average packed unsigned 16-bit integers in a and b, and store the results in
-// dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
-#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
-
-// Computes the average of the 16 unsigned 8-bit integers in a and the 16
-// unsigned 8-bit integers in b and rounds.
-//
-// r0 := (a0 + b0) / 2
-// r1 := (a1 + b1) / 2
-// ...
-// r15 := (a15 + b15) / 2
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Computes the average of the 8 unsigned 16-bit integers in a and the 8
-// unsigned 16-bit integers in b and rounds.
-//
-// r0 := (a0 + b0) / 2
-// r1 := (a1 + b1) / 2
-// ...
-// r7 := (a7 + b7) / 2
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
-{
- return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
- vreinterpretq_u16_m128i(b));
-}
-
-// Adds the four single-precision, floating-point values of a and b.
-//
-// r0 := a0 + b0
-// r1 := a1 + b1
-// r2 := a2 + b2
-// r3 := a3 + b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_f32(
- vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Add packed double-precision (64-bit) floating-point elements in a and b, and
-// store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
-FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(
- vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
- double *da = (double *) &a;
- double *db = (double *) &b;
- double c[2];
- c[0] = da[0] + db[0];
- c[1] = da[1] + db[1];
- return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Add 64-bit integers a and b, and store the result in dst.
-//
-// dst[63:0] := a[63:0] + b[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
-FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s64(
- vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
-}
-
-// adds the scalar single-precision floating point values of a and b.
-// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
-{
- float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
- float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
-    // the upper values in the result must be the remnants of a.
- return vreinterpretq_m128_f32(vaddq_f32(a, value));
-}
-
-// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
-// unsigned 64-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s64(
- vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-}
-
-// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
-// unsigned 32-bit integers in b.
-//
-// r0 := a0 + b0
-// r1 := a1 + b1
-// r2 := a2 + b2
-// r3 := a3 + b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
-// unsigned 16-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
-// unsigned 8-bit integers in b.
-// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
-FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
-// and saturates.
-//
-// r0 := SignedSaturate(a0 + b0)
-// r1 := SignedSaturate(a1 + b1)
-// ...
-// r7 := SignedSaturate(a7 + b7)
-//
-// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Add packed signed 8-bit integers in a and b using saturation, and store the
-// results in dst.
-//
-// FOR j := 0 to 15
-// i := j*8
-// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] )
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8
-FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in
-// b and saturates.
-// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or
-// unsigned 16-bit integers from b.
-//
-// r0 := (a0 * b0)[15:0]
-// r1 := (a1 * b1)[15:0]
-// ...
-// r7 := (a7 * b7)[15:0]
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or
-// unsigned 32-bit integers from b.
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// tmp[31:0] := a[i+15:i] * b[i+15:i]
-// dst[i+15:i] := tmp[31:16]
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw
-#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)
-
-// Multiplies the four single-precision, floating-point values of a and b.
-//
-// r0 := a0 * b0
-// r1 := a1 * b1
-// r2 := a2 * b2
-// r3 := a3 * b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_f32(
- vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Multiply packed double-precision (64-bit) floating-point elements in a and b,
-// and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd
-FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(
- vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
-#else
- double *da = (double *) &a;
- double *db = (double *) &b;
- double c[2];
- c[0] = da[0] * db[0];
- c[1] = da[1] * db[1];
- return vld1q_f32((float32_t *) c);
-#endif
-}
-
-// Multiply the lower single-precision (32-bit) floating-point element in a and
-// b, store the result in the lower element of dst, and copy the upper 3 packed
-// elements from a to the upper elements of dst.
-//
-// dst[31:0] := a[31:0] * b[31:0]
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
-FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_mul_ps(a, b));
-}
-
-// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
-// a and b, and store the unsigned 64-bit results in dst.
-//
-// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
-// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
-FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
-{
- // vmull_u32 upcasts instead of masking, so we downcast.
- uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
- uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
- return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
-}
-
-// Multiply the low unsigned 32-bit integers from a and b, and store the
-// unsigned 64-bit result in dst.
-//
-// dst[63:0] := a[31:0] * b[31:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
-FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u64(vget_low_u64(
- vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
-}
-
-// Multiply the low signed 32-bit integers from each packed 64-bit element in
-// a and b, and store the signed 64-bit results in dst.
-//
-// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
-// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
-FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
-{
- // vmull_s32 upcasts instead of masking, so we downcast.
- int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
- int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
- return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
-}
-
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
-// integers from b.
-//
-// r0 := (a0 * b0) + (a1 * b1)
-// r1 := (a2 * b2) + (a3 * b3)
-// r2 := (a4 * b4) + (a5 * b5)
-// r3 := (a6 * b6) + (a7 * b7)
-// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
-{
- int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
- vget_low_s16(vreinterpretq_s16_m128i(b)));
- int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
- vget_high_s16(vreinterpretq_s16_m128i(b)));
-
- int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
- int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
-
- return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
-}
-
-// Multiply packed signed 16-bit integers in a and b, producing intermediate
-// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
-// the packed 16-bit integers in dst.
-//
-// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
-// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
-// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
-// ...
-// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
-FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
-{
- // Has issues due to saturation
- // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
-
- // Multiply
- int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
- vget_low_s16(vreinterpretq_s16_m128i(b)));
- int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
- vget_high_s16(vreinterpretq_s16_m128i(b)));
-
- // Rounding narrowing shift right
- // narrow = (int16_t)((mul + 16384) >> 15);
- int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
- int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
-
- // Join together
- return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
-}
-
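-// Worked example in Q15 fixed point (illustrative): with one lane of a and b
-// both holding 16384 (0.5 in Q15), the 32-bit product is 0x10000000 and
-// (0x10000000 + 0x4000) >> 15 = 8192, i.e. 0.25 in Q15; vrshrn_n_s32
-// performs the "add 0x4000 then shift right by 15" rounding in one step.
-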
-// Vertically multiply each unsigned 8-bit integer from a with the corresponding
-// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
-// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
-// and pack the saturated results in dst.
-//
-// FOR j := 0 to 7
-// i := j*16
-// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
-// a[i+7:i]*b[i+7:i] )
-// ENDFOR
-FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
- uint8x16_t a = vreinterpretq_u8_m128i(_a);
- int8x16_t b = vreinterpretq_s8_m128i(_b);
- int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
- vmovl_s8(vget_low_s8(b)));
- int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
- vmovl_s8(vget_high_s8(b)));
- return vreinterpretq_m128i_s16(
- vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
-#else
- // This would be much simpler if x86 would choose to zero extend OR sign
- // extend, not both. This could probably be optimized better.
- uint16x8_t a = vreinterpretq_u16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-
- // Zero extend a
- int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
- int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
-
- // Sign extend by shifting left then shifting right.
- int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
- int16x8_t b_odd = vshrq_n_s16(b, 8);
-
- // multiply
- int16x8_t prod1 = vmulq_s16(a_even, b_even);
- int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
-
- // saturated add
- return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
-#endif
-}
-
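-// Illustrative example (hypothetical values): for each output lane j,
-// dst[j] = Saturate16(a[2j] * b[2j] + a[2j+1] * b[2j+1]) with a treated as
-// unsigned and b as signed, so byte pairs a = {255, 255} and b = {127, 127}
-// would sum to 64770 and saturate to 32767.
-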
-// Computes the fused multiply-add of 32-bit floating-point numbers.
-//
-// Return Value
-// Multiplies A and B, and adds C to the temporary result before returning it.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd
-FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c),
- vreinterpretq_f32_m128(b),
- vreinterpretq_f32_m128(a)));
-#else
- return _mm_add_ps(_mm_mul_ps(a, b), c);
-#endif
-}
-
-// Alternatively add and subtract packed single-precision (32-bit)
-// floating-point elements in a to/from packed elements in b, and store the
-// results in dst.
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps
-FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
-{
- __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f};
- return _mm_fmadd_ps(b, mask, a);
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce two
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of 64-bit elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8
-FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
-{
- uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
- uint16_t r0 = t[0] + t[1] + t[2] + t[3];
- uint16_t r4 = t[4] + t[5] + t[6] + t[7];
- uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0);
- return (__m128i) vsetq_lane_u16(r4, r, 4);
-}
-
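-// Worked example (illustrative): with every byte of a equal to 255 and every
-// byte of b equal to 0, each 8-byte half sums to 8 * 255 = 2040, so the
-// result holds 2040 in 16-bit lanes 0 and 4 and zeros elsewhere, matching
-// PSADBW.
-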
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
-FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
-{
- uint16x4_t t =
- vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
- uint16_t r0 = t[0] + t[1] + t[2] + t[3];
- return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
-}
-
-// Compute the absolute differences of packed unsigned 8-bit integers in a and
-// b, then horizontally sum each consecutive 8 differences to produce four
-// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
-// 16 bits of dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
-// ENDFOR
-// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
-//              tmp[47:40] + tmp[55:48] + tmp[63:56]
-// dst[63:16] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
-#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
-
-// Divides the four single-precision, floating-point values of a and b.
-//
-// r0 := a0 / b0
-// r1 := a1 / b1
-// r2 := a2 / b2
-// r3 := a3 / b3
-//
-// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
- float32x4_t recip0 = vrecpeq_f32(vreinterpretq_f32_m128(b));
- float32x4_t recip1 =
- vmulq_f32(recip0, vrecpsq_f32(recip0, vreinterpretq_f32_m128(b)));
- return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip1));
-#endif
-}
-
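-// Note on the ARMv7 path above (a sketch of the standard refinement, not a
-// statement of its exact error bound): vrecpsq_f32(x, b) computes 2 - x * b,
-// so recip1 = recip0 * (2 - recip0 * b) is one Newton-Raphson step that
-// roughly doubles the number of correct bits of the vrecpeq_f32 estimate
-// before the final multiply by a.
-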
-// Divides the scalar single-precision floating point value of a by b.
-// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
-{
- float32_t value =
- vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Compute the approximate reciprocal of packed single-precision (32-bit)
-// floating-point elements in a, and store the results in dst. The maximum
-// relative error for this approximation is less than 1.5*2^-12.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
-FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vdivq_f32(vdupq_n_f32(1.0f), vreinterpretq_f32_m128(in)));
-#else
- float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
- recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
- return vreinterpretq_m128_f32(recip);
-#endif
-}
-
-// Compute the approximate reciprocal of the lower single-precision (32-bit)
-// floating-point element in a, store the result in the lower element of dst,
-// and copy the upper 3 packed elements from a to the upper elements of dst. The
-// maximum relative error for this approximation is less than 1.5*2^-12.
-//
-// dst[31:0] := (1.0 / a[31:0])
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
-FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
-{
- return _mm_move_ss(a, _mm_rcp_ps(a));
-}
-
-// Computes the approximations of square roots of the four single-precision,
-// floating-point values of a. First computes reciprocal square roots and then
-// reciprocals of the four values.
-//
-// r0 := sqrt(a0)
-// r1 := sqrt(a1)
-// r2 := sqrt(a2)
-// r3 := sqrt(a3)
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
-#else
- float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
- float32x4_t sq = vrecpeq_f32(recipsq);
- // ??? use step versions of both sqrt and recip for better accuracy?
- return vreinterpretq_m128_f32(sq);
-#endif
-}
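-
-// One way to act on the "???" note above (a sketch, ours, not upstream):
-// refine the vrsqrteq_f32 estimate with one Newton-Raphson step before
-// multiplying back, instead of taking a second raw estimate via vrecpeq_f32
-// (the x == 0 edge case is ignored here):
-//   float32x4_t x = vreinterpretq_f32_m128(in);
-//   float32x4_t e = vrsqrteq_f32(x);                    /* ~1/sqrt(x) */
-//   e = vmulq_f32(e, vrsqrtsq_f32(vmulq_f32(x, e), e)); /* NR step    */
-//   return vreinterpretq_m128_f32(vmulq_f32(x, e));     /* x/sqrt(x)  */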
-
-// Computes the approximation of the square root of the scalar single-precision
-// floating point value of in.
-// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
-{
- float32_t value =
- vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
-}
-
-// Computes the approximations of the reciprocal square roots of the four
-// single-precision floating point values of in.
-// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
-{
- return vreinterpretq_m128_f32(vrsqrteq_f32(vreinterpretq_f32_m128(in)));
-}
-
-// Compute the approximate reciprocal square root of the lower single-precision
-// (32-bit) floating-point element in a, store the result in the lower element
-// of dst, and copy the upper 3 packed elements from a to the upper elements of
-// dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
-FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
-{
- return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
-FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s16(
- vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
-#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
-
-// Computes the maximums of the four single-precision, floating-point values of
-// a and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
- float32x4_t _a = vreinterpretq_f32_m128(a);
- float32x4_t _b = vreinterpretq_f32_m128(b);
- return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
-#else
- return vreinterpretq_m128_f32(
- vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u8(
- vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
-// values in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
-#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s16(
- vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Compare packed signed 16-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16
-#define _m_pminsw(a, b) _mm_min_pi16(a, b)
-
-// Computes the minima of the four single-precision, floating-point values of a
-// and b.
-// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
-{
-#if SSE2NEON_PRECISE_MINMAX
- float32x4_t _a = vreinterpretq_f32_m128(a);
- float32x4_t _b = vreinterpretq_f32_m128(b);
- return vbslq_f32(vcltq_f32(_a, _b), _a, _b);
-#else
- return vreinterpretq_m128_f32(
- vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#endif
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u8(
- vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
-}
-
-// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
-// values in dst.
-//
-// FOR j := 0 to 7
-// i := j*8
-// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8
-#define _m_pminub(a, b) _mm_min_pu8(a, b)
-
-// Computes the maximum of the two lower scalar single-precision floating point
-// values of a and b.
-// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
-{
- float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Computes the minimum of the two lower scalar single-precision floating point
-// values of a and b.
-// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
-{
- float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
- return vreinterpretq_m128_f32(
- vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
-}
-
-// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the
-// 16 unsigned 8-bit integers from b.
-// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
-// 16 unsigned 8-bit integers from b.
-// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
-}
-
-// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
-// signed 16-bit integers from b.
-// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
-FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
-FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Compare packed signed 8-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
-FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
-FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
-}
-
-// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
-// signed 16-bit integers from b.
-// https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// epi versions of min/max
-// Computes the pairwise maxima of the four signed 32-bit integer values of a
-// and b.
-//
-// A 128-bit parameter that can be defined with the following equations:
-// r0 := (a0 > b0) ? a0 : b0
-// r1 := (a1 > b1) ? a1 : b1
-// r2 := (a2 > b2) ? a2 : b2
-// r3 := (a3 > b3) ? a3 : b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Computes the pairwise minima of the four signed 32-bit integer values of a
-// and b.
-//
-// A 128-bit parameter that can be defined with the following equations:
-// r0 := (a0 < b0) ? a0 : b0
-// r1 := (a1 < b1) ? a1 : b1
-// r2 := (a2 < b2) ? a2 : b2
-// r3 := (a3 < b3) ? a3 : b3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s32(
- vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
-FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u32(
- vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
-// values in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
-FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u32(
- vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
-}
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
-FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
-{
- return vreinterpret_m64_u16(vshrn_n_u32(
- vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
-}
-
-// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
-// integers from b.
-//
-// r0 := (a0 * b0)[31:16]
-// r1 := (a1 * b1)[31:16]
-// ...
-// r7 := (a7 * b7)[31:16]
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
-{
- /* FIXME: issue with large values because of result saturation */
- // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
- // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
- // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
- int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
- int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
- int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
- int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
- int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
- int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
- uint16x8x2_t r =
- vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
- return vreinterpretq_m128i_u16(r.val[1]);
-}
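-
-// Worked example (illustrative, ours): 16384 * 16384 = 0x10000000, whose
-// high 16 bits are 0x1000, so lanes holding 0x4000 map to 0x1000. The
-// vuzpq_u16 above de-interleaves the 32-bit products so that r.val[1]
-// collects exactly those high 16-bit halves.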
-
-// Multiply the packed unsigned 16-bit integers in a and b, producing
-// intermediate 32-bit integers, and store the high 16 bits of the intermediate
-// integers in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
-FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
-{
- uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
- uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
- uint32x4_t ab3210 = vmull_u16(a3210, b3210);
-#if defined(__aarch64__)
- uint32x4_t ab7654 =
- vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
- uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
- vreinterpretq_u16_u32(ab7654));
- return vreinterpretq_m128i_u16(r);
-#else
- uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
- uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
- uint32x4_t ab7654 = vmull_u16(a7654, b7654);
- uint16x8x2_t r =
- vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
- return vreinterpretq_m128i_u16(r.val[1]);
-#endif
-}
-
-// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
-// elements in a and b, and pack the results in dst.
-// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
-FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
- float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
- float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
- float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
- float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_f32(
- vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
-#endif
-}
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
-{
- int16x8_t a = vreinterpretq_s16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
-#else
- return vreinterpretq_m128i_s16(
- vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
- vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of single-precision (32-bit)
-// floating-point elements in a and b, and pack the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
-FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(vsubq_f32(
- vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
- vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
-#else
- float32x4x2_t c =
- vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
- return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
-#endif
-}
-
-// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
-// signed 16-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
-FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s16(
- vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
-FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
-{
- return vreinterpret_m64_s32(
- vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
-}
-
-// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and
-// pack the signed 16-bit results in dst.
-FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
-{
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
- // Interleave using vshrn/vmovn
- // [a0|a2|a4|a6|b0|b2|b4|b6]
- // [a1|a3|a5|a7|b1|b3|b5|b7]
- int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
- int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
- // Subtract
- return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
-}
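-
-// How the split above works (illustrative, ours): vmovn_s32 narrows each
-// 32-bit lane to its low 16 bits, i.e. the even-indexed elements 0,2,4,6 of
-// the original 16-bit vectors, while vshrn_n_s32(x, 16) keeps the high 16
-// bits, i.e. the odd-indexed elements 1,3,5,7. Subtracting ab1357 from
-// ab0246 therefore yields a0-a1, a2-a3, ..., b6-b7, as PHSUBW requires.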
-
-// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
-// saturation, and pack the signed 16-bit results in dst.
-FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
- int16x8_t a = vreinterpretq_s16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_m128i_s16(
-        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
- // Interleave using vshrn/vmovn
- // [a0|a2|a4|a6|b0|b2|b4|b6]
- // [a1|a3|a5|a7|b1|b3|b5|b7]
- int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
- int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
- // Saturated add
- return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
-#endif
-}
-
-// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
-// using saturation, and pack the signed 16-bit results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
-FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
-{
-#if defined(__aarch64__)
- int16x8_t a = vreinterpretq_s16_m128i(_a);
- int16x8_t b = vreinterpretq_s16_m128i(_b);
-    return vreinterpretq_m128i_s16(
-        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
-#else
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
- // Interleave using vshrn/vmovn
- // [a0|a2|a4|a6|b0|b2|b4|b6]
- // [a1|a3|a5|a7|b1|b3|b5|b7]
- int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
- int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
- // Saturated subtract
- return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
-#endif
-}
-
-// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
-// signed 32-bit results in dst.
-FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
-{
- int32x4_t a = vreinterpretq_s32_m128i(_a);
- int32x4_t b = vreinterpretq_s32_m128i(_b);
- return vreinterpretq_m128i_s32(
- vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
- vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
-}
-
-// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and
-// pack the signed 32-bit results in dst.
-FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
-{
- int64x2_t a = vreinterpretq_s64_m128i(_a);
- int64x2_t b = vreinterpretq_s64_m128i(_b);
- // Interleave using vshrn/vmovn
- // [a0|a2|b0|b2]
-// [a1|a3|b1|b3]
- int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
- int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
- // Subtract
- return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
-}
-
-// Kahan summation for accurate summation of floating-point numbers.
-// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
-FORCE_INLINE void sse2neon_kadd_f32(float *sum, float *c, float y)
-{
- y -= *c;
- float t = *sum + y;
- *c = (t - *sum) - y;
- *sum = t;
-}
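-
-// Illustrative sketch (ours, not upstream sse2neon): accumulating an array
-// with sse2neon_kadd_f32 keeps the compensation term alongside the running
-// sum, which bounds the rounding error independently of n; the final fold of
-// the compensation mirrors the s += c step _mm_dp_ps below performs.
-FORCE_INLINE float sse2neon_demo_kahan_sum(const float *p, int n)
-{
-    float sum = 0.0f, comp = 0.0f; /* comp carries the lost low-order bits */
-    for (int i = 0; i < n; i++)
-        sse2neon_kadd_f32(&sum, &comp, p[i]);
-    return sum + comp;
-}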
-
-// Conditionally multiply the packed single-precision (32-bit) floating-point
-// elements in a and b using the high 4 bits of imm8, sum the four products,
-// and conditionally store the sum in dst using the low 4 bits of imm8.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
-FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
-{
-#if defined(__aarch64__)
- /* shortcuts */
- if (imm == 0xFF) {
- return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
- }
- if (imm == 0x7F) {
- float32x4_t m = _mm_mul_ps(a, b);
- m[3] = 0;
- return _mm_set1_ps(vaddvq_f32(m));
- }
-#endif
-
- float s = 0, c = 0;
- float32x4_t f32a = vreinterpretq_f32_m128(a);
- float32x4_t f32b = vreinterpretq_f32_m128(b);
-
- /* To improve the accuracy of floating-point summation, Kahan algorithm
- * is used for each operation.
- */
- if (imm & (1 << 4))
- sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
- if (imm & (1 << 5))
- sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
- if (imm & (1 << 6))
- sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
- if (imm & (1 << 7))
- sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
- s += c;
-
- float32x4_t res = {
- (imm & 0x1) ? s : 0,
- (imm & 0x2) ? s : 0,
- (imm & 0x4) ? s : 0,
- (imm & 0x8) ? s : 0,
- };
- return vreinterpretq_m128_f32(res);
-}
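-
-// Worked example (illustrative, ours): imm = 0x71 has bits 4..6 set, so only
-// the lane products a0*b0, a1*b1 and a2*b2 are summed, and bit 0 set, so the
-// sum lands in lane 0 only; with a = b = {1, 2, 3, 4} the result is
-// {1 + 4 + 9, 0, 0, 0} = {14, 0, 0, 0}.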
-
-/* Compare operations */
-
-// Compares for less than
-// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(
- vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for less than
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
-FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmplt_ps(a, b));
-}
-
-// Compares for greater than.
-//
-// r0 := (a0 > b0) ? 0xffffffff : 0x0
-// r1 := (a1 > b1) ? 0xffffffff : 0x0
-// r2 := (a2 > b2) ? 0xffffffff : 0x0
-// r3 := (a3 > b3) ? 0xffffffff : 0x0
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(
- vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for greater than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
-}
-
-// Compares for greater than or equal.
-// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(
- vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for greater than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpge_ps(a, b));
-}
-
-// Compares for less than or equal.
-//
-// r0 := (a0 <= b0) ? 0xffffffff : 0x0
-// r1 := (a1 <= b1) ? 0xffffffff : 0x0
-// r2 := (a2 <= b2) ? 0xffffffff : 0x0
-// r3 := (a3 <= b3) ? 0xffffffff : 0x0
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(
- vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for less than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100)
-FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmple_ps(a, b));
-}
-
-// Compares for equality.
-// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-}
-
-// Compares for equality.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
-}
-
-// Compares for inequality.
-// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
-{
- return vreinterpretq_m128_u32(vmvnq_u32(
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
-}
-
-// Compares for inequality.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
-}
-
-// Compares for not greater than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
-{
- return _mm_cmplt_ps(a, b);
-}
-
-// Compares for not greater than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
-{
- return _mm_cmplt_ss(a, b);
-}
-
-// Compares for not greater than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
-{
- return _mm_cmple_ps(a, b);
-}
-
-// Compares for not greater than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
-{
- return _mm_cmple_ss(a, b);
-}
-
-// Compares for not less than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
-{
- return _mm_cmpgt_ps(a, b);
-}
-
-// Compares for not less than or equal.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
-{
- return _mm_cmpgt_ss(a, b);
-}
-
-// Compares for not less than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
-{
- return _mm_cmpge_ps(a, b);
-}
-
-// Compares for not less than.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
-{
- return _mm_cmpge_ss(a, b);
-}
-
-// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or
-// unsigned 8-bit integers in b for equality.
-// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or
-// unsigned 16-bit integers in b for equality.
-// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compare packed 32-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u32(
- vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compare packed 64-bit integers in a and b for equality, and store the results
-// in dst
-FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_u64(
- vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
-#else
- // ARMv7 lacks vceqq_u64
- // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
- uint32x4_t cmp =
- vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
- uint32x4_t swapped = vrev64q_u32(cmp);
- return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
-#endif
-}
-
-// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
-// in b for less than.
-// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
-// in b for greater than.
-//
-// r0 := (a0 > b0) ? 0xff : 0x0
-// r1 := (a1 > b1) ? 0xff : 0x0
-// ...
-// r15 := (a15 > b15) ? 0xff : 0x0
-//
-// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-}
-
-// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
-// in b for less than.
-//
-// r0 := (a0 < b0) ? 0xffff : 0x0
-// r1 := (a1 < b1) ? 0xffff : 0x0
-// ...
-// r7 := (a7 < b7) ? 0xffff : 0x0
-//
-// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers
-// in b for greater than.
-//
-// r0 := (a0 > b0) ? 0xffff : 0x0
-// r1 := (a1 > b1) ? 0xffff : 0x0
-// ...
-// r7 := (a7 > b7) ? 0xffff : 0x0
-//
-// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-}
-
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
-// in b for less than.
-// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u32(
- vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers
-// in b for greater than.
-// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u32(
- vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-}
-
-// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
-// in b for greater than.
-FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_u64(
- vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
-#else
- // ARMv7 lacks vcgtq_s64.
- // This is based off of Clang's SSE2 polyfill:
- // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi))
-
- // Mask the sign bit out since we need a signed AND an unsigned comparison
- // and it is ugly to try and split them.
- int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull));
- int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask);
- int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask);
- // Check if a > b
- int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask));
- // Copy upper mask to lower mask
- // a_hi > b_hi
- int64x2_t gt_hi = vshrq_n_s64(greater, 63);
- // Copy lower mask to upper mask
- // a_lo > b_lo
- int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32);
- // Compare for equality
- int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask));
- // Copy upper mask to lower mask
- // a_hi == b_hi
- int64x2_t eq_hi = vshrq_n_s64(equal, 63);
- // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi)
- int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi));
- return vreinterpretq_m128i_s64(ret);
-#endif
-}
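-
-// Why the sign-bit flip above works (illustrative, ours): XORing the low
-// 32-bit word of each lane with 0x80000000 maps the unsigned ordering that
-// word needs onto the signed ordering vcgtq_s32 provides, while equality is
-// unaffected. E.g. low words 0xFFFFFFFF vs 0x00000001 become 0x7FFFFFFF vs
-// 0x80000001, which compare correctly (greater) as signed values.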
-
-// Compares the four 32-bit floats in a and b to check if any values are NaN.
-// Ordered compare between each value returns true for "orderable" and false for
-// "not orderable" (NaN).
-// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see
-// also:
-// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
-// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
-FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
-{
- // Note: NEON does not have ordered compare builtin
- // Need to compare a eq a and b eq b to check for NaN
- // Do AND of results to get final
- uint32x4_t ceqaa =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t ceqbb =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
-}
-
-// Compares for ordered.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpord_ps(a, b));
-}
-
-// Compares for unordered.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
-{
- uint32x4_t f32a =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t f32b =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
-}
-
-// Compares for unordered.
-// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
-FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
-{
- return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a less than operation.
-// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx
-// Important note: the MSDN documentation is incorrect. If either value is a
-// NaN, the docs say the result is one, but it is in fact zero.
-FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
-{
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
- uint32x4_t a_lt_b =
- vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
- return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a greater than operation.
-// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
-FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
-{
- // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
- // vreinterpretq_f32_m128(b)), 0);
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
- uint32x4_t a_gt_b =
- vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
- return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a less than or equal operation.
-// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
-FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
-{
- // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
- // vreinterpretq_f32_m128(b)), 0);
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
- uint32x4_t a_le_b =
- vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
- return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using a greater than or equal operation.
-// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
-FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
-{
- // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
- // vreinterpretq_f32_m128(b)), 0);
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
- uint32x4_t a_ge_b =
- vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
- return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using an equality operation.
-// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
-FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
-{
- // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
- // vreinterpretq_f32_m128(b)), 0);
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
- uint32x4_t a_eq_b =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
- return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
-}
-
-// Compares the lower single-precision floating point scalar values of a and b
-// using an inequality operation.
-// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
-FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
-{
- // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
- // vreinterpretq_f32_m128(b)), 0);
- uint32x4_t a_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
- uint32x4_t b_not_nan =
- vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
- uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
- uint32x4_t a_neq_b = vmvnq_u32(
- vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
- return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
-}
-
-// According to the documentation, these intrinsics behave the same as the
-// non-'u' versions, so we simply alias them here.
-#define _mm_ucomilt_ss _mm_comilt_ss
-#define _mm_ucomile_ss _mm_comile_ss
-#define _mm_ucomigt_ss _mm_comigt_ss
-#define _mm_ucomige_ss _mm_comige_ss
-#define _mm_ucomieq_ss _mm_comieq_ss
-#define _mm_ucomineq_ss _mm_comineq_ss
-
-/* Conversions */
-
-// Convert packed signed 32-bit integers in b to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, and copy the upper 2 packed elements from a to the upper elements of
-// dst.
-//
-// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
-// dst[95:64] := a[95:64]
-// dst[127:96] := a[127:96]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
-FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
-{
- return vreinterpretq_m128_f32(
- vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
- vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-//
-// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
-FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
-{
- return vreinterpretq_m128_f32(
- vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Convert the signed 32-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-//
-// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
-#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
-
-// Convert the signed 64-bit integer b to a single-precision (32-bit)
-// floating-point element, store the result in the lower element of dst, and
-// copy the upper 3 packed elements from a to the upper elements of dst.
-//
-// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
-// dst[127:32] := a[127:32]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
-FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
-{
- return vreinterpretq_m128_f32(
- vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
-}
-
-// Convert the lower single-precision (32-bit) floating-point element in a to a
-// 32-bit integer, and store the result in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
-FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
-{
-#if defined(__aarch64__)
- return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
-#else
- float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- float32_t diff = data - floor(data);
- if (diff > 0.5)
- return (int32_t) ceil(data);
- if (diff == 0.5) {
- int32_t f = (int32_t) floor(data);
- int32_t c = (int32_t) ceil(data);
- return c & 1 ? f : c;
- }
- return (int32_t) floor(data);
-#endif
-}
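-
-// Worked example (illustrative, ours): the fallback implements the SSE
-// default round-half-to-even. For 2.5, diff == 0.5 and ceil == 3 is odd, so
-// floor == 2 is returned; for 3.5, ceil == 4 is even and is returned; for
-// 2.3, diff < 0.5 and floor == 2 is returned.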
-
-// Convert packed 16-bit integers in a to packed single-precision (32-bit)
-// floating-point elements, and store the results in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// m := j*32
-// dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
-FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
-{
- return vreinterpretq_m128_f32(
- vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
-}
-
-// Convert packed 32-bit integers in b to packed single-precision (32-bit)
-// floating-point elements, store the results in the lower 2 elements of dst,
-// and copy the upper 2 packed elements from a to the upper elements of dst.
-//
-// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
-// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
-// dst[95:64] := a[95:64]
-// dst[127:96] := a[127:96]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
-FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
-{
- return vreinterpretq_m128_f32(
- vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
- vget_high_f32(vreinterpretq_f32_m128(a))));
-}
-
-// Convert packed signed 32-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, store the results in the lower 2 elements
-// of dst, then convert the packed signed 32-bit integers in b to
-// single-precision (32-bit) floating-point elements, and store the results in
-// the upper 2 elements of dst.
-//
-// dst[31:0] := Convert_Int32_To_FP32(a[31:0])
-// dst[63:32] := Convert_Int32_To_FP32(a[63:32])
-// dst[95:64] := Convert_Int32_To_FP32(b[31:0])
-// dst[127:96] := Convert_Int32_To_FP32(b[63:32])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps
-FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
-{
- return vreinterpretq_m128_f32(vcvtq_f32_s32(
- vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
-}
-
-// Convert the lower packed 8-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-//
-// FOR j := 0 to 3
-// i := j*8
-// m := j*32
-// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps
-FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
-{
- return vreinterpretq_m128_f32(vcvtq_f32_s32(
- vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
-}
-
-// Convert packed unsigned 16-bit integers in a to packed single-precision
-// (32-bit) floating-point elements, and store the results in dst.
-//
-// FOR j := 0 to 3
-// i := j*16
-// m := j*32
-// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps
-FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
-{
- return vreinterpretq_m128_f32(
- vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
-}
-
-// Convert the lower packed unsigned 8-bit integers in a to packed
-// single-precision (32-bit) floating-point elements, and store the results in
-// dst.
-//
-// FOR j := 0 to 3
-// i := j*8
-// m := j*32
-// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps
-FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
-{
- return vreinterpretq_m128_f32(vcvtq_f32_u32(
- vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
-}
-
-// Converts the four single-precision, floating-point values of a to signed
-// 32-bit integer values using truncation.
-// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
-{
- return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-//
-// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64
-FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
-{
-#if defined(__aarch64__)
- return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
-#else
- double ret = *((double *) &a);
- return (int64_t) ret;
-#endif
-}
-
-// Convert the lower double-precision (64-bit) floating-point element in a to a
-// 64-bit integer with truncation, and store the result in dst.
-//
-// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
-#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
-
-// Converts the four signed 32-bit integer values of a to single-precision,
-// floating-point values
-// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
-FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
-{
- return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
-}
-
-// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
-// unsigned 16-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
-{
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
- uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
- return vreinterpretq_m128i_u16(u16x8);
-}
-
-// Converts the four unsigned 8-bit integers in the lower 32 bits to four
-// unsigned 32-bit integers.
-// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
-FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
-{
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
- uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
- uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
- return vreinterpretq_m128i_u32(u32x4);
-}
-
-// Converts the two unsigned 8-bit integers in the lower 16 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
-{
- uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
- uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
- uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
- uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
- return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Converts the eight signed 8-bit integers in the lower 64 bits to eight
-// signed 16-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
-{
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
- int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
- return vreinterpretq_m128i_s16(s16x8);
-}
-
-// Converts the four signed 8-bit integers in the lower 32 bits to four
-// signed 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
-{
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
- int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
- int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
- return vreinterpretq_m128i_s32(s32x4);
-}
-
-// Converts the two signed 8-bit integers in the lower 16 bits to two
-// signed 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
-{
- int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
- int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
- int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
- int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
- return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Converts the four signed 16-bit integers in the lower 64 bits to four signed
-// 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
-{
- return vreinterpretq_m128i_s32(
- vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
-}
-
-// Converts the two signed 16-bit integers in the lower 32 bits to two signed
-// 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
-{
- int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
- int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
- int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
- return vreinterpretq_m128i_s64(s64x2);
-}
-
-// Converts the four unsigned 16-bit integers in the lower 64 bits to four
-// unsigned 32-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
-{
- return vreinterpretq_m128i_u32(
- vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
-}
-
-// Converts the two unsigned 16-bit integers in the lower 32 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
-{
- uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
- uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
- uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
- return vreinterpretq_m128i_u64(u64x2);
-}
-
-// Converts the two unsigned 32-bit integers in the lower 64 bits to two
-// unsigned 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
-{
- return vreinterpretq_m128i_u64(
- vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
-}
-
-// Converts the two signed 32-bit integers in the lower 64 bits to two signed
-// 64-bit integers.
-FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
-{
- return vreinterpretq_m128i_s64(
- vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
-}
-
-// Converts the four single-precision, floating-point values of a to signed
-// 32-bit integer values.
-//
-// r0 := (int) a0
-// r1 := (int) a1
-// r2 := (int) a2
-// r3 := (int) a3
-//
-// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
-// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
-// does not support! It is supported on ARMv8-A however.
-FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
-#else
- uint32x4_t signmask = vdupq_n_u32(0x80000000);
- float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
- vdupq_n_f32(0.5f)); /* +/- 0.5 */
- int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
- vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
- int32x4_t r_trunc =
- vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
- int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
- vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
- int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
- vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
- float32x4_t delta = vsubq_f32(
- vreinterpretq_f32_m128(a),
- vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
- uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
- return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal));
-#endif
-}
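-
-// Worked trace (illustrative, ours) of the ARMv7 branch for a lane of 2.5:
-// r_normal = [2.5 + 0.5] = 3, r_trunc = 2, plusone = 1, so
-// r_even = (2 + 1) & ~1 = 2; delta == 0.5 selects r_even, giving the
-// round-to-even result 2. A lane of 2.3 has delta != 0.5 and keeps
-// r_normal = [2.3 + 0.5] = 2.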
-
-// Copy the lower 32-bit integer in a to dst.
-//
-// dst[31:0] := a[31:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32
-FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
-{
- return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-//
-// dst[63:0] := a[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64
-FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
-{
- return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
-}
-
-// Copy the lower 64-bit integer in a to dst.
-//
-// dst[63:0] := a[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x
-#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
-
-// Moves 32-bit integer a to the least significant 32 bits of an __m128i
-// object, zero extending the upper bits.
-//
-// r0 := a
-// r1 := 0x0
-// r2 := 0x0
-// r3 := 0x0
-//
-// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
-{
- return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
-}
-
-// Moves 64-bit integer a to the least significant 64 bits of an __m128i
-// object, zero extending the upper bits.
-//
-// r0 := a
-// r1 := 0x0
-FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
-{
- return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
-}
-
-// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd
-FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
-{
- return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
-}
-
-// Applies a type cast to reinterpret four 32-bit floating point values passed
-// in as a 128-bit parameter as packed 32-bit integers.
-// https://msdn.microsoft.com/en-us/library/bb514099.aspx
-FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
-{
- return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
-}
-
-// Applies a type cast to reinterpret four 32-bit integers passed in as a
-// 128-bit parameter as packed 32-bit floating point values.
-// https://msdn.microsoft.com/en-us/library/bb514029.aspx
-FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
-{
- return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
-}
-
-// Loads a 128-bit value.
-// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
-FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
-{
- return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
-FORCE_INLINE __m128d _mm_load1_pd(const double *p)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
-#else
- return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into the
-// upper element of dst, and copy the lower element from a to dst. mem_addr does
-// not need to be aligned on any particular boundary.
-//
-// dst[63:0] := a[63:0]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
-FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(
- vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
-#else
- return vreinterpretq_m128d_f32(vcombine_f32(
- vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
-#endif
-}
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
-#define _mm_load_pd1 _mm_load1_pd
-
-// Load a double-precision (64-bit) floating-point element from memory into both
-// elements of dst.
-//
-// dst[63:0] := MEM[mem_addr+63:mem_addr]
-// dst[127:64] := MEM[mem_addr+63:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
-#define _mm_loaddup_pd _mm_load1_pd
-
-// Loads 128-bit value.
-// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
-FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
-{
- return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
-}
-
-// Load unaligned 32-bit integer from memory into the first element of dst.
-//
-// dst[31:0] := MEM[mem_addr+31:mem_addr]
-// dst[MAX:32] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
-FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
-{
- return vreinterpretq_m128i_s32(
- vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
-}
-
-// Convert packed double-precision (64-bit) floating-point elements in a to
-// packed single-precision (32-bit) floating-point elements, and store the
-// results in dst.
-//
-// FOR j := 0 to 1
-// i := 32*j
-// k := 64*j
-//   dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
-// ENDFOR
-// dst[127:64] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
-FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
-{
-#if defined(__aarch64__)
- float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
- return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
-#else
- float a0 = (float) ((double *) &a)[0];
- float a1 = (float) ((double *) &a)[1];
- return _mm_set_ps(0, 0, a1, a0);
-#endif
-}
-
-// Copy the lower double-precision (64-bit) floating-point element of a to dst.
-//
-// dst[63:0] := a[63:0]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
-FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
-{
-#if defined(__aarch64__)
- return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
-#else
- return ((double *) &a)[0];
-#endif
-}
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed double-precision (64-bit) floating-point elements, and store the
-// results in dst.
-//
-// FOR j := 0 to 1
-// i := 64*j
-// k := 32*j
-// dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
-FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128d_f64(
- vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
-#else
- double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
- double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
- return _mm_set_pd(a1, a0);
-#endif
-}
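
A quick usage sketch of the conversion trio above (hypothetical helper; the
intrinsics come from this header on ARM, or <emmintrin.h> on x86):

static void cvt_demo(void)
{
    __m128 s = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes 0..3 = 1,2,3,4 */
    __m128d d = _mm_cvtps_pd(s);    /* widens the two low lanes: {1.0, 2.0} */
    __m128 back = _mm_cvtpd_ps(d);  /* narrows again: {1, 2, 0, 0} */
    double lo = _mm_cvtsd_f64(d);   /* lo == 1.0 */
    (void) back; (void) lo;
}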
-
-// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
-FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
-{
- return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Cast vector of type __m128d to type __m128. This intrinsic is only used for
-// compilation and does not generate any instructions, thus it has zero latency.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
-FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
-{
- return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
-}
-
-// Blend packed single-precision (32-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps
-FORCE_INLINE __m128 _mm_blendv_ps(__m128 a, __m128 b, __m128 mask)
-{
- return vreinterpretq_m128_f32(vbslq_f32(vreinterpretq_u32_m128(mask),
- vreinterpretq_f32_m128(b),
- vreinterpretq_f32_m128(a)));
-}
-
-// Blend packed double-precision (64-bit) floating-point elements from a and b
-// using mask, and store the results in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd
-FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
-{
- uint64x2_t mask =
- vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
-#if defined(__aarch64__)
- float64x2_t a = vreinterpretq_f64_m128d(_a);
- float64x2_t b = vreinterpretq_f64_m128d(_b);
- return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
-#else
- uint64x2_t a = vreinterpretq_u64_m128d(_a);
- uint64x2_t b = vreinterpretq_u64_m128d(_b);
- return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
-#endif
-}
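
Note that x86 blendv keys off only the sign bit of each mask lane, whereas the
plain bit-select used for _mm_blendv_ps above assumes all bits in a mask lane
agree -- which holds for masks produced by compares. A usage sketch
(hypothetical helper):

static void blendv_demo(void)
{
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    __m128 b = _mm_set_ps(40.0f, 30.0f, 20.0f, 10.0f);
    __m128 mask = _mm_cmpgt_ps(a, _mm_set1_ps(2.0f)); /* all-ones where a > 2 */
    __m128 r = _mm_blendv_ps(a, b, mask);             /* {1, 2, 30, 40} */
    (void) r;
}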
-
-// Round the packed single-precision (32-bit) floating-point elements in a using
-// the rounding parameter, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
-FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
-{
-#if defined(__aarch64__)
- switch (rounding) {
- case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
- return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
- case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
- return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
- case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
- return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
- case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
- return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
- default: //_MM_FROUND_CUR_DIRECTION
- return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
- }
-#else
- float *v_float = (float *) &a;
- __m128 zero, neg_inf, pos_inf;
-
- switch (rounding) {
- case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
- return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
- case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
- return (__m128){floorf(v_float[0]), floorf(v_float[1]),
- floorf(v_float[2]), floorf(v_float[3])};
- case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
- return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]),
- ceilf(v_float[3])};
- case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
- zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
- neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]),
- floorf(v_float[2]), floorf(v_float[3]));
- pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]),
- ceilf(v_float[2]), ceilf(v_float[3]));
- return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero));
- default: //_MM_FROUND_CUR_DIRECTION
- return (__m128){roundf(v_float[0]), roundf(v_float[1]),
- roundf(v_float[2]), roundf(v_float[3])};
- }
-#endif
-}
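
A sketch of the explicit rounding modes (hypothetical helper); note that the
nearest-int mode rounds ties to even:

static void round_demo(void)
{
    __m128 v = _mm_set_ps(-1.5f, 2.5f, -0.5f, 0.5f);
    __m128 r = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    /* Ties to even: 0.5 -> 0, -0.5 -> -0, 2.5 -> 2, -1.5 -> -2 */
    (void) r;
}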
-
-// Convert packed single-precision (32-bit) floating-point elements in a to
-// packed 32-bit integers, and store the results in dst.
-//
-// FOR j := 0 to 1
-// i := 32*j
-// dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
-// ENDFOR
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
-FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
-{
-#if defined(__aarch64__)
- return vreinterpret_m64_s32(
- vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
-#else
- return vreinterpret_m64_s32(
- vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
- _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
-#endif
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a up to
-// an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
-FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
-{
- return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
-}
-
-// Round the packed single-precision (32-bit) floating-point elements in a down
-// to an integer value, and store the results as packed single-precision
-// floating-point elements in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
-FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
-{
- return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
-}
-
-
-// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
-// may perform better than _mm_loadu_si128 when the data crosses a cache line
-// boundary.
-//
-// dst[127:0] := MEM[mem_addr+127:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
-#define _mm_lddqu_si128 _mm_loadu_si128
-
-/* Miscellaneous Operations */
-
-// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-// r0 := a0 >> count
-// r1 := a1 >> count
-// ...
-// r7 := a7 >> count
-//
-// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
-FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
-{
- int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
- if (c > 15)
- return _mm_cmplt_epi16(a, _mm_setzero_si128());
- return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
-}
-
-// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
-// in the sign bit.
-//
-// r0 := a0 >> count
-// r1 := a1 >> count
-// r2 := a2 >> count
-// r3 := a3 >> count
-//
-// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
-FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
-{
- int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
- if (c > 31)
- return _mm_cmplt_epi32(a, _mm_setzero_si128());
- return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
-}
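
Shift-count semantics in miniature (hypothetical helper):

static void sra_demo(void)
{
    __m128i v = _mm_set_epi32(-8, 8, -1, 1);
    __m128i r = _mm_sra_epi32(v, _mm_cvtsi32_si128(2));
    /* Per lane: 1 >> 2 == 0, -1 >> 2 == -1, 8 >> 2 == 2, -8 >> 2 == -2.
     * Counts above 31 flood each lane with its sign bit, as on x86. */
    (void) r;
}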
-
-// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
-// saturates.
-// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s8(
- vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
- vqmovn_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
-// integers and saturates.
-//
-// r0 := UnsignedSaturate(a0)
-// r1 := UnsignedSaturate(a1)
-// ...
-// r7 := UnsignedSaturate(a7)
-// r8 := UnsignedSaturate(b0)
-// r9 := UnsignedSaturate(b1)
-// ...
-// r15 := UnsignedSaturate(b7)
-//
-// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
- vqmovun_s16(vreinterpretq_s16_m128i(b))));
-}
-
-// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
-// and saturates.
-//
-// r0 := SignedSaturate(a0)
-// r1 := SignedSaturate(a1)
-// r2 := SignedSaturate(a2)
-// r3 := SignedSaturate(a3)
-// r4 := SignedSaturate(b0)
-// r5 := SignedSaturate(b1)
-// r6 := SignedSaturate(b2)
-// r7 := SignedSaturate(b3)
-//
-// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_s16(
- vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
- vqmovn_s32(vreinterpretq_s32_m128i(b))));
-}
-
-// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
-// integers and saturates.
-//
-// r0 := UnsignedSaturate(a0)
-// r1 := UnsignedSaturate(a1)
-// r2 := UnsignedSaturate(a2)
-// r3 := UnsignedSaturate(a3)
-// r4 := UnsignedSaturate(b0)
-// r5 := UnsignedSaturate(b1)
-// r6 := UnsignedSaturate(b2)
-// r7 := UnsignedSaturate(b3)
-FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u16(
- vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
- vqmovun_s32(vreinterpretq_s32_m128i(b))));
-}
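
Both pack flavours clamp each 32-bit lane before narrowing; a saturation
sketch (hypothetical helper):

static void pack_demo(void)
{
    __m128i v = _mm_set_epi32(70000, -5, 40000, 123);
    __m128i u = _mm_packus_epi32(v, v); /* u16: 123, 40000, 0, 65535, ... */
    __m128i s = _mm_packs_epi32(v, v);  /* s16: 123, 32767, -5, 32767, ... */
    (void) u; (void) s;
}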
-
-// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower
-// 8 signed or unsigned 8-bit integers in b.
-//
-// r0 := a0
-// r1 := b0
-// r2 := a1
-// r3 := b1
-// ...
-// r14 := a7
-// r15 := b7
-//
-// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s8(
- vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
- int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
- int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
- int8x8x2_t result = vzip_s8(a1, b1);
- return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
-// lower 4 signed or unsigned 16-bit integers in b.
-//
-// r0 := a0
-// r1 := b0
-// r2 := a1
-// r3 := b1
-// r4 := a2
-// r5 := b2
-// r6 := a3
-// r7 := b3
-//
-// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s16(
- vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
- int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
- int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
- int16x4x2_t result = vzip_s16(a1, b1);
- return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
-// lower 2 signed or unsigned 32-bit integers in b.
-//
-// r0 := a0
-// r1 := b0
-// r2 := a1
-// r3 := b1
-//
-// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s32(
- vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
- int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
- int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
- int32x2x2_t result = vzip_s32(a1, b1);
- return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
-{
- int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
- int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
- return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
-}
-
-// Selects and interleaves the lower two single-precision, floating-point values
-// from a and b.
-//
-// r0 := a0
-// r1 := b0
-// r2 := a1
-// r3 := b1
-//
-// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
- float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
- float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
- float32x2x2_t result = vzip_f32(a1, b1);
- return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Selects and interleaves the upper two single-precision, floating-point values
-// from a and b.
-//
-// r0 := a2
-// r1 := b2
-// r2 := a3
-// r3 := b3
-//
-// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx
-FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128_f32(
- vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
-#else
- float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
- float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
- float32x2x2_t result = vzip_f32(a1, b1);
- return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper
-// 8 signed or unsigned 8-bit integers in b.
-//
-// r0 := a8
-// r1 := b8
-// r2 := a9
-// r3 := b9
-// ...
-// r14 := a15
-// r15 := b15
-//
-// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s8(
- vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
-#else
- int8x8_t a1 =
- vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
- int8x8_t b1 =
- vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
- int8x8x2_t result = vzip_s8(a1, b1);
- return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the
-// upper 4 signed or unsigned 16-bit integers in b.
-//
-// r0 := a4
-// r1 := b4
-// r2 := a5
-// r3 := b5
-// r4 := a6
-// r5 := b6
-// r6 := a7
-// r7 := b7
-//
-// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s16(
- vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
-#else
- int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
- int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
- int16x4x2_t result = vzip_s16(a1, b1);
- return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the
-// upper 2 signed or unsigned 32-bit integers in b.
-// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx
-FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
-{
-#if defined(__aarch64__)
- return vreinterpretq_m128i_s32(
- vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
-#else
- int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
- int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
- int32x2x2_t result = vzip_s32(a1, b1);
- return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
-#endif
-}
-
-// Interleaves the upper signed or unsigned 64-bit integer in a with the
-// upper signed or unsigned 64-bit integer in b.
-//
-// r0 := a1
-// r1 := b1
-FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
-{
- int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
- int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
- return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
-}
-
-// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
-// in a, store the minimum and index in dst, and zero the remaining bits in dst.
-//
-// index[2:0] := 0
-// min[15:0] := a[15:0]
-// FOR j := 0 to 7
-// i := j*16
-// IF a[i+15:i] < min[15:0]
-// index[2:0] := j
-// min[15:0] := a[i+15:i]
-// FI
-// ENDFOR
-// dst[15:0] := min[15:0]
-// dst[18:16] := index[2:0]
-// dst[127:19] := 0
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16
-FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
-{
- __m128i dst;
- uint16_t min, idx = 0;
- // Find the minimum value
-#if defined(__aarch64__)
- min = vminvq_u16(vreinterpretq_u16_m128i(a));
-#else
- __m64 tmp;
- tmp = vreinterpret_m64_u16(
- vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
- vget_high_u16(vreinterpretq_u16_m128i(a))));
- tmp = vreinterpret_m64_u16(
- vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
- tmp = vreinterpret_m64_u16(
- vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
- min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
-#endif
- // Get the index of the minimum value
- int i;
- for (i = 0; i < 8; i++) {
- if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
- idx = (uint16_t) i;
- break;
- }
- a = _mm_srli_si128(a, 2);
- }
- // Generate result
- dst = _mm_setzero_si128();
- dst = vreinterpretq_m128i_u16(
- vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
- dst = vreinterpretq_m128i_u16(
- vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
- return dst;
-}
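
Usage sketch (hypothetical helper) -- the result packs the minimum into lane
0 and its index into lane 1:

static void minpos_demo(void)
{
    __m128i v = _mm_set_epi16(9, 8, 7, 3, 11, 12, 13, 14);
    __m128i r = _mm_minpos_epu16(v);
    int min = _mm_extract_epi16(r, 0); /* 3 */
    int idx = _mm_extract_epi16(r, 1); /* 4: the lane holding the minimum */
    (void) min; (void) idx;
}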
-
-// Concatenate a and b, then shift the 256-bit composite right by c bytes.
-// https://msdn.microsoft.com/en-us/library/bb514041(v=vs.120).aspx
-// http://blog.csdn.net/hemmingway/article/details/44828303
-// Clang requires a macro here, as it is extremely picky about c being a
-// literal.
-#define _mm_alignr_epi8(a, b, c) \
- ((__m128i) vextq_s8((int8x16_t)(b), (int8x16_t)(a), (c)))
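
alignr treats a:b as one 256-bit value and shifts it right by c bytes; a
sketch (hypothetical helper):

static void alignr_demo(void)
{
    __m128i lo = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                              7, 6, 5, 4, 3, 2, 1, 0);         /* bytes 0..15 */
    __m128i hi = _mm_set_epi8(31, 30, 29, 28, 27, 26, 25, 24,
                              23, 22, 21, 20, 19, 18, 17, 16); /* bytes 16..31 */
    __m128i r = _mm_alignr_epi8(hi, lo, 4); /* bytes 4..19 of the pair */
    (void) r;
}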
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the CF value.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128
-FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
-{
- int64x2_t s64 =
- vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))),
- vreinterpretq_s64_m128i(b));
- return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
-
-// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
-// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
-// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
-// otherwise set CF to 0. Return the ZF value.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128
-FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
-{
- int64x2_t s64 =
- vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
- return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
-}
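
testz applied to an XOR'd pair gives the usual branchable 128-bit equality
check (hypothetical helper):

static int equal128(__m128i x, __m128i y)
{
    __m128i diff = _mm_xor_si128(x, y);
    return _mm_testz_si128(diff, diff); /* 1 iff every bit of x equals y */
}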
-
-// Extracts the selected signed or unsigned 8-bit integer from a and zero
-// extends.
-// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm)
-#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
-
-// Inserts the least significant 8 bits of b into the selected 8-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
-// __constrange(0,16) int imm)
-#define _mm_insert_epi8(a, b, imm) \
- __extension__({ \
- vreinterpretq_m128i_s8( \
- vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \
- })
-
-// Extracts the selected signed or unsigned 16-bit integer from a and zero
-// extends.
-// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx
-// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
-#define _mm_extract_epi16(a, imm) \
- vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
-
-// Inserts the least significant 16 bits of b into the selected 16-bit integer
-// of a.
-// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx
-// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
-// __constrange(0,8) int imm)
-#define _mm_insert_epi16(a, b, imm) \
- __extension__({ \
- vreinterpretq_m128i_s16( \
- vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \
- })
-
-// Copy a to dst, and insert the 16-bit integer i into dst at the location
-// specified by imm8.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16
-#define _mm_insert_pi16(a, b, imm) \
- __extension__({ \
- vreinterpret_m64_s16( \
- vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \
- })
-
-// Extracts the selected signed or unsigned 32-bit integer from a and zero
-// extends.
-// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
-#define _mm_extract_epi32(a, imm) \
- vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
-
-// Extracts the selected single-precision (32-bit) floating-point element
-// from a, as an integer bit pattern.
-// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
-#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
-
-// Inserts the least significant 32 bits of b into the selected 32-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
-// __constrange(0,4) int imm)
-#define _mm_insert_epi32(a, b, imm) \
- __extension__({ \
- vreinterpretq_m128i_s32( \
- vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \
- })
-
-// Extracts the selected signed or unsigned 64-bit integer from a and zero
-// extends.
-// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
-#define _mm_extract_epi64(a, imm) \
- vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
-
-// Inserts the least significant 64 bits of b into the selected 64-bit integer
-// of a.
-// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
-// __constrange(0,2) int imm)
-#define _mm_insert_epi64(a, b, imm) \
- __extension__({ \
- vreinterpretq_m128i_s64( \
- vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \
- })
-
-// Count the number of bits set to 1 in unsigned 32-bit integer a, and
-// return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32
-FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcount)
- return __builtin_popcount(a);
-#else
- return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
-#endif
-#else
- uint32_t count = 0;
- uint8x8_t input_val, count8x8_val;
- uint16x4_t count16x4_val;
- uint32x2_t count32x2_val;
-
- input_val = vld1_u8((uint8_t *) &a);
- count8x8_val = vcnt_u8(input_val);
- count16x4_val = vpaddl_u8(count8x8_val);
- count32x2_val = vpaddl_u16(count16x4_val);
-
- vst1_u32(&count, count32x2_val);
- return count;
-#endif
-}
-
-// Count the number of bits set to 1 in unsigned 64-bit integer a, and
-// return that count in dst.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64
-FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
-{
-#if defined(__aarch64__)
-#if __has_builtin(__builtin_popcountll)
- return __builtin_popcountll(a);
-#else
- return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
-#endif
-#else
- uint64_t count = 0;
- uint8x8_t input_val, count8x8_val;
- uint16x4_t count16x4_val;
- uint32x2_t count32x2_val;
- uint64x1_t count64x1_val;
-
- input_val = vld1_u8((uint8_t *) &a);
- count8x8_val = vcnt_u8(input_val);
- count16x4_val = vpaddl_u8(count8x8_val);
- count32x2_val = vpaddl_u16(count16x4_val);
- count64x1_val = vpaddl_u32(count32x2_val);
- vst1_u64(&count, count64x1_val);
- return count;
-#endif
-}
-
-// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
-// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
-// transposed matrix in these vectors (row0 now contains column 0, etc.).
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS
-#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
- do { \
- float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
- float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
- row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
- vget_low_f32(ROW23.val[0])); \
- row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
- vget_low_f32(ROW23.val[1])); \
- row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
- vget_high_f32(ROW23.val[0])); \
- row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
- vget_high_f32(ROW23.val[1])); \
- } while (0)
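
A transpose check (hypothetical helper, assuming the SSE1 loads defined
earlier in this header):

static void transpose_demo(void)
{
    float m[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
    __m128 r0 = _mm_loadu_ps(m + 0), r1 = _mm_loadu_ps(m + 4);
    __m128 r2 = _mm_loadu_ps(m + 8), r3 = _mm_loadu_ps(m + 12);
    _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
    /* r0 now holds column 0: {0, 4, 8, 12}; r1 holds {1, 5, 9, 13}; etc. */
    (void) r0; (void) r1; (void) r2; (void) r3;
}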
-
-/* Crypto Extensions */
-
-#if defined(__ARM_FEATURE_CRYPTO)
-// Wraps vmull_p64
-FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
- poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
- poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
- return vreinterpretq_u64_p128(vmull_p64(a, b));
-}
-#else // ARMv7 polyfill
-// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
-//
-// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
-// 64-bit->128-bit polynomial multiply.
-//
-// It needs some work and is somewhat slow, but it is still faster than all
-// known scalar methods.
-//
-// Algorithm adapted to C from
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
-// from "Fast Software Polynomial Multiplication on ARM Processors Using the
-// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
-// (https://hal.inria.fr/hal-01506572)
-static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
-{
- poly8x8_t a = vreinterpret_p8_u64(_a);
- poly8x8_t b = vreinterpret_p8_u64(_b);
-
- // Masks
- uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
- vcreate_u8(0x00000000ffffffff));
- uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
- vcreate_u8(0x0000000000000000));
-
- // Do the multiplies, rotating with vext to get all combinations
- uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0
- uint8x16_t e =
- vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1
- uint8x16_t f =
- vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0
- uint8x16_t g =
- vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2
- uint8x16_t h =
- vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0
- uint8x16_t i =
- vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3
- uint8x16_t j =
- vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0
- uint8x16_t k =
-        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // K = A0 * B4
-
- // Add cross products
- uint8x16_t l = veorq_u8(e, f); // L = E + F
- uint8x16_t m = veorq_u8(g, h); // M = G + H
- uint8x16_t n = veorq_u8(i, j); // N = I + J
-
- // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
- // instructions.
-#if defined(__aarch64__)
- uint8x16_t lm_p0 = vreinterpretq_u8_u64(
- vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
- uint8x16_t lm_p1 = vreinterpretq_u8_u64(
- vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
- uint8x16_t nk_p0 = vreinterpretq_u8_u64(
- vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
- uint8x16_t nk_p1 = vreinterpretq_u8_u64(
- vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
-#else
- uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
- uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
- uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
- uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
-#endif
- // t0 = (L) (P0 + P1) << 8
- // t1 = (M) (P2 + P3) << 16
- uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
- uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
- uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);
-
- // t2 = (N) (P4 + P5) << 24
- // t3 = (K) (P6 + P7) << 32
- uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
- uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
- uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);
-
- // De-interleave
-#if defined(__aarch64__)
- uint8x16_t t0 = vreinterpretq_u8_u64(
- vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
- uint8x16_t t1 = vreinterpretq_u8_u64(
- vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
- uint8x16_t t2 = vreinterpretq_u8_u64(
- vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
- uint8x16_t t3 = vreinterpretq_u8_u64(
- vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
-#else
- uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
- uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
- uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
- uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
-#endif
- // Shift the cross products
- uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8
- uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16
- uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24
- uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32
-
- // Accumulate the products
- uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
- uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
- uint8x16_t mix = veorq_u8(d, cross1);
- uint8x16_t r = veorq_u8(mix, cross2);
- return vreinterpretq_u64_u8(r);
-}
-#endif // ARMv7 polyfill
-
-FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
-{
- uint64x2_t a = vreinterpretq_u64_m128i(_a);
- uint64x2_t b = vreinterpretq_u64_m128i(_b);
- switch (imm & 0x11) {
- case 0x00:
- return vreinterpretq_m128i_u64(
- _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
- case 0x01:
- return vreinterpretq_m128i_u64(
- _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
- case 0x10:
- return vreinterpretq_m128i_u64(
- _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
- case 0x11:
- return vreinterpretq_m128i_u64(
- _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
- default:
- abort();
- }
-}
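
A carry-less multiply in miniature (hypothetical helper): the cross terms XOR
away, so (x+1)*(x+1) = x^2 + 1 rather than x^2 + 2x + 1.

static void clmul_demo(void)
{
    __m128i a = _mm_set_epi64x(0, 3); /* the polynomial x + 1 */
    __m128i b = _mm_set_epi64x(0, 3);
    __m128i r = _mm_clmulepi64_si128(a, b, 0x00); /* low word: 5 == x^2 + 1 */
    (void) r;
}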
-
-#if !defined(__ARM_FEATURE_CRYPTO)
-/* clang-format off */
-#define SSE2NEON_AES_DATA(w) \
- { \
- w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
- w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
- w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
- w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
- w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
- w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
- w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
- w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
- w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
- w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
- w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
- w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
- w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
- w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
- w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
- w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
- w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
- w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
- w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
- w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
- w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
- w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
- w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
- w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
- w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
- w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
- w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
- w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
- w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
- w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
- w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
- w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
- w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
- w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
- w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
- w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
- w(0xb0), w(0x54), w(0xbb), w(0x16) \
- }
-/* clang-format on */
-
-/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
-#define SSE2NEON_AES_H0(x) (x)
-static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0);
-#undef SSE2NEON_AES_H0
-
-// In the absence of crypto extensions, implement aesenc using regular neon
-// intrinsics instead. See:
-// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
-// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and
-// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52
-// for more information. Reproduced with permission of the author.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey)
-{
-#if defined(__aarch64__)
- static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9,
- 0xe, 0x3, 0x8, 0xd, 0x2, 0x7,
- 0xc, 0x1, 0x6, 0xb};
- static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
- 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc};
-
- uint8x16_t v;
- uint8x16_t w = vreinterpretq_u8_m128i(EncBlock);
-
- // shift rows
- w = vqtbl1q_u8(w, vld1q_u8(shift_rows));
-
- // sub bytes
- v = vqtbl4q_u8(vld1q_u8_x4(SSE2NEON_sbox), w);
- v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40);
- v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80);
- v = vqtbx4q_u8(v, vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0);
-
- // mix columns
- w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b);
- w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
- w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
-
- // add round key
- return vreinterpretq_m128i_u8(w) ^ RoundKey;
-
-#else /* ARMv7-A NEON implementation */
-#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
- (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \
- (b0))
-#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
-#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
-#define SSE2NEON_AES_U0(p) \
- SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
-#define SSE2NEON_AES_U1(p) \
- SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
-#define SSE2NEON_AES_U2(p) \
- SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
-#define SSE2NEON_AES_U3(p) \
- SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))
- static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
- SSE2NEON_AES_DATA(SSE2NEON_AES_U0),
- SSE2NEON_AES_DATA(SSE2NEON_AES_U1),
- SSE2NEON_AES_DATA(SSE2NEON_AES_U2),
- SSE2NEON_AES_DATA(SSE2NEON_AES_U3),
- };
-#undef SSE2NEON_AES_B2W
-#undef SSE2NEON_AES_F2
-#undef SSE2NEON_AES_F3
-#undef SSE2NEON_AES_U0
-#undef SSE2NEON_AES_U1
-#undef SSE2NEON_AES_U2
-#undef SSE2NEON_AES_U3
-
- uint32_t x0 = _mm_cvtsi128_si32(EncBlock);
- uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55));
- uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA));
- uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF));
-
- __m128i out = _mm_set_epi32(
- (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
- aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
- (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
- aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
- (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
- aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
- (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
- aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));
-
- return _mm_xor_si128(out, RoundKey);
-#endif
-}
-
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
-    /* FIXME: not yet optimized for NEON */
- uint8_t v[4][4] = {
- {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]},
- {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]},
- {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]},
- {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)],
- SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]},
- };
- for (int i = 0; i < 16; i++)
- vreinterpretq_nth_u8_m128i(a, i) =
- v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i);
- return a;
-}
-
-// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
-// This instruction generates a round key for AES encryption. See
-// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
-// for details.
-//
-// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon)
-{
- uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55));
- uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF));
- for (int i = 0; i < 4; ++i) {
- ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]];
- ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]];
- }
- return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
- ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
-}
-#undef SSE2NEON_AES_DATA
-
-#else /* __ARM_FEATURE_CRYPTO */
-// Implements the equivalent of 'aesenc' by combining AESE (with an empty key)
-// and AESMC, then manually applying the real key as an XOR operation. This
-// unfortunately costs an extra XOR; the compiler should be able to optimize
-// it away for repeated calls. See
-// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
-// for more details.
-FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
-{
- return vreinterpretq_m128i_u8(
- vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^
- vreinterpretq_u8_m128i(b));
-}
-
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128
-FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
-{
- return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
- vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
- RoundKey);
-}
-
-FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
-{
- // AESE does ShiftRows and SubBytes on A
- uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));
-
- uint8x16_t dest = {
- // Undo ShiftRows step from AESE and extract X1 and X3
- u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1)
- u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1))
- u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3)
- u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3))
- };
- uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
- return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
-}
-#endif
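
The standard AES-128 key-expansion idiom built on aeskeygenassist (a usage
sketch with a hypothetical helper; works with either implementation above):

static __m128i aes128_expand_step(__m128i key, __m128i keygened)
{
    keygened = _mm_shuffle_epi32(keygened, 0xFF); /* broadcast the rcon word */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, keygened);
}
/* e.g. round key 1 = aes128_expand_step(k0, _mm_aeskeygenassist_si128(k0, 0x01)) */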
-
-/* Streaming Extensions */
-
-// Guarantees that every preceding store is globally visible before any
-// subsequent store.
-// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_sfence(void)
-{
- __sync_synchronize();
-}
-
-// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
-// point elements) from a into memory using a non-temporal memory hint.
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps
-FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
- __builtin_nontemporal_store(a, (float32x4_t *) p);
-#else
- vst1q_f32(p, vreinterpretq_f32_m128(a));
-#endif
-}
-
-// Stores the data in a to the address p without polluting the caches. If the
-// cache line containing address p is already in the cache, the cache will be
-// updated.
-// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx
-FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
-{
-#if __has_builtin(__builtin_nontemporal_store)
- __builtin_nontemporal_store(a, p);
-#else
- vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
-#endif
-}
-
-// Load 128-bits of integer data from memory into dst using a non-temporal
-// memory hint. mem_addr must be aligned on a 16-byte boundary or a
-// general-protection exception may be generated.
-//
-// dst[127:0] := MEM[mem_addr+127:mem_addr]
-//
-// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128
-FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
-{
-#if __has_builtin(__builtin_nontemporal_load)
- return __builtin_nontemporal_load(p);
-#else
- return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
-#endif
-}
-
-// Cache line containing p is flushed and invalidated from all caches in the
-// coherency domain.
-// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx
-FORCE_INLINE void _mm_clflush(void const *p)
-{
- (void) p;
-    // NEON has no direct equivalent; treated as a no-op.
-}
-
-// Allocate aligned blocks of memory.
-// https://software.intel.com/en-us/
-// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks
-FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
-{
- void *ptr;
- if (align == 1)
- return malloc(size);
- if (align == 2 || (sizeof(void *) == 8 && align == 4))
- align = sizeof(void *);
- if (!posix_memalign(&ptr, align, size))
- return ptr;
- return NULL;
-}
-
-FORCE_INLINE void _mm_free(void *addr)
-{
- free(addr);
-}
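
Pairing sketch (hypothetical helper) -- portable code must release memory
from _mm_malloc with _mm_free, not plain free():

static void malloc_demo(void)
{
    float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
    if (buf) {
        _mm_store_ps(buf, _mm_set1_ps(0.0f)); /* 16-byte aligned, so safe */
        _mm_free(buf);
    }
}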
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 8-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
- __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
- : [c] "+r"(crc)
- : [v] "r"(v));
-#else
- crc ^= v;
- for (int bit = 0; bit < 8; bit++) {
- if (crc & 1)
- crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
- else
- crc = (crc >> 1);
- }
-#endif
- return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 16-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
- __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
- : [c] "+r"(crc)
- : [v] "r"(v));
-#else
- crc = _mm_crc32_u8(crc, v & 0xff);
- crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
-#endif
- return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 32-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100)
-FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
- __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
- : [c] "+r"(crc)
- : [v] "r"(v));
-#else
- crc = _mm_crc32_u16(crc, v & 0xffff);
- crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
-#endif
- return crc;
-}
-
-// Starting with the initial value in crc, accumulates a CRC32 value for
-// unsigned 64-bit integer v.
-// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100)
-FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
-{
-#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
- __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
- : [c] "+r"(crc)
- : [v] "r"(v));
-#else
- crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff);
- crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff);
-#endif
- return crc;
-}
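
Byte-at-a-time CRC-32C over a buffer, framed with the conventional initial
value and final inversion (hypothetical helper):

#include <stddef.h>
#include <stdint.h>

static uint32_t crc32c_buf(const uint8_t *p, size_t n)
{
    uint32_t crc = 0xFFFFFFFFu;
    while (n--)
        crc = _mm_crc32_u8(crc, *p++);
    return crc ^ 0xFFFFFFFFu;
}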
-
-#if defined(__GNUC__) || defined(__clang__)
-#pragma pop_macro("ALIGN_STRUCT")
-#pragma pop_macro("FORCE_INLINE")
-#endif
-
-#if defined(__GNUC__)
-#pragma GCC pop_options
-#endif
-
-#endif
diff --git a/soxr/src/std-types.h b/soxr/src/std-types.h
new file mode 100644
index 0000000..c5e8636
--- /dev/null
+++ b/soxr/src/std-types.h
@@ -0,0 +1,48 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_std_types_included
+#define soxr_std_types_included
+
+#include "soxr-config.h"
+
+#include <limits.h>
+
+#if HAVE_STDBOOL_H
+  #include <stdbool.h>
+#else
+ #undef bool
+ #undef false
+ #undef true
+ #define bool int
+ #define false 0
+ #define true 1
+#endif
+
+#if HAVE_STDINT_H
+  #include <stdint.h>
+#else
+ #undef int16_t
+ #undef int32_t
+ #undef int64_t
+ #undef uint32_t
+ #undef uint64_t
+ #define int16_t short
+ #if LONG_MAX > 2147483647L
+ #define int32_t int
+ #define int64_t long
+ #elif LONG_MAX < 2147483647L
+ #error this library requires that 'long int' has at least 32-bits
+ #else
+ #define int32_t long
+ #if defined _MSC_VER
+ #define int64_t __int64
+ #else
+ #define int64_t long long
+ #endif
+ #endif
+ #define uint32_t unsigned int32_t
+ #define uint64_t unsigned int64_t
+#endif
+
+#endif
diff --git a/soxr/src/simd.c b/soxr/src/util-simd.c
similarity index 69%
rename from soxr/src/simd.c
rename to soxr/src/util-simd.c
index 7659ab9..ec548fd 100644
--- a/soxr/src/simd.c
+++ b/soxr/src/util-simd.c
@@ -1,15 +1,15 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "simd.h"
-#include "simd-dev.h"
-#define SIMD_ALIGNMENT (sizeof(float) * 4)
+#include "soxr-config.h"
-void * _soxr_simd_aligned_malloc(size_t size)
+#define SIMD_ALIGNMENT (sizeof(float) * (1 + (PFFFT_DOUBLE|AVCODEC_FOUND)) * 4)
+
+void * SIMD_ALIGNED_MALLOC(size_t size)
{
char * p1 = 0, * p = malloc(size + SIMD_ALIGNMENT);
if (p) {
@@ -21,9 +21,9 @@ void * _soxr_simd_aligned_malloc(size_t size)
-void * _soxr_simd_aligned_calloc(size_t nmemb, size_t size)
+void * SIMD_ALIGNED_CALLOC(size_t nmemb, size_t size)
{
- void * p = _soxr_simd_aligned_malloc(nmemb * size);
+ void * p = SIMD_ALIGNED_MALLOC(nmemb * size);
if (p)
memset(p, 0, nmemb * size);
return p;
@@ -31,7 +31,7 @@ void * _soxr_simd_aligned_calloc(size_t nmemb, size_t size)
-void _soxr_simd_aligned_free(void * p1)
+void SIMD_ALIGNED_FREE(void * p1)
{
if (p1)
free(*((void * *)p1 - 1));
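
The scheme above over-allocates by SIMD_ALIGNMENT, rounds the pointer up, and
stashes the original malloc() result in the slot just below the aligned block
so the free routine can recover it. A minimal standalone version (hypothetical
names, 16-byte alignment assumed):

#include <stdlib.h>

#define ALIGN 16 /* assumption: 4 floats, as in a single-precision build */

static void *aligned_malloc(size_t size)
{
    char *p = malloc(size + ALIGN), *p1;
    if (!p)
        return 0;
    /* Round up; malloc's own >= sizeof(void *) alignment keeps the slot
     * below the result inside this allocation. */
    p1 = (char *) (((size_t) p + ALIGN) & ~(size_t) (ALIGN - 1));
    ((void **) p1)[-1] = p; /* remember the real start */
    return p1;
}

static void aligned_free(void *p1)
{
    if (p1)
        free(((void **) p1)[-1]);
}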
@@ -39,11 +39,16 @@ void _soxr_simd_aligned_free(void * p1)
-void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float * b)
+#define PFFT_MACROS_ONLY
+#include "pffft.c"
+
+
+
+void ORDERED_CONVOLVE_SIMD(int n, void * not_used, float * a, float const * b)
{
int i;
float ab0, ab1;
- v4sf * /*RESTRICT*/ va = (v4sf *)a;
+ v4sf * RESTRICT va = (v4sf *)a;
v4sf const * RESTRICT vb = (v4sf const *)b;
assert(VALIGNED(a) && VALIGNED(b));
ab0 = a[0] * b[0], ab1 = a[1] * b[1];
@@ -62,11 +67,11 @@ void _soxr_ordered_convolve_simd(int n, void * not_used, float * a, const float
-void _soxr_ordered_partial_convolve_simd(int n, float * a, const float * b)
+void ORDERED_PARTIAL_CONVOLVE_SIMD(int n, float * a, float const * b)
{
int i;
float ab0;
- v4sf * /*RESTRICT*/ va = (v4sf *)a;
+ v4sf * RESTRICT va = (v4sf *)a;
v4sf const * RESTRICT vb = (v4sf const *)b;
assert(VALIGNED(a) && VALIGNED(b));
ab0 = a[0] * b[0];
diff --git a/soxr/src/util32s.c b/soxr/src/util32s.c
new file mode 100644
index 0000000..b9c9e08
--- /dev/null
+++ b/soxr/src/util32s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define PFFFT_DOUBLE 0
+
+#include "util32s.h"
+
+#include "util-simd.c"
diff --git a/soxr/src/util32s.h b/soxr/src/util32s.h
new file mode 100644
index 0000000..12226e5
--- /dev/null
+++ b/soxr/src/util32s.h
@@ -0,0 +1,23 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_util32s_included
+#define soxr_util32s_included
+
+#include <stddef.h>
+
+void * _soxr_simd32_aligned_malloc(size_t);
+void * _soxr_simd32_aligned_calloc(size_t, size_t);
+void _soxr_simd32_aligned_free(void *);
+
+#define SIMD_ALIGNED_MALLOC _soxr_simd32_aligned_malloc
+#define SIMD_ALIGNED_CALLOC _soxr_simd32_aligned_calloc
+#define SIMD_ALIGNED_FREE _soxr_simd32_aligned_free
+
+void _soxr_ordered_convolve_simd32(int n, void * not_used, float * a, float const * b);
+void _soxr_ordered_partial_convolve_simd32(int n, float * a, float const * b);
+
+#define ORDERED_CONVOLVE_SIMD _soxr_ordered_convolve_simd32
+#define ORDERED_PARTIAL_CONVOLVE_SIMD _soxr_ordered_partial_convolve_simd32
+
+#endif
diff --git a/soxr/src/util64s.c b/soxr/src/util64s.c
new file mode 100644
index 0000000..0faa9e9
--- /dev/null
+++ b/soxr/src/util64s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#define PFFFT_DOUBLE 1
+
+#include "util64s.h"
+
+#include "util-simd.c"
diff --git a/soxr/src/util64s.h b/soxr/src/util64s.h
new file mode 100644
index 0000000..7beeb89
--- /dev/null
+++ b/soxr/src/util64s.h
@@ -0,0 +1,23 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#if !defined soxr_util64s_included
+#define soxr_util64s_included
+
+#include <stddef.h>
+
+void * _soxr_simd64_aligned_malloc(size_t);
+void * _soxr_simd64_aligned_calloc(size_t, size_t);
+void _soxr_simd64_aligned_free(void *);
+
+#define SIMD_ALIGNED_MALLOC _soxr_simd64_aligned_malloc
+#define SIMD_ALIGNED_CALLOC _soxr_simd64_aligned_calloc
+#define SIMD_ALIGNED_FREE _soxr_simd64_aligned_free
+
+void _soxr_ordered_convolve_simd64(int n, void * not_used, double * a, double const * b);
+void _soxr_ordered_partial_convolve_simd64(int n, double * a, double const * b);
+
+#define ORDERED_CONVOLVE_SIMD _soxr_ordered_convolve_simd64
+#define ORDERED_PARTIAL_CONVOLVE_SIMD _soxr_ordered_partial_convolve_simd64
+
+#endif
diff --git a/soxr/src/vr-coefs.c b/soxr/src/vr-coefs.c
index 14886df..a57bec8 100644
--- a/soxr/src/vr-coefs.c
+++ b/soxr/src/vr-coefs.c
@@ -103,6 +103,9 @@ static void iir(int N, double Fp, char const * name)
int main(int argc, char **argv)
{
+ puts("/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net");
+ puts(" * Licence for this file: LGPL v2.1 See LICENCE for details. */\n");
+
fir(241, 1, .45, .5, 160, 32, "half_fir_coefs");
fir( 24, .5, .25, .5, 1, 31, "fast_half_fir_coefs");
fir( 20, 12, .9 , 1.5, 160, 58, "coefs0_d");
diff --git a/soxr/src/vr-coefs.h b/soxr/src/vr-coefs.h
index 9790ec0..e44138e 100644
--- a/soxr/src/vr-coefs.h
+++ b/soxr/src/vr-coefs.h
@@ -1,3 +1,6 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
static float const half_fir_coefs[] = {
0.471112154f, 0.316907549f, 0.0286963396f, -0.101927032f,
-0.0281272982f, 0.0568029535f, 0.027196876f, -0.0360795942f,
diff --git a/soxr/src/vr32.c b/soxr/src/vr32.c
index 65eed3f..5159603 100644
--- a/soxr/src/vr32.c
+++ b/soxr/src/vr32.c
@@ -1,16 +1,10 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
/* Variable-rate resampling. */
 #include <assert.h>
-#include <math.h>
-#if !defined M_PI
-#define M_PI 3.14159265358979323846
-#endif
-#if !defined M_LN2
-#define M_LN2 0.69314718055994530942
-#endif
+#include "math-wrap.h"
 #include <stdlib.h>
 #include <string.h>
#include "internal.h"
@@ -197,7 +191,7 @@ static float poly_fir1_u(float const * input, uint32_t frac)
typedef struct {
union {
int64_t all;
-#if WORDS_BIGENDIAN
+#if HAVE_BIGENDIAN
struct {int32_t integer; uint32_t frac;} part;
#else
struct {uint32_t frac; int32_t integer;} part;
@@ -316,7 +310,7 @@ static void vr_init(rate_t * p, double default_io_ratio, int num_stages, double
}
fifo_create(&p->output_fifo, sizeof(float));
p->default_io_ratio = default_io_ratio;
- if (!fade_coefs[0]) {
+ if (fade_coefs[0]==0) {
for (i = 0; i < iAL(fade_coefs); ++i)
fade_coefs[i] = (float)(.5 * (1 + cos(M_PI * i / (AL(fade_coefs) - 1))));
prepare_coefs(poly_fir_coefs_u, POLY_FIR_LEN_U, PHASES0_U, PHASES_U, coefs0_u, mult);
@@ -354,8 +348,9 @@ static bool set_step_step(stream_t * p, double io_ratio, int slew_len)
return p->step_step.all != 0;
}
-static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len)
+static void vr_set_io_ratio(void * P, double io_ratio, size_t slew_len)
{
+ rate_t *p = P;
assert(io_ratio > 0);
if (slew_len) {
if (!set_step_step(&p->current, io_ratio, p->slew_len = (int)slew_len))
@@ -367,7 +362,7 @@ static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len)
}
}
else {
- if (p->default_io_ratio) { /* Then this is the first call to this fn. */
+ if (p->default_io_ratio!=0) { /* Then this is the first call to this fn. */
int octave = (int)floor(log(io_ratio) / M_LN2);
p->current.stage_num = octave < 0? -1 : min(octave, p->num_stages0-1);
enter_new_stage(p, 0);
@@ -375,7 +370,7 @@ static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len)
else if (p->fade_len)
set_step(&p->fadeout, io_ratio);
set_step(&p->current, io_ratio);
- if (p->default_io_ratio) FRAC(p->current.at) = FRAC(p->current.step) >> 1;
+ if (p->default_io_ratio!=0) FRAC(p->current.at) = FRAC(p->current.step) >> 1;
p->default_io_ratio = 0;
}
}
@@ -427,10 +422,11 @@ static bool do_input_stage(rate_t * p, int stage_num, int sign, int min_stage_nu
return true;
}
-static int vr_process(rate_t * p, int olen0)
+static void vr_process(void * P, size_t olen0)
{
+ rate_t *p = P;
assert(p->num_stages > 0);
- if (p->default_io_ratio)
+ if (p->default_io_ratio!=0)
vr_set_io_ratio(p, p->default_io_ratio, 0);
{
float * output = fifo_reserve(&p->output_fifo, olen0);
@@ -462,7 +458,7 @@ static int vr_process(rate_t * p, int olen0)
olen = min(olen, (int)(AL(buf) >> 1));
if (p->slew_len)
olen = min(olen, p->slew_len);
- else if (p->new_io_ratio) {
+ else if (p->new_io_ratio!=0) {
set_step(&p->current, p->new_io_ratio);
set_step(&p->fadeout, p->new_io_ratio);
p->fadeout.step_step.all = p->current.step_step.all = 0;
@@ -568,17 +564,18 @@ static int vr_process(rate_t * p, int olen0)
fifo_read(&p->stages[i].fifo, idone, NULL);
}
fifo_trim_by(&p->output_fifo, olen0 - odone0);
- return odone0;
}
}
-static float * vr_input(rate_t * p, float const * input, size_t n)
+static void * vr_input(void * p, void * input, size_t n)
{
- return fifo_write(&p->stages[0].fifo, (int)n, input);
+ return fifo_write(&((rate_t *)p)->stages[0].fifo, (int)n, input);
}
-static float const * vr_output(rate_t * p, float * output, size_t * n)
+static void const * vr_output(void * P, void * O, size_t * n)
{
+ rate_t *p = P;
+ float *output = O;
fifo_t * fifo = &p->output_fifo;
if (1 || !p->num_stages0)
return fifo_read(fifo, (int)(*n = min(*n, (size_t)fifo_occupancy(fifo))), output);
@@ -594,17 +591,19 @@ static float const * vr_output(rate_t * p, float * output, size_t * n)
}
}
-static void vr_flush(rate_t * p)
+static void vr_flush(void * P)
{
+ rate_t *p = P;
if (!p->flushing) {
stage_preload(&p->stages[0]);
++p->flushing;
}
}
-static void vr_close(rate_t * p)
+static void vr_close(void * P)
{
int i;
+ rate_t *p = P;
fifo_delete(&p->output_fifo);
for (i = -1; i < p->num_stages; ++i) {
@@ -614,7 +613,7 @@ static void vr_close(rate_t * p)
free(p->stages - 1);
}
-static double vr_delay(rate_t * p)
+static double vr_delay(void * p)
{
return 100; /* TODO */
(void)p;
@@ -639,19 +638,20 @@ static char const * vr_create(void * channel, void * shared,double max_io_ratio,
static char const * vr_id(void)
{
- return "single-precision variable-rate";
+ return "vr32";
}
-typedef void (* fn_t)(void);
-fn_t _soxr_vr32_cb[] = {
- (fn_t)vr_input,
- (fn_t)vr_process,
- (fn_t)vr_output,
- (fn_t)vr_flush,
- (fn_t)vr_close,
- (fn_t)vr_delay,
- (fn_t)vr_sizes,
- (fn_t)vr_create,
- (fn_t)vr_set_io_ratio,
- (fn_t)vr_id,
+#include "cb_t.h"
+
+control_block_t _soxr_vr32_cb = {
+ vr_input,
+ vr_process,
+ vr_output,
+ vr_flush,
+ vr_close,
+ vr_delay,
+ vr_sizes,
+ vr_create,
+ vr_set_io_ratio,
+ vr_id,
};
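
Note what this last hunk buys: the old table cast every callback through a generic void (*)(void) and back at the call site, which is undefined behaviour in C; the new control_block_t keeps each pointer at its real type, which is also why the vr_* functions above gained void * first parameters. cb_t.h itself is not part of this hunk; inferred from the initializer and the new signatures, it plausibly looks like the following sketch (hypothetical reconstruction; the real header may differ):

    /* Sketch of cb_t.h; field types inferred from the vr_* definitions. */
    #include <stddef.h>

    typedef struct {
      void *       (* input)(void * p, void * samples, size_t n);
      void         (* process)(void * p, size_t olen);
      void const * (* output)(void * p, void * samples, size_t * n);
      void         (* flush)(void * p);
      void         (* close)(void * p);
      double       (* delay)(void * p);
      void         (* sizes)(size_t * shared, size_t * channel);
      char const * (* create)(void * channel, void * shared, double max_io_ratio,
                              void * q_spec, void * r_spec, double scale);
      void         (* set_io_ratio)(void * p, double io_ratio, size_t slew_len);
      char const * (* id)(void);
    } control_block_t;
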
diff --git a/soxr/src/vr32s.c b/soxr/src/vr32s.c
deleted file mode 100644
index cf0fdaa..0000000
--- a/soxr/src/vr32s.c
+++ /dev/null
@@ -1,665 +0,0 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
- * Licence for this file: LGPL v2.1 See LICENCE for details. */
-
-/* Variable-rate resampling. */
-
-#include <assert.h>
-#include <math.h>
-#if !defined M_PI
-#define M_PI 3.14159265358979323846
-#endif
-#if !defined M_LN2
-#define M_LN2 0.69314718055994530942
-#endif
-#include <stdlib.h>
-#include <string.h>
-#if defined(__x86_64__) || defined(_M_X64)
-#include <emmintrin.h>
-#elif defined(__ARM_NEON)
-#include "sse2neon.h"
-#endif
-#include "internal.h"
-#define FIFO_SIZE_T int
-#define FIFO_MIN 0x8000
-#include "fifo.h"
-#include "vr-coefs.h"
-
-#define FADE_LEN_BITS 9
-#define PHASE_BITS_D 10
-#define PHASE_BITS_U 9
-
-#define PHASES0_D 12
-#define POLY_FIR_LEN_D 20
-#define POLY_FIR_LEN_D_VEC (POLY_FIR_LEN_D / 4)
-#define PHASES0_U 6
-#define POLY_FIR_LEN_U 12
-#define POLY_FIR_LEN_U_VEC (POLY_FIR_LEN_U / 4)
-
-#define MULT32 (65536. * 65536.)
-#define PHASES_D (1 << PHASE_BITS_D)
-#define PHASES_U (1 << PHASE_BITS_U)
-
-#define CONVOLVE \
- _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \
- _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \
- _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
-
-#define HALF_FIR_LEN_2 (iAL(half_fir_coefs) - 1)
-#define HALF_FIR_LEN_4 (HALF_FIR_LEN_2 / 2)
-
-#define _ sum += (input[-i] + input[i]) * half_fir_coefs[i], ++i;
-static float half_fir(float const * input)
-{
- long i = 1;
- float sum = input[0] * half_fir_coefs[0];
- CONVOLVE CONVOLVE
- assert(i == HALF_FIR_LEN_2 + 1);
- return (float)sum;
-}
-#undef _
-
-#define _ sum += (input[-i] + input[i]) * half_fir_coefs[2*i], ++i;
-static float double_fir0(float const * input)
-{
- int i = 1;
- float sum = input[0] * half_fir_coefs[0];
- CONVOLVE
- assert(i == HALF_FIR_LEN_4 + 1);
- return (float)(sum * 2);
-}
-#undef _
-
-#define _ sum += (input[-i] + input[1+i]) * half_fir_coefs[2*i+1], ++i;
-static float double_fir1(float const * input)
-{
- int i = 0;
- float sum = 0;
- CONVOLVE
- assert(i == HALF_FIR_LEN_4 + 0);
- return (float)(sum * 2);
-}
-#undef _
-
-static float fast_half_fir(float const * input)
-{
- int i = 0;
- float sum = input[0] * .5f;
-#define _ sum += (input[-(2*i+1)] + input[2*i+1]) * fast_half_fir_coefs[i], ++i;
- _ _ _ _ _ _
-#undef _
- return (float)sum;
-}
-
-#define IIR_FILTER _ _ _ _ _ _ _
-#define _ in1=(in1-p->y[i])*iir_coefs[i]+tmp1;tmp1=p->y[i],p->y[i]=in1;++i;\
- in0=(in0-p->y[i])*iir_coefs[i]+tmp0;tmp0=p->y[i],p->y[i]=in0;++i;
-
-typedef struct {float x[2], y[AL(iir_coefs)];} half_iir_t;
-
-static float half_iir1(half_iir_t * p, float in0, float in1)
-{
- int i = 0;
- float tmp0, tmp1;
- tmp0 = p->x[0], p->x[0] = in0;
- tmp1 = p->x[1], p->x[1] = in1;
- IIR_FILTER
- p->y[i] = in1 = (in1 - p->y[i]) * iir_coefs[i] + tmp1;
- return in1 + in0;
-}
-#undef _
-
-static void half_iir(half_iir_t * p, float * obuf, float const * ibuf, int olen)
-{
- int i;
- for (i=0; i < olen; obuf[i] = (float)half_iir1(p, ibuf[i*2], ibuf[i*2+1]),++i);
-}
-
-static void half_phase(half_iir_t * p, float * buf, int len)
-{
- float const small_normal = 1/MULT32/MULT32; /* To quash denormals on path 0.*/
- int i;
- for (i = 0; i < len; buf[i] = (float)half_iir1(p, buf[i], 0), ++i);
-#define _ p->y[i] += small_normal, i += 2;
- i = 0, _ IIR_FILTER
-#undef _
-#define _ p->y[i] -= small_normal, i += 2;
- i = 0, _ IIR_FILTER
-#undef _
-}
-
-#define coefs(coef_p, fir_len, phase_num, coef_vec_num) \
- coef_p[(fir_len) * (phase_num) + (coef_vec_num)]
-
-#define COEF(h,l,i) ((i)<0||(i)>=(l)?0:(h)[(i)>(l)/2?(l)-(i):(i)])
-static void prepare_coefs(__m128 * coefs_a, __m128 * coefs_b,
- int n, int phases0, int phases, float const * coefs0, double multiplier)
-{
- double k[6];
- int length0 = n * phases0, length = n * phases, K0 = iAL(k)/2 - 1, i, j, pos;
- float * coefs1 = malloc(((size_t)length / 2 + 1) * sizeof(*coefs1));
- float * p = coefs1, f0, f1 = 0;
-
- for (j = 0; j < iAL(k); k[j] = COEF(coefs0, length0, j - K0), ++j);
- for (pos = i = 0; i < length0 / 2; ++i) {
- double b=(1/24.)*(k[0]+k[4]+6*k[2]-4*(k[1]+k[3])),d=.5*(k[1]+k[3])-k[2]-b;
- double a=(1/120.)*(k[5]-k[2]-9*(9*b+d)+2.5*(k[3]-k[1])-2*(k[4]-k[0]));
- double c=(1/12.)*(k[4]-k[0]-2*(k[3]-k[1])-60*a),e=.5*(k[3]-k[1])-a-c;
- for (; pos / phases == i; pos += phases0) {
- double x = (double)(pos % phases) / phases;
- *p++ = (float)(k[K0] + ((((a*x + b)*x + c)*x + d)*x + e)*x);
- }
- for (j = 0; j < iAL(k) - 1; k[j] = k[j + 1], ++j);
- k[j] = COEF(coefs0, length0, i + iAL(k) / 2 + 1);
- }
- if (!(length & 1))
- *p++ = (float)k[K0];
- assert(p - coefs1 == length / 2 + 1);
-
- for (i = 0; i < n; ++i) for (j = phases - 1; j >= 0; --j, f1 = f0) {
- pos = (n - 1 - i) * phases + j;
- f0 = COEF(coefs1, length, pos) * (float)multiplier;
- ((float*)&coefs(coefs_a, n / 4, j, i / 4))[i % 4] = (float)f0;
- ((float*)&coefs(coefs_b, n / 4, j, i / 4))[i % 4] = (float)(f1 - f0);
- }
- free(coefs1);
-}
-
-#define _ sum = _mm_add_ps(sum, _mm_mul_ps(_mm_add_ps(_mm_mul_ps(b, x), a), _mm_loadu_ps(&input[i*4]))), ++i;
-#define a (coefs(poly_fir_coefs_d_a, POLY_FIR_LEN_D_VEC, phase, i))
-#define b (coefs(poly_fir_coefs_d_b, POLY_FIR_LEN_D_VEC, phase, i))
-static __m128 poly_fir_coefs_d_a[POLY_FIR_LEN_D_VEC * PHASES_D];
-static __m128 poly_fir_coefs_d_b[POLY_FIR_LEN_D_VEC * PHASES_D];
-
-static float poly_fir1_d(float const * input, uint32_t frac)
-{
- int i = 0, phase = (int)(frac >> (32 - PHASE_BITS_D));
- __m128 sum = _mm_set1_ps(0.f), x = _mm_set1_ps((float)(frac << PHASE_BITS_D) * (float)(1 / MULT32));
- _ _ _ _ _
- assert(i == POLY_FIR_LEN_D_VEC);
- return ((float*)&sum)[0] + ((float*)&sum)[1] + ((float*)&sum)[2] + ((float*)&sum)[3];
-}
-#undef a
-#undef b
-#define a (coefs(poly_fir_coefs_u_a, POLY_FIR_LEN_U_VEC, phase, i))
-#define b (coefs(poly_fir_coefs_u_b, POLY_FIR_LEN_U_VEC, phase, i))
-static __m128 poly_fir_coefs_u_a[POLY_FIR_LEN_U_VEC * PHASES_U];
-static __m128 poly_fir_coefs_u_b[POLY_FIR_LEN_U_VEC * PHASES_U];
-
-static float poly_fir1_u(float const * input, uint32_t frac)
-{
- int i = 0, phase = (int)(frac >> (32 - PHASE_BITS_U));
- __m128 sum = _mm_set1_ps(0.f), x = _mm_set1_ps((float)(frac << PHASE_BITS_U) * (float)(1 / MULT32));
- _ _ _
- assert(i == POLY_FIR_LEN_U_VEC);
- return ((float*)&sum)[0] + ((float*)&sum)[1] + ((float*)&sum)[2] + ((float*)&sum)[3];
-}
-#undef a
-#undef b
-#undef _
-
-#define ADD_TO(x,y) x.all += y.all
-#define SUBTRACT_FROM(x,y) x.all -= y.all
-#define FRAC(x) x.part.frac
-#define INT(x) x.part.integer
-
-typedef struct {
- union {
- int64_t all;
-#if WORDS_BIGENDIAN
- struct {int32_t integer; uint32_t frac;} part;
-#else
- struct {uint32_t frac; int32_t integer;} part;
-#endif
- } at, step, step_step;
- float const * input;
- int len, stage_num;
- bool is_d; /* true: downsampling at x2 rate; false: upsampling at 1x rate. */
- double step_mult;
-} stream_t;
-
-static int poly_fir_d(stream_t * s, float * output, int olen)
-{
- int i;
- float const * input = s->input - POLY_FIR_LEN_D / 2 + 1;
- for (i = 0; i < olen && INT(s->at) < s->len; ++i) {
- output[i] = poly_fir1_d(input + INT(s->at), FRAC(s->at));
- ADD_TO(s->at, s->step);
- if (!(INT(s->at) < s->len)) {
- SUBTRACT_FROM(s->at, s->step);
- break;
- }
- output[++i] = poly_fir1_d(input + INT(s->at), FRAC(s->at));
- ADD_TO(s->at, s->step);
- ADD_TO(s->step, s->step_step);
- }
- return i;
-}
-
-static int poly_fir_fade_d(
- stream_t * s, float const * vol, int step, float * output, int olen)
-{
- int i;
- float const * input = s->input - POLY_FIR_LEN_D / 2 + 1;
- for (i = 0; i < olen && INT(s->at) < s->len; ++i, vol += step) {
- output[i] += *vol * poly_fir1_d(input + INT(s->at), FRAC(s->at));
- ADD_TO(s->at, s->step);
- if (!(INT(s->at) < s->len)) {
- SUBTRACT_FROM(s->at, s->step);
- break;
- }
- output[++i] += *(vol += step) * poly_fir1_d(input + INT(s->at),FRAC(s->at));
- ADD_TO(s->at, s->step);
- ADD_TO(s->step, s->step_step);
- }
- return i;
-}
-
-static int poly_fir_u(stream_t * s, float * output, int olen)
-{
- int i;
- float const * input = s->input - POLY_FIR_LEN_U / 2 + 1;
- for (i = 0; i < olen && INT(s->at) < s->len; ++i) {
- output[i] = poly_fir1_u(input + INT(s->at), FRAC(s->at));
- ADD_TO(s->at, s->step);
- ADD_TO(s->step, s->step_step);
- }
- return i;
-}
-
-static int poly_fir_fade_u(
- stream_t * s, float const * vol, int step, float * output, int olen)
-{
- int i;
- float const * input = s->input - POLY_FIR_LEN_U / 2 + 1;
- for (i = 0; i < olen && INT(s->at) < s->len; i += 2, vol += step) {
- output[i] += *vol * poly_fir1_u(input + INT(s->at), FRAC(s->at));
- ADD_TO(s->at, s->step);
- ADD_TO(s->step, s->step_step);
- }
- return i;
-}
-
-#define shiftr(x,by) ((by) < 0? (x) << (-(by)) : (x) >> (by))
-#define shiftl(x,by) shiftr(x,-(by))
-#define stage_occupancy(s) (fifo_occupancy(&(s)->fifo) - 4*HALF_FIR_LEN_2)
-#define stage_read_p(s) ((float *)fifo_read_ptr(&(s)->fifo) + 2*HALF_FIR_LEN_2)
-#define stage_preload(s) memset(fifo_reserve(&(s)->fifo, (s)->preload), \
- 0, sizeof(float) * (size_t)(s)->preload);
-
-typedef struct {
- fifo_t fifo;
- double step_mult;
- int is_fast, x_fade_len, preload;
-} stage_t;
-
-typedef struct {
- int num_stages0, num_stages, flushing;
- int fade_len, slew_len, xfade, stage_inc, switch_stage_num;
- double new_io_ratio, default_io_ratio;
- stage_t * stages;
- fifo_t output_fifo;
- half_iir_t halfer;
- stream_t current, fadeout; /* Current/fade-in, fadeout streams. */
-} rate_t;
-
-static float fade_coefs[(2 << FADE_LEN_BITS) + 1];
-
-static void vr_init(rate_t * p, double default_io_ratio, int num_stages, double mult)
-{
- int i;
- assert(num_stages >= 0);
- memset(p, 0, sizeof(*p));
-
- p->num_stages0 = num_stages;
- p->num_stages = num_stages = max(num_stages, 1);
- p->stages = (stage_t *)calloc((unsigned)num_stages + 1, sizeof(*p->stages)) + 1;
- for (i = -1; i < p->num_stages; ++i) {
- stage_t * s = &p->stages[i];
- fifo_create(&s->fifo, sizeof(float));
- s->step_mult = 2 * MULT32 / shiftl(2, i);
- s->preload = i < 0? 0 : i == 0? 2 * HALF_FIR_LEN_2 : 3 * HALF_FIR_LEN_2 / 2;
- stage_preload(s);
- s->is_fast = true;
- lsx_debug("%-3i preload=%i", i, s->preload);
- }
- fifo_create(&p->output_fifo, sizeof(float));
- p->default_io_ratio = default_io_ratio;
- if (!fade_coefs[0]) {
- for (i = 0; i < iAL(fade_coefs); ++i)
- fade_coefs[i] = (float)(.5 * (1 + cos(M_PI * i / (AL(fade_coefs) - 1))));
- prepare_coefs(poly_fir_coefs_u_a, poly_fir_coefs_u_b, POLY_FIR_LEN_U, PHASES0_U, PHASES_U, coefs0_u, mult);
- prepare_coefs(poly_fir_coefs_d_a, poly_fir_coefs_d_b, POLY_FIR_LEN_D, PHASES0_D, PHASES_D, coefs0_d, mult *.5);
- }
- assert(fade_coefs[0]);
-}
-
-static void enter_new_stage(rate_t * p, int occupancy0)
-{
- p->current.len = shiftr(occupancy0, p->current.stage_num);
- p->current.input = stage_read_p(&p->stages[p->current.stage_num]);
-
- p->current.step_mult = p->stages[p->current.stage_num].step_mult;
- p->current.is_d = p->current.stage_num >= 0;
- if (p->current.is_d)
- p->current.step_mult *= .5;
-}
-
-static void set_step(stream_t * p, double io_ratio)
-{
- p->step.all = (int64_t)(io_ratio * p->step_mult + .5);
-}
-
-static bool set_step_step(stream_t * p, double io_ratio, int slew_len)
-{
- int64_t dif;
- int difi;
- stream_t tmp = *p;
- set_step(&tmp, io_ratio);
- dif = tmp.step.all - p->step.all;
- dif = dif < 0? dif - (slew_len >> 1) : dif + (slew_len >> 1);
- difi = (int)dif; /* Try to avoid int64_t div. */
- p->step_step.all = difi == dif? difi / slew_len : dif / slew_len;
- return p->step_step.all != 0;
-}
-
-static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len)
-{
- assert(io_ratio > 0);
- if (slew_len) {
- if (!set_step_step(&p->current, io_ratio, p->slew_len = (int)slew_len))
- p->slew_len = 0, p->new_io_ratio = 0, p->fadeout.step_step.all = 0;
- else {
- p->new_io_ratio = io_ratio;
- if (p->fade_len)
- set_step_step(&p->fadeout, io_ratio, p->slew_len);
- }
- }
- else {
- if (p->default_io_ratio) { /* Then this is the first call to this fn. */
- int octave = (int)floor(log(io_ratio) / M_LN2);
- p->current.stage_num = octave < 0? -1 : min(octave, p->num_stages0-1);
- enter_new_stage(p, 0);
- }
- else if (p->fade_len)
- set_step(&p->fadeout, io_ratio);
- set_step(&p->current, io_ratio);
- if (p->default_io_ratio) FRAC(p->current.at) = FRAC(p->current.step) >> 1;
- p->default_io_ratio = 0;
- }
-}
-
-static bool do_input_stage(rate_t * p, int stage_num, int sign, int min_stage_num)
-{
- int i = 0;
- float * dest;
- stage_t * s = &p->stages[stage_num];
- stage_t * s1 = &p->stages[stage_num - sign];
- float const * src = (float *)fifo_read_ptr(&s1->fifo) + HALF_FIR_LEN_2;
- int len = shiftr(fifo_occupancy(&s1->fifo) - HALF_FIR_LEN_2 * 2, sign);
- int already_done = fifo_occupancy(&s->fifo) - s->preload;
- if ((len -= already_done) <= 0)
- return false;
- src += shiftl(already_done, sign);
-
- dest = fifo_reserve(&s->fifo, len);
- if (stage_num < 0) for (; i < len; ++src)
- dest[i++] = double_fir0(src), dest[i++] = double_fir1(src);
- else {
- bool should_be_fast = p->stage_inc;
- if (!s->x_fade_len && stage_num == p->switch_stage_num) {
- p->switch_stage_num = 0;
- if (s->is_fast != should_be_fast) {
- s->x_fade_len = 1 << FADE_LEN_BITS, s->is_fast = should_be_fast, ++p->xfade;
- lsx_debug("xfade level %i, inc?=%i", stage_num, p->stage_inc);
- }
- }
- if (s->x_fade_len) {
- float const * vol1 = fade_coefs + (s->x_fade_len << 1);
- float const * vol2 = fade_coefs + (((1 << FADE_LEN_BITS) - s->x_fade_len) << 1);
- int n = min(len, s->x_fade_len);
- /*lsx_debug("xfade level %i, inc?=%i len=%i n=%i", stage_num, p->stage_inc, s->x_fade_len, n);*/
- if (should_be_fast)
- for (; i < n; vol2 += 2, vol1 -= 2, src += 2)
- dest[i++] = *vol1 * fast_half_fir(src) + *vol2 * half_fir(src);
- else for (; i < n; vol2 += 2, vol1 -= 2, src += 2)
- dest[i++] = *vol2 * fast_half_fir(src) + *vol1 * half_fir(src);
- s->x_fade_len -= n;
- p->xfade -= !s->x_fade_len;
- }
- if (stage_num < min_stage_num)
- for (; i < len; dest[i++] = fast_half_fir(src), src += 2);
- else for (; i < len; dest[i++] = half_fir(src), src += 2);
- }
- if (p->flushing > 0)
- stage_preload(s);
- return true;
-}
-
-static int vr_process(rate_t * p, int olen0)
-{
- assert(p->num_stages > 0);
- if (p->default_io_ratio)
- vr_set_io_ratio(p, p->default_io_ratio, 0);
- {
- float * output = fifo_reserve(&p->output_fifo, olen0);
- int j, odone0 = 0, min_stage_num = p->current.stage_num;
- int occupancy0, max_stage_num = min_stage_num;
- if (p->fade_len) {
- min_stage_num = min(min_stage_num, p->fadeout.stage_num);
- max_stage_num = max(max_stage_num, p->fadeout.stage_num);
- }
-
- for (j = min(min_stage_num, 0); j <= max_stage_num; ++j)
- if (j && !do_input_stage(p, j, j < 0? -1 : 1, min_stage_num))
- break;
- if (p->flushing > 0)
- p->flushing = -1;
-
- occupancy0 = shiftl(max(0,stage_occupancy(&p->stages[max_stage_num])), max_stage_num);
- p->current.len = shiftr(occupancy0, p->current.stage_num);
- p->current.input = stage_read_p(&p->stages[p->current.stage_num]);
- if (p->fade_len) {
- p->fadeout.len = shiftr(occupancy0, p->fadeout.stage_num);
- p->fadeout.input = stage_read_p(&p->stages[p->fadeout.stage_num]);
- }
-
- while (odone0 < olen0) {
- int odone, odone2, olen = olen0 - odone0, stage_dif = 0, shift;
- float buf[64 << 1];
-
- olen = min(olen, (int)(AL(buf) >> 1));
- if (p->slew_len)
- olen = min(olen, p->slew_len);
- else if (p->new_io_ratio) {
- set_step(&p->current, p->new_io_ratio);
- set_step(&p->fadeout, p->new_io_ratio);
- p->fadeout.step_step.all = p->current.step_step.all = 0;
- p->new_io_ratio = 0;
- }
- if (!p->flushing && !p->fade_len && !p->xfade) {
- if (p->current.is_d) {
- if (INT(p->current.step) && FRAC(p->current.step))
- stage_dif = 1, ++max_stage_num;
- else if (!INT(p->current.step) && FRAC(p->current.step) < (1u << 31))
- stage_dif = -1, --min_stage_num;
- } else if (INT(p->current.step) > 1 && FRAC(p->current.step))
- stage_dif = 1, ++max_stage_num;
- }
- if (stage_dif) {
- int n = p->current.stage_num + stage_dif;
- if (n >= p->num_stages)
- --max_stage_num;
- else {
- p->stage_inc = stage_dif > 0;
- p->fadeout = p->current;
- p->current.stage_num += stage_dif;
- if (!p->stage_inc)
- p->switch_stage_num = p->current.stage_num;
- if ((p->current.stage_num < 0 && stage_dif < 0) ||
- (p->current.stage_num > 0 && stage_dif > 0)) {
- stage_t * s = &p->stages[p->current.stage_num];
- fifo_clear(&s->fifo);
- stage_preload(s);
- s->is_fast = false;
- do_input_stage(p, p->current.stage_num, stage_dif, p->current.stage_num);
- }
- if (p->current.stage_num > 0 && stage_dif < 0) {
- int idone = INT(p->current.at);
- stage_t * s = &p->stages[p->current.stage_num];
- fifo_trim_to(&s->fifo, 2 * HALF_FIR_LEN_2 + idone + (POLY_FIR_LEN_D >> 1));
- do_input_stage(p, p->current.stage_num, 1, p->current.stage_num);
- }
- enter_new_stage(p, occupancy0);
- shift = -stage_dif;
-#define lshift(x,by) (x)=(by)>0?(x)<<(by):(x)>>-(by)
- lshift(p->current.at.all, shift);
- shift += p->fadeout.is_d - p->current.is_d;
- lshift(p->current.step.all, shift);
- lshift(p->current.step_step.all, shift);
- p->fade_len = AL(fade_coefs) - 1;
- lsx_debug("switch from stage %i to %i, x2 from %i to %i", p->fadeout.stage_num, p->current.stage_num, p->fadeout.is_d, p->current.is_d);
- }
- }
-
- if (p->fade_len) {
- float const * vol1 = fade_coefs + p->fade_len;
- float const * vol2 = fade_coefs + (iAL(fade_coefs) - 1 - p->fade_len);
- int olen2 = (olen = min(olen, p->fade_len >> 1)) << 1;
-
- /* x2 is more fine-grained so may fail to produce a pair of samples
- * where x1 would not (the x1 second sample is a zero so is always
- * available). So do x2 first, then feed odone to the second one. */
- memset(buf, 0, sizeof(*buf) * (size_t)olen2);
- if (p->current.is_d && p->fadeout.is_d) {
- odone = poly_fir_fade_d(&p->current, vol1,-1, buf, olen2);
- odone2 = poly_fir_fade_d(&p->fadeout, vol2, 1, buf, odone);
- } else if (p->current.is_d) {
- odone = poly_fir_fade_d(&p->current, vol1,-1, buf, olen2);
- odone2 = poly_fir_fade_u(&p->fadeout, vol2, 2, buf, odone);
- } else {
- assert(p->fadeout.is_d);
- odone = poly_fir_fade_d(&p->fadeout, vol2, 1, buf, olen2);
- odone2 = poly_fir_fade_u(&p->current, vol1,-2, buf, odone);
- }
- assert(odone == odone2);
- (void)odone2;
- p->fade_len -= odone;
- if (!p->fade_len) {
- if (p->stage_inc)
- p->switch_stage_num = min_stage_num++;
- else
- --max_stage_num;
- }
- half_iir(&p->halfer, &output[odone0], buf, odone >>= 1);
- }
- else if (p->current.is_d) {
- odone = poly_fir_d(&p->current, buf, olen << 1) >> 1;
- half_iir(&p->halfer, &output[odone0], buf, odone);
- }
- else {
- odone = poly_fir_u(&p->current, &output[odone0], olen);
- if (p->num_stages0)
- half_phase(&p->halfer, &output[odone0], odone);
- }
- odone0 += odone;
- if (p->slew_len)
- p->slew_len -= odone;
- if (odone != olen)
- break; /* Need more input. */
- } {
- int from = max(0, max_stage_num), to = min(0, min_stage_num);
- int i, idone = shiftr(INT(p->current.at), from - p->current.stage_num);
- INT(p->current.at) -= shiftl(idone, from - p->current.stage_num);
- if (p->fade_len)
- INT(p->fadeout.at) -= shiftl(idone, from - p->fadeout.stage_num);
- for (i = from; i >= to; --i, idone <<= 1)
- fifo_read(&p->stages[i].fifo, idone, NULL);
- }
- fifo_trim_by(&p->output_fifo, olen0 - odone0);
- return odone0;
- }
-}
-
-static float * vr_input(rate_t * p, float const * input, size_t n)
-{
- return fifo_write(&p->stages[0].fifo, (int)n, input);
-}
-
-static float const * vr_output(rate_t * p, float * output, size_t * n)
-{
- fifo_t * fifo = &p->output_fifo;
- if (1 || !p->num_stages0)
- return fifo_read(fifo, (int)(*n = min(*n, (size_t)fifo_occupancy(fifo))), output);
- else { /* Ignore this complication for now. */
- int const IIR_DELAY = 2;
- float * ptr = fifo_read_ptr(fifo);
- int olen = min((int)*n, max(0, fifo_occupancy(fifo) - IIR_DELAY));
- *n = (size_t)olen;
- if (output)
- memcpy(output, ptr + IIR_DELAY, *n * sizeof(*output));
- fifo_read(fifo, olen, NULL);
- return ptr + IIR_DELAY;
- }
-}
-
-static void vr_flush(rate_t * p)
-{
- if (!p->flushing) {
- stage_preload(&p->stages[0]);
- ++p->flushing;
- }
-}
-
-static void vr_close(rate_t * p)
-{
- int i;
-
- fifo_delete(&p->output_fifo);
- for (i = -1; i < p->num_stages; ++i) {
- stage_t * s = &p->stages[i];
- fifo_delete(&s->fifo);
- }
- free(p->stages - 1);
-}
-
-static double vr_delay(rate_t * p)
-{
- return 100; /* TODO */
- (void)p;
-}
-
-static void vr_sizes(size_t * shared, size_t * channel)
-{
- *shared = 0;
- *channel = sizeof(rate_t);
-}
-
-static char const * vr_create(void * channel, void * shared,double max_io_ratio,
- void * q_spec, void * r_spec, double scale)
-{
- double x = max_io_ratio;
- int n;
- for (n = 0; x > 1; x *= .5, ++n);
- vr_init(channel, max_io_ratio, n, scale);
- return 0;
- (void)shared, (void)q_spec, (void)r_spec;
-}
-
-static char const * vr_id(void)
-{
- return "single-precision variable-rate";
-}
-
-typedef void (* fn_t)(void);
-fn_t _soxr_vr32_cb[] = {
- (fn_t)vr_input,
- (fn_t)vr_process,
- (fn_t)vr_output,
- (fn_t)vr_flush,
- (fn_t)vr_close,
- (fn_t)vr_delay,
- (fn_t)vr_sizes,
- (fn_t)vr_create,
- (fn_t)vr_set_io_ratio,
- (fn_t)vr_id,
-};
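
Both the deleted vr32s.c and the surviving vr32.c track each stream's read position as 32.32 fixed point: the top 32 bits index the input FIFO, the low 32 bits are the fractional phase that selects the polyphase-filter coefficients (the INT()/FRAC() macros above). A minimal standalone demo of that accumulator, assuming the little-endian layout from the union's #else branch:

    #include <stdint.h>
    #include <stdio.h>

    /* 32.32 fixed-point position, as in stream_t's at/step/step_step fields. */
    typedef union {
      int64_t all;
      struct { uint32_t frac; int32_t integer; } part; /* little-endian */
    } fixed64;

    int main(void)
    {
      double io_ratio = 0.9375;        /* input samples consumed per output */
      fixed64 at, step;
      int i;
      at.all = 0;
      step.all = (int64_t)(io_ratio * 65536. * 65536. + .5);   /* set_step() */
      for (i = 0; i < 4; ++i) {
        printf("out %d: input index %d, phase %.6f\n",
            i, at.part.integer, at.part.frac / (65536. * 65536.));
        at.all += step.all;            /* ADD_TO(s->at, s->step) */
      }
      return 0;
    }

A single 64-bit add advances both the integer index and the interpolation phase, with the carry out of the fractional half landing in the integer half for free; this is also why slewing the ratio costs just one more add (step_step) per output sample.
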
diff --git a/soxr/tests/CMakeLists.txt b/soxr/tests/CMakeLists.txt
index fc350de..ee8dd0b 100644
--- a/soxr/tests/CMakeLists.txt
+++ b/soxr/tests/CMakeLists.txt
@@ -1,8 +1,8 @@
# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
-add_definitions (${PROJECT_C_FLAGS})
-link_libraries (soxr)
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}")
+link_libraries (${PROJECT_NAME} ${LIBM_LIBRARIES})
file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.c)
foreach (fe ${SOURCES})
@@ -10,7 +10,10 @@ foreach (fe ${SOURCES})
add_executable (${f} ${fe})
endforeach ()
-enable_testing ()
+# Can't use c89 for this file:
+if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
+ set_property (SOURCE throughput APPEND_STRING PROPERTY COMPILE_FLAGS "-std=gnu89")
+endif ()
set (sweep_to_freq 22050)
set (leader 1)
@@ -20,33 +23,40 @@ math (EXPR base_rate "${sweep_to_freq} + ${sweep_to_freq}")
macro (add_vector r)
set (output ${CMAKE_CURRENT_BINARY_DIR}/ref-${r}.s32)
add_custom_command (OUTPUT ${output} DEPENDS vector-gen ${CMAKE_CURRENT_LIST_FILE}
- COMMAND vector-gen ${r} ${leader} ${len} ${sweep_to_freq} 1 ${output})
+ COMMAND vector-gen ${r} ${leader} ${len} 0 ${sweep_to_freq} 1 ${output})
set (vectors ${output} ${vectors})
endmacro ()
-macro (add_cmp_test from to bits)
- set (name ${bits}-bit-perfect-${from}-${to})
- add_test (NAME ${name} COMMAND ${CMAKE_COMMAND} -Dbits=${bits} -DBIN=${BIN} -DEXAMPLES_BIN=${EXAMPLES_BIN} -Dleader=${leader} -Dto=${to}
- -Dfrom=${from} -Dlen=${len} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmp-test.cmake)
- add_vector (${from})
- add_vector (${to})
+macro (add_cmp_test irate orate bits)
+ set (name ${bits}-bit-perfect-${irate}-${orate})
+ add_test (NAME ${name} COMMAND ${CMAKE_COMMAND} -Dbits=${bits} -DBIN=${BIN}
+ -DEXAMPLES_BIN=${EXAMPLES_BIN} -DlenToSkip=${leader} -Dorate=${orate}
+ -Dirate=${irate} -Dlen=${len} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmp-test.cmake)
+ add_vector (${irate})
+ add_vector (${orate})
endmacro ()
unset (test_bits)
-if (WITH_SINGLE_PRECISION)
+if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S)
set (test_bits 20)
endif ()
-if (WITH_DOUBLE_PRECISION)
- set (test_bits ${test_bits} 24)
+if (WITH_CR64 OR WITH_CR64S)
+ set (test_bits ${test_bits} 28)
endif ()
+set (rates 192000)
+if (WITH_HI_PREC_CLOCK)
+ set (rates ${rates} 65537)
+endif ()
foreach (b ${test_bits})
- foreach (r 96000 65537)
+ foreach (r ${rates})
add_cmp_test (${base_rate} ${r} ${b})
add_cmp_test (${r} ${base_rate} ${b})
endforeach ()
endforeach ()
-add_custom_target (test-vectors ALL DEPENDS ${vectors})
+if (NOT CMAKE_CROSSCOMPILING)
+ add_custom_target (test-vectors ALL DEPENDS ${vectors})
+endif ()
add_test (1-delay-clear ${BIN}1-delay-clear)
diff --git a/soxr/tests/bandwidth-test b/soxr/tests/bandwidth-test
index 47c2303..4efdcc9 100755
--- a/soxr/tests/bandwidth-test
+++ b/soxr/tests/bandwidth-test
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
@@ -9,8 +9,9 @@ set -e
tool=./3-options-input-fn
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
-spec="spectrogram -z120 -Z-20 -wd -ho"
+spec="spectrogram -z120 -Z-20 -w$w -ho"
ext=f32; e=0
rate1=48000
rate2=44100
@@ -23,12 +24,12 @@ rate1n=`expr $rate1 / 2`
sox -r $rate1 -n 0.$ext synth 8 sin 0:$rate1n gain -1
for pass in `seq 79 5 99`; do
- f=bw1-$rate2-p`printf %02u $pass`
+ f=bw1-$rate2-p`printf %02u $pass`-$w
$tool $rate1 $rate2 1 $e $e 4 0 $pass < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "bw-test pass:$pass stop:100"
done
for pass in `seq 79 5 99`; do
- f=bw2-$rate2-p`printf %02u $pass`
+ f=bw2-$rate2-p`printf %02u $pass`-$w
stop=`expr 200 - $pass`
$tool $rate1 $rate2 1 $e $e 4 0 $pass $stop < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "bw-test pass:$pass stop:$stop"
done
diff --git a/soxr/tests/cmp-test.cmake b/soxr/tests/cmp-test.cmake
index 8db76c5..a836322 100644
--- a/soxr/tests/cmp-test.cmake
+++ b/soxr/tests/cmp-test.cmake
@@ -1,17 +1,13 @@
# SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
-if (${bits} STREQUAL 24)
- set (quality 45)
-else ()
- set (quality 44)
-endif ()
+math (EXPR quality "43 + (${bits} - 13) / 4")
+set (ofile ${irate}-${orate}-${quality}.s32)
+#message (STATUS "Output file = [${ofile}]")
-set (output ${from}-${to}-${quality}.s32)
-
-execute_process(COMMAND ${EXAMPLES_BIN}3-options-input-fn ${from} ${to} 1 2 2 ${quality} a
- INPUT_FILE ref-${from}.s32
- OUTPUT_FILE ${output}
+execute_process(COMMAND ${EXAMPLES_BIN}3-options-input-fn ${irate} ${orate} 1 2 2 ${quality} a
+ INPUT_FILE ref-${irate}.s32
+ OUTPUT_FILE ${ofile}
ERROR_VARIABLE test_error
RESULT_VARIABLE test_result)
@@ -19,7 +15,11 @@ if (test_result)
message (FATAL_ERROR "Resampling failure: ${test_error}")
endif ()
-execute_process(COMMAND ${BIN}vector-cmp ref-${to}.s32 ${output} ${to} ${leader} ${len} ${bits} 98
+set (percentageToCheck 98)
+math (EXPR lenToCheck "${len} * ${percentageToCheck}")
+string (REGEX REPLACE "(..)$" ".\\1" lenToCheck "${lenToCheck}") # Divide by 100
+
+execute_process(COMMAND ${BIN}vector-cmp ref-${orate}.s32 ${ofile} ${orate} ${lenToSkip} ${lenToCheck} ${bits}
OUTPUT_VARIABLE test_output
RESULT_VARIABLE test_result)
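
Two details in the rewritten cmp-test.cmake deserve a note. CMake's math(EXPR) works on integers only, so to take 98% of ${len} the script multiplies first and then re-inserts a decimal point with the REGEX REPLACE (len=8 gives 784, rewritten to "7.84"). And the hard-coded 44/45 quality pair is replaced by a formula that, using integer division, maps the bit-depths the tests now use as

    \mathrm{quality}(b) = 43 + \left\lfloor\frac{b-13}{4}\right\rfloor, \qquad
    \mathrm{quality}(20) = 44, \qquad \mathrm{quality}(28) = 46.
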
diff --git a/soxr/tests/eg-test b/soxr/tests/eg-test
index 58d085c..ccf4ce3 100755
--- a/soxr/tests/eg-test
+++ b/soxr/tests/eg-test
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
@@ -9,6 +9,7 @@ set -e
len=8
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
#vg="valgrind --leak-check=full --show-reachable=yes"
@@ -42,6 +43,6 @@ signals=(sine-wave saw-tooth-wave)
for n in 0 1 2 3; do
signal=${signals[`expr $n % 2 || true`]}
variation=${variations[`expr $n / 2 || true`]}
- $vg ./5-variable-rate $n | sox -tf32 -r44100 -c1 - -n spectrogram -z130 -hwd -o v$n.png -X 50 -c "variation:$variation signal:$signal"
+ $vg ./5-variable-rate $n | sox -tf32 -r44100 -c1 - -n spectrogram -z130 -hw$w -o v$n-$w.png -X 50 -c "variation:$variation signal:$signal"
vg=""
done
diff --git a/soxr/tests/io-test b/soxr/tests/io-test
index a291c78..608bc9a 100755
--- a/soxr/tests/io-test
+++ b/soxr/tests/io-test
@@ -1,7 +1,7 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
-# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
# Tests IO
@@ -14,22 +14,28 @@ len=16
f=1/32768
g=32768:0
tool=./3-options-input-fn
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
types=(f32 f64 s32 s16)
zs=(180 180 180 180 180 120 120 120 120)
do_one() {
- $tool $ir $or $c $1 $2 $3 < $c.${types[$1]} |
- sox -t ${types[`expr $2 % 4`]} -r $or -c $c - -n spectrogram -X50 -hwk -z${zs[$n]} -o io$c$n.png -c "io-test i:${types[$1]} o:${types[`expr $2 % 4`]} ($2) q:$3"
+ it=${types[$1]}; ot=${types[`expr $2 % 4 || true`]}
+ $tool $ir $or $c $1 $2 $3 < $c.$it > a.$ot
+ sox -r $or -c $c a.$ot -n spectrogram -X50 -hw$w -z${zs[$n]} -o io$c$n-$w.png -c "io-test i:$it o:$ot ($2) q:$3"
+ ./4-split-channels $ir $or $c $1 $2 $3 < $c.$it > b.$ot
+ [ $2 != 3 ] && cmp a.$ot b.$ot ||
+ test $(sox -mv-1 -r$or -c$c a.$ot -r$or -c$c b.$ot -n stats 2>&1 |grep Pk\ l|tr ' ' '\n'|grep '[0-9]'|uniq) = -84.29
+ rm [ab].$ot
n=`expr $n + 1`
}
-j=3; test z$1 != z && j=$1
+test z$1 != z && j=$1 || j=1
for c in `seq 1 $j`; do
for n in `seq 0 3`; do
- sox -r $ir -n $c.${types[$n]} synth $len sin $f gain -.1
+ sox -R -r $ir -n $c.${types[$n]} synth $len sin $f gain -.1
done
n=0
diff --git a/soxr/tests/large-ratio-test b/soxr/tests/large-ratio-test
index 64f1789..540c5df 100755
--- a/soxr/tests/large-ratio-test
+++ b/soxr/tests/large-ratio-test
@@ -1,23 +1,22 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
-# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
# Licence for this file: LGPL v2.1 See LICENCE for details.
-# Tests interpolating then decimating be the same, large ratio.
+# Tests interpolating then decimating by the same, large ratio.
tool=../examples/3-options-input-fn
-q=6
-ratio=2e4
-srate=8000
-nrate=`expr $srate / 2`
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
+q=4
+test x$1 = x && ratio=1e5 || ratio=$1
+test x$2 = x && rate=8000 || rate=$2
-rm -f lr.png
+sox -r$rate -n 1.s32 synth 10 sin 0:`expr $rate / 2` vol .9375
+sync
-../tests/vector-gen $srate 0 8 $nrate .9375 1.s32
+time { $tool 1 $ratio 1 2 1 $q a < 1.s32 | $tool $ratio 1 1 1 2 $q a > 2.s32;}
-$tool 1 $ratio 1 2 1 $q < 1.s32 | $tool $ratio 1 1 1 2 $q > 2.s32
-
-sox -M -r $srate -c1 1.s32 -r $srate -c1 2.s32 -n spectrogram -hwd -Z-10 -z180 -o lr.png -c "large-ratio-test q:$q ratio:$ratio"
+sox -mv-1 -r$rate -c1 1.s32 -r$rate -c1 2.s32 -n spectrogram -hw$w -z150 -o lr-$w.png -c "large-ratio-test q:$q ratio:$ratio"
rm [12].s32
diff --git a/soxr/tests/phase-test b/soxr/tests/phase-test
index 4c491d8..3c34268 100755
--- a/soxr/tests/phase-test
+++ b/soxr/tests/phase-test
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
@@ -7,7 +7,8 @@ set -e
# Tests varying phase-response.
tool=./3-options-input-fn
-spec="spectrogram -z160 -Z-20 -X 2000 -wd -ho"
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
+spec="spectrogram -z160 -Z-20 -X 2000 -w$w -ho"
ext=f32; e=0
rate1=48000
rate2=44100
@@ -20,7 +21,7 @@ for n in 1 2; do
filters=(standard-filter steep-filter)
for q in `seq 0 7`; do
- f=ph-$rate2-q$q
+ f=ph-$rate2-q$q-$w
name=${names[`expr $q % 4 || true`]}
filter=${filters[`expr $q / 4 || true`]}
$tool $rate1 $rate2 1 $e $e $q'6' < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "ph-test $filter $name"
@@ -28,7 +29,7 @@ for n in 1 2; do
# Test specific phase-response percentages:
for q in `seq 0 20 100`; do
- f=ph-$rate2-p`printf %03u $q`
+ f=ph-$rate2-p`printf %03u $q`-$w
$tool $rate1 $rate2 1 $e $e 46 0 0 0 $q < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "ph-test phase:${q}%"
done
diff --git a/soxr/tests/q-test b/soxr/tests/q-test
index 7a0f0a2..f274cb5 100755
--- a/soxr/tests/q-test
+++ b/soxr/tests/q-test
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
@@ -9,6 +9,7 @@ set -e
tool=./3-options-input-fn
+w=$(echo -e "`sox --ver |sed 's/.*SoX v//'` d\n14.4.1 k"|sort -Vr|head -1|sed 's/.* //')
ext=f64; e=1
c=1
q1=0; q2=7
@@ -16,7 +17,7 @@ rates=48000
zs=(50 87 87 87 111 135 159 180 95)
zz() {
- echo "spectrogram -z${zs[$1]} -Z-30 -wd -ho"
+ echo "spectrogram -z${zs[$1]} -Z-30 -w$w -ho"
}
for rate0 in $rates; do
@@ -36,11 +37,11 @@ sox -r $rate1 -n -c $c 0.$ext synth 8 sin 0:$rate1n gain -1
for q in `seq $q1 $q2`; do
f=qa-$rate1-$rate2-$q
- $tool $rate1 $rate2 $c $e $e $q 0 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f.png -c $f
+ $tool $rate1 $rate2 $c $e $e $q 0 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f-$w.png -c $f
done
q=8
f=qa-$rate1-$rate2-v
-$tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f.png -c $f
+$tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f-$w.png -c $f
diff --git a/soxr/tests/scripts b/soxr/tests/scripts
index f245919..8b6023f 100755
--- a/soxr/tests/scripts
+++ b/soxr/tests/scripts
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
set -e
# SoX Resampler Library Copyright (c) 2007-15 robs@users.sourceforge.net
@@ -6,8 +6,9 @@ set -e
../../tests/bandwidth-test
../../tests/eg-test
-../../tests/io-test
+../../tests/io-test 3
../../tests/large-ratio-test
../../tests/phase-test
../../tests/q-test
-../../tests/time-test
+../../tests/time-test 1
+../../tests/time-test 2
diff --git a/soxr/tests/throughput-test b/soxr/tests/throughput-test
new file mode 100644
index 0000000..aef36f6
--- /dev/null
+++ b/soxr/tests/throughput-test
@@ -0,0 +1,11 @@
+#!/bin/sh
+set -e
+
+# SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1 See LICENCE for details.
+
+test -r throughput.exe && wine=wine
+
+test /$1 = / && list="`seq 0 3`" || list="$*"
+
+for n in $list; do $wine ./throughput 44.1 48 1 0 $n 4; done
diff --git a/soxr/tests/throughput-test.bat b/soxr/tests/throughput-test.bat
new file mode 100644
index 0000000..6644d8d
--- /dev/null
+++ b/soxr/tests/throughput-test.bat
@@ -0,0 +1,5 @@
+@echo off
+rem SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+rem Licence for this file: LGPL v2.1 See LICENCE for details.
+
+for /L %%i in (0,1,3) DO throughput 44.1 48 1 0 %%i
diff --git a/soxr/tests/throughput.c b/soxr/tests/throughput.c
new file mode 100644
index 0000000..c52b885
--- /dev/null
+++ b/soxr/tests/throughput.c
@@ -0,0 +1,141 @@
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1 See LICENCE for details. */
+
+#include <math.h>
+#include "rint.h"
+#include "../examples/examples-common.h"
+
+#define k 1000
+
+#if defined _WIN32
+ #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+ #define timerStart(msecs) LARGE_INTEGER start, stop, tmp; \
+ QueryPerformanceCounter(&start), QueryPerformanceFrequency(&tmp), \
+ stop.QuadPart = (msecs * tmp.QuadPart + k/2) / k
+ #define timerRunning() (QueryPerformanceCounter(&tmp), \
+ (tmp.QuadPart-start.QuadPart < stop.QuadPart))
+#else
+  #include <sys/time.h>
+ #if defined timeradd
+ #define K k
+ #define tv_frac tv_usec
+ #define timespec timeval
+ #define get_time(x) gettimeofday(x, NULL)
+ #else
+    #include <unistd.h>
+    #include <time.h>
+ #if defined _POSIX_TIMERS && _POSIX_TIMERS > 0
+ #define K (k*k)
+ #define tv_frac tv_nsec
+ #if defined _POSIX_MONOTONIC_CLOCK
+ #define get_time(x) clock_gettime(CLOCK_MONOTONIC, x)
+ #else
+ #define get_time(x) clock_gettime(CLOCK_REALTIME, x)
+ #endif
+ #else
+      #include <sys/timeb.h>
+ #define K 1
+ #define tv_frac millitm
+ #define tv_sec time
+ #define timespec timeb
+ #define get_time(x) ftime(x)
+ #endif
+ #endif
+
+ #define timerStart(msecs) struct timespec stop, tmp; get_time(&stop), \
+ stop.tv_frac += (msecs%k)*K, \
+ stop.tv_sec += msecs/k + stop.tv_frac/(K*k), \
+ stop.tv_frac %= K*k
+ #define timerRunning() (get_time(&tmp), \
+ (tmp.tv_sec < stop.tv_sec || tmp.tv_frac < stop.tv_frac))
+#endif
+
+int main(int n, char const * arg[])
+{
+ char const * const arg0 = n? --n, *arg++ : "", * engine = "";
+ double const irate = n? --n, atof(*arg++) : 96000.;
+ double const orate = n? --n, atof(*arg++) : 44100.;
+ unsigned const chans = n? --n, (unsigned)atoi(*arg++) : 1;
+ soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+ unsigned const ospec = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+ unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ;
+ unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0;
+ double const passband_end = n? --n, atof(*arg++) : 0;
+ double const stopband_begin = n? --n, atof(*arg++) : 0;
+ double const phase_response = n? --n, atof(*arg++) : -1;
+ int const use_threads = n? --n, atoi(*arg++) : 1;
+ soxr_datatype_t const otype = ospec & 3;
+
+ soxr_quality_spec_t q_spec = soxr_quality_spec(q_recipe, q_flags);
+ soxr_io_spec_t io_spec = soxr_io_spec(itype, otype);
+ soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads);
+
+ /* Allocate resampling input and output buffers in proportion to the input
+ * and output rates: */
+ #define buf_total_len 15000 /* In samples per channel. */
+ size_t const osize = soxr_datatype_size(otype) * chans;
+ size_t const isize = soxr_datatype_size(itype) * chans;
+ size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5);
+ size_t const olen = min(max(olen0, 1), buf_total_len - 1);
+ size_t const ilen = buf_total_len - olen;
+ void * const obuf = malloc(osize * olen);
+ void * const ibuf = malloc(isize * ilen);
+
+ size_t odone = 0, clips = 0, omax = 0, i;
+ soxr_error_t error;
+ soxr_t soxr;
+ int32_t seed = 0;
+ char const * e = getenv("SOXR_THROUGHPUT_GAIN");
+ double gain = e? atof(e) : .5;
+
+ /* Overrides (if given): */
+ if (passband_end > 0) q_spec.passband_end = passband_end / 100;
+ if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100;
+ if (phase_response >=0) q_spec.phase_response = phase_response;
+ io_spec.flags = ospec & ~7u;
+
+ /* Create a stream resampler: */
+ soxr = soxr_create(
+ irate, orate, chans, /* Input rate, output rate, # of channels. */
+ &error, /* To report any error during creation. */
+ &io_spec, &q_spec, &runtime_spec);
+
+#define ranqd1(x) ((x) = 1664525 * (x) + 1013904223) /* int32_t x */
+#define dranqd1(x) (ranqd1(x) * (1. / (65536. * 32768.))) /* [-1,1) */
+#define RAND (dranqd1(seed) * gain)
+#define DURATION_MSECS 125
+#define NUM_ATTEMPTS 8
+
+ if (!error) { /* If all is well, run the resampler: */
+ engine = soxr_engine(soxr);
+ switch (itype & 3) {
+      case 0: for (i=0;i<ilen*chans; ((float   *)ibuf)[i]=(float)RAND, ++i); break;
diff --git a/soxr/tests/time-test b/soxr/tests/time-test
--- a/soxr/tests/time-test
+++ b/soxr/tests/time-test
-  echo $rate1 '-->' $rate2 c=$c q=$q
- time $tool $rate1 $rate2 $c $e $e $q < 0.$ext > /dev/null;
+ sox -R -r $rate1 -n -c $c 0.$ext synth $len noise; sync
+ for q in $qs; do
+ test $q = v && Q="4 20" || Q=$q
+ $time -f %e -o $TIME $tool $rate1 $rate2 $c $e $e $Q < 0.$ext > /dev/null 2> $ERR
+ echo $rate1 '-->' $rate2 c=$c q=$q t=`cat $TIME` `cat $ERR | sed 's/.*(/(/'`
done
-
- echo $rate1 '-->' $rate2 c=$c q=v
- time $tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext > /dev/null
-
- rate1=44100
- rate2=$rate0
+ rate1=$rate0
+ rate2=44100
done
done
diff --git a/soxr/tests/vector-cmp.c b/soxr/tests/vector-cmp.c
index 6edd2d5..f90cc7f 100644
--- a/soxr/tests/vector-cmp.c
+++ b/soxr/tests/vector-cmp.c
@@ -1,53 +1,56 @@
-/* SoX Resampler Library Copyright (c) 2007-13 robs@users.sourceforge.net
+/* SoX Resampler Library Copyright (c) 2007-16 robs@users.sourceforge.net
* Licence for this file: LGPL v2.1 See LICENCE for details. */
/* Utility used to help test the library; not for general consumption.
*
- * Compare two swept-sine files. */
+ * Measure the peak bit difference between two files. */
 #include <stdio.h>
 #include <stdlib.h>
-#include <math.h>
#include "../src/rint.h"
+#include "../examples/examples-common.h"
-int main(int bit, char const * arg[])
+#define TYPE 0 /* As vector-gen */
+
+#if TYPE
+ #define sample_t double
+ #define N 50
+ #define DIFF(s1,s2) abs(rint32((s1-s2)*ldexp(1,N-1)))
+#else
+ #define sample_t int32_t
+ #define N 32
+ #define DIFF(s1,s2) abs((int)(s1-s2))
+#endif
+
+int main(int argc, char const * arg[])
{
- FILE * f1 = fopen(arg[1], "rb"),
- * f2 = fopen(arg[2], "rb");
- double rate = atof (arg[3]), /* Rate for this vector */
- leader_len = atof (arg[4]), /* Leader length in seconds */
- len = atof (arg[5]), /* Sweep length (excl. leader_len) */
- expect_bits= atof (arg[6]),
- expect_bw = atof (arg[7]);
+ int two = !!arg[2][0];
+ FILE * f1 = fopen(arg[1], "rb"), * f2 = two? fopen(arg[2], "rb") : 0;
+ double rate = atof (arg[3]), /* Sample-rate */
+ skip_len = atof (arg[4]), /* Skip length in seconds */
+ len = atof (arg[5]), /* Compare length in seconds */ r;
+ int i = 0, count = rint32(rate * len), max = 0, diff;
+ sample_t s1, s2;
- int32_t s1, s2;
- long count = 0;
- static long thresh[32];
- double bw, prev = 0;
-
- for (; fread(&s1, sizeof(s1), 1, f1) == 1 &&
- fread(&s2, sizeof(s2), 1, f2) == 1; ++count) {
- long diff = abs((int)(s1 - s2));
- for (bit = 0; diff && bit < 32; bit++, diff >>= 1)
- if ((diff & 1) && !thresh[bit])
- thresh[bit] = count + 1;
- }
-
- if (count != (long)((leader_len + len) * rate + .5)) {
- printf("incorrect file length\n");
- exit(1);
- }
-
- for (bit = 0; bit < 32; ++bit) {
- bw = ((double)thresh[bit] - 1) / rate - leader_len;
- if (bit && bw >= 0 && (bw - prev) * 100 / len < .08) {
- --bit;
- break;
+ fseek(f1, rint32(rate * skip_len) * (int)sizeof(s1), SEEK_CUR);
+ if (two) {
+ fseek(f2, rint32(rate * skip_len) * (int)sizeof(s2), SEEK_CUR);
+ for (; i < count &&
+ fread(&s1, sizeof(s1), 1, f1) &&
+ fread(&s2, sizeof(s2), 1, f2); ++i) {
+ diff = DIFF(s1, s2);
+ max = max(max, diff);
}
- prev = bw;
}
- bit = 32 - bit;
- bw = bw * 100 / len;
- printf("Bit perfect to %i bits, from DC to %.2f%% nyquist.\n", bit, bw);
- return !(bit >= expect_bits && bw >= expect_bw);
+ else for (; i < count && fread(&s1, sizeof(s1), 1, f1); ++i) {
+ diff = DIFF(s1, 0);
+ max = max(max, diff);
+ }
+
+ if (i != count) {
+ fprintf(stderr, "incorrect file length\n");
+ return 1;
+ }
+ printf("%f\n", r = N-log(max)/log(2));
+  return argc>6? r < atof(arg[6]) : 0;
+}
diff --git a/soxr/tests/vector-gen.c b/soxr/tests/vector-gen.c
--- a/soxr/tests/vector-gen.c
+++ b/soxr/tests/vector-gen.c
-#if QUAD
+#if TYPE > 1
 #include <quadmath.h>
#endif
-#include "../examples/examples-common.h"
+#include "math-wrap.h"
+#include <stdlib.h>
+#include <stdio.h>
-#if QUAD
- #define modf modfq
- #define cos cosq
- #define sin sinq
- #undef M_PI
- #define M_PI M_PIq
- #define real __float128
- #define atof(x) strtoflt128(x, 0)
+#if TYPE
+ #if TYPE > 1
+ #define modf modfq
+ #define cos cosq
+ #define sin sinq
+ #define PI M_PIq
+ #define real __float128
+ #define atof(x) strtoflt128(x, 0)
+ #else
+ #define modf modfl
+ #define cos cosl
+ #define sin sinl
+ #define PI M_PIl
+ #define real long double
+ #endif
+ #define MULT 1
+ #define OUT(d) double output = d
#else
+ #define PI M_PI
#define real double
#include "rint.h"
+ #define MULT (32768. * 65536 - 1/scale)
+ #define OUT(d) int32_t output = rint32(d)
#endif
-int main(int i, char const * argv[])
+int main(int argc, char const * argv[])
{
- real rate = atof(argv[1]), /* Rate for this vector */
- lead_in_len = atof(argv[2]), /* Lead-in length in seconds */
- len = atof(argv[3]), /* Sweep length (excl. lead_in_len) */
- sweep_to_freq = atof(argv[4]), /* Sweep from DC to this freq. */
- multiplier = atof(argv[5]), /* For headroom */
- f1 = -sweep_to_freq / len * lead_in_len, f2 = sweep_to_freq,
- n1 = rate * -lead_in_len, n2 = rate * len,
- m = (f2 - f1) / (n2 - n1) / 2, dummy;
- FILE * file = fopen(argv[6], "wb");
- i = (int)n1;
- if (!file || i != n1)
- exit(1);
- for (; i < (int)(n2 + .5); ++i) {
- double d1 = multiplier * sin(2 * M_PI * modf(i * m * i / rate, &dummy));
- double d = i < 0? d1 * (1 - cos(M_PI * (i + n1) / n1)) * .5 : d1;
-#if QUAD
- size_t actual = fwrite(&d, sizeof(d), 1, file);
-#else
- int32_t out = rint32(d * (32768. * 65536 - 1));
- size_t actual = fwrite(&out, sizeof(out), 1, file);
-#endif
- if (actual != 1)
- return 1;
+ real rate = atof(argv[1]), /* Rate for this vector */
+ lead_in_len = atof(argv[2]), /* Lead-in length in seconds */
+ len = atof(argv[3]), /* Sweep length (excl. lead_in_len) */
+ f1 = atof(argv[4]),
+ f2 = atof(argv[5]),
+ scale = atof(argv[6]), /* For headroom */
+ n1 = rate * -lead_in_len,
+ m = (f2 - f1) / (rate * len * 2), dummy;
+ FILE * file = fopen(argv[7], "wb");
+ int i = (int)n1, err = !file || i != n1;
+ for (; !err && i < (int)(rate*(len+lead_in_len)+.5); ++i) {
+ real d = sin(2 * PI * modf((f1 + i * m) * i / rate, &dummy));
+ OUT((double)(scale * MULT * d));
+ err = fwrite(&output, sizeof(output), 1, file) != 1;
}
- return 0;
+ return err |!argc;
}
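
The rewritten generator sweeps between two arbitrary frequencies f1 and f2 instead of always starting from DC. To see that m = (f2 - f1) / (rate * len * 2) yields a linear sweep, write R for the rate, L for the sweep length, and phi(i) = (f1 + m i) i / R for the phase in cycles at sample i; the instantaneous frequency is then

    f(i) = R\,\frac{d\phi}{di} = f_1 + 2mi, \qquad m = \frac{f_2 - f_1}{2RL},

so f(0) = f1 and f(RL) = f2 at the end of the L-second sweep. The modf call merely folds the accumulated phase into [0,1) before it is scaled by 2*PI and handed to sin, avoiding precision loss for large arguments.
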