Rev up FFTS to latest upstream version

Taken from https://github.com/linkotec/ffts. Fixes ppc64el support and a handful of other bugs.
3 months ago · 2ef6dba872
parent c40a208abb
commit 2ef6dba872
24 changed files with 3620 additions and 707 deletions
--- a/lib/ffts/CMakeLists.txt
+++ b/lib/ffts/CMakeLists.txt
@@ -7,7 +7,7 @@ set(FFTS_MAJOR 0)
 set(FFTS_MINOR 9)
 set(FFTS_MICRO 0)

-set(FFTS_VERSION "ffts-${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}")
+set(FFTS_VERSION "${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}")

 set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
 set_property(GLOBAL PROPERTY USE_FOLDERS ON)
@@ -22,6 +22,16 @@ set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/include/ffts)
 set(LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib)

 # common options
+
+# !!!! FOR TESTING ONLY !!!!
+option(ENABLE_AVX
+  "Enables AVX instructions." OFF
+)
+# !!!! FOR TESTING ONLY !!!!
+option(ENABLE_DOUBLE
+  "Enables double precision" OFF
+)
+
 option(ENABLE_NEON
  "Enables the use of NEON instructions." OFF
 )
@@ -48,24 +58,36 @@ option(ENABLE_STATIC

 include(CheckCSourceCompiles)
 include(CheckCSourceRuns)
+include(CheckFunctionExists)
 include(CheckIncludeFile)
+include(CheckSymbolExists)

 # Ensure defined when building FFTS (as opposed to using it from
 # another project). Used to export functions from Windows DLL.
 add_definitions(-DFFTS_BUILD)

 # check existence of various headers
+check_include_file(inttypes.h  HAVE_INTTYPES_H)
 check_include_file(malloc.h    HAVE_MALLOC_H)
+check_include_file(mm_malloc.h HAVE_MM_MALLOC_H)
 check_include_file(stdint.h    HAVE_STDINT_H)
 check_include_file(stdlib.h    HAVE_STDLIB_H)
 check_include_file(string.h    HAVE_STRING_H)
 check_include_file(sys/mman.h  HAVE_SYS_MMAN_H)
 check_include_file(unistd.h    HAVE_UNISTD_H)

+if(HAVE_INTTYPES_H)
+  add_definitions(-DHAVE_INTTYPES_H)
+endif(HAVE_INTTYPES_H)
+
 if(HAVE_MALLOC_H)
  add_definitions(-DHAVE_MALLOC_H)
 endif(HAVE_MALLOC_H)

+if(HAVE_MM_MALLOC_H)
+  add_definitions(-DHAVE_MM_MALLOC_H)
+endif(HAVE_MM_MALLOC_H)
+
 if(HAVE_STDINT_H)
  add_definitions(-DHAVE_STDINT_H)
 endif(HAVE_STDINT_H)
@@ -86,6 +108,50 @@ if(HAVE_UNISTD_H)
  add_definitions(-DHAVE_UNISTD_H)
 endif(HAVE_UNISTD_H)

+# check existence of various declarations
+check_symbol_exists(memalign       malloc.h HAVE_DECL_MEMALIGN)
+check_symbol_exists(posix_memalign stdlib.h HAVE_DECL_POSIX_MEMALIGN)
+check_symbol_exists(valloc         stdlib.h HAVE_DECL_VALLOC)
+check_symbol_exists(_mm_malloc     malloc.h HAVE_DECL__MM_MALLOC)
+
+if(HAVE_DECL_MEMALIGN)
+  add_definitions(-DHAVE_DECL_MEMALIGN)
+endif(HAVE_DECL_MEMALIGN)
+
+if(HAVE_DECL_POSIX_MEMALIGN)
+  add_definitions(-DHAVE_DECL_POSIX_MEMALIGN)
+endif(HAVE_DECL_POSIX_MEMALIGN)
+
+if(HAVE_DECL_VALLOC)
+  add_definitions(-DHAVE_DECL_VALLOC)
+endif(HAVE_DECL_VALLOC)
+
+if(HAVE_DECL__MM_MALLOC)
+  add_definitions(-DHAVE_DECL__MM_MALLOC)
+endif(HAVE_DECL__MM_MALLOC)
+
+# check existence of various functions
+check_function_exists(memalign       HAVE_MEMALIGN)
+check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN)
+check_function_exists(valloc         HAVE_VALLOC)
+check_function_exists(_mm_malloc     HAVE__MM_MALLOC)
+
+if(HAVE_MEMALIGN)
+  add_definitions(-DHAVE_MEMALIGN)
+endif(HAVE_MEMALIGN)
+
+if(HAVE_POSIX_MEMALIGN)
+  add_definitions(-DHAVE_POSIX_MEMALIGN)
+endif(HAVE_POSIX_MEMALIGN)
+
+if(HAVE_VALLOC)
+  add_definitions(-DHAVE_VALLOC)
+endif(HAVE_VALLOC)
+
+if(HAVE__MM_MALLOC)
+  add_definitions(-DHAVE__MM_MALLOC)
+endif(HAVE__MM_MALLOC)
+
 # backup flags
 set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})

@@ -246,6 +312,14 @@ if(NOT CMAKE_CROSSCOMPILING)
    if(HAVE_XMMINTRIN_H)
      add_definitions(-DHAVE_SSE)
      set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+
+      # TODO: not the right place
+      if(ENABLE_AVX)
+        add_definitions(-DHAVE_AVX)
+      endif(ENABLE_AVX)
+      if(ENABLE_DOUBLE)
+        add_definitions(-DFFTS_DOUBLE)
+      endif(ENABLE_DOUBLE)
    endif(HAVE_XMMINTRIN_H)

    # enable SSE2 code generation
@@ -351,6 +425,10 @@ set(FFTS_HEADERS
 set(FFTS_SOURCES
  src/ffts_attributes.h
  src/ffts.c
+  src/ffts_chirp_z.c
+  src/ffts_chirp_z.h
+  src/ffts_cpu.c
+  src/ffts_cpu.h
  src/ffts_internal.h
  src/ffts_nd.c
  src/ffts_nd.h
@@ -369,6 +447,17 @@ set(FFTS_SOURCES
  src/types.h
 )

+if(NOT DISABLE_DYNAMIC_CODE)
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+    list(APPEND FFTS_SOURCES
+      src/codegen_sse.h
+    )
+  else()
+    message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.")
+    set(DISABLE_DYNAMIC_CODE ON)
+  endif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
+endif(NOT DISABLE_DYNAMIC_CODE)
+
 if(ENABLE_NEON)
  list(APPEND FFTS_SOURCES
    src/neon.s
@@ -393,19 +482,9 @@ elseif(HAVE_XMMINTRIN_H)
  add_definitions(-DHAVE_SSE)

  list(APPEND FFTS_SOURCES
+    src/macros-avx.h
    src/macros-sse.h
  )
-
-  if(NOT DISABLE_DYNAMIC_CODE)
-    if(CMAKE_SIZEOF_VOID_P EQUAL 8)
-      list(APPEND FFTS_SOURCES
-        src/codegen_sse.h
-      )
-    else()
-      message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.")
-      set(DISABLE_DYNAMIC_CODE ON)
-    endif(CMAKE_SIZEOF_VOID_P EQUAL 8)
-  endif(NOT DISABLE_DYNAMIC_CODE)
 endif(ENABLE_NEON)

 if(DISABLE_DYNAMIC_CODE)
@@ -452,6 +531,41 @@ if(ENABLE_STATIC)
 endif(ENABLE_STATIC)

 if(ENABLE_STATIC OR ENABLE_SHARED)
+  find_path(MPFR_INCLUDES
+    NAMES mpfr.h
+    PATHS ${INCLUDE_INSTALL_DIR}
+  )
+  find_library(MPFR_LIBRARIES mpfr PATHS ${LIB_INSTALL_DIR})
+  find_package(OpenMP)
+
+  if(MPFR_INCLUDES)
+    add_definitions(-DHAVE_MPFR_H)
+    include_directories(${MPFR_INCLUDES})
+  endif(MPFR_INCLUDES)
+
+  add_executable(ffts_trig_test
+    tests/trig_test.c
+  )
+
+  target_link_libraries(ffts_trig_test ffts)
+  if(MPFR_LIBRARIES)
+    target_link_libraries(ffts_trig_test ${MPFR_LIBRARIES})
+  endif(MPFR_LIBRARIES)
+
+  if(OPENMP_FOUND)
+    if(MSVC)
+      set_target_properties(ffts_trig_test PROPERTIES
+        COMPILE_FLAGS "${OpenMP_C_FLAGS}"
+        LINK_FLAGS "${OpenMP_EXE_LINKER_FLAGS}"
+      )
+    else()
+      set_target_properties(ffts_trig_test PROPERTIES
+        COMPILE_FLAGS "${OpenMP_C_FLAGS}"
+        LINK_FLAGS "${OpenMP_C_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}"
+      )
+    endif(MSVC)
+  endif(OPENMP_FOUND)
+
  add_executable(ffts_test
    tests/test.c
  )
@@ -467,6 +581,14 @@ if(ENABLE_STATIC OR ENABLE_SHARED)
    ffts
    ${FFTS_EXTRA_LIBRARIES}
  )
+
+  add_executable(ffts_cpu_test
+    src/ffts_cpu.c
+    src/ffts_cpu.h
+    tests/cpu_test.c
+  )
+
+  set_target_properties(ffts_cpu_test PROPERTIES COMPILE_DEFINITIONS FFTS_BUILDING_CPU_TEST)
 endif(ENABLE_STATIC OR ENABLE_SHARED)

 # generate packageconfig file
--- a/lib/ffts/config.guess
+++ b/lib/ffts/config.guess
@@ -1,12 +1,14 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
-#   Copyright 1992-2016 Free Software Foundation, Inc.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+#   2011, 2012 Free Software Foundation, Inc.

-timestamp='2016-04-02'
+timestamp='2012-08-14'

 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
+# the Free Software Foundation; either version 2 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful, but
@@ -20,17 +22,19 @@ timestamp='2016-04-02'
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
 # configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that
-# program.  This Exception is an additional permission under section 7
-# of the GNU General Public License, version 3 ("GPLv3").
+# the same distribution terms that you use for the rest of that program.
+
+
+# Originally written by Per Bothner.  Please send patches (context
+# diff format) to <config-patches@gnu.org> and include a ChangeLog
+# entry.
 #
-# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
+# This script attempts to guess a canonical system name similar to
+# config.sub.  If it succeeds, it prints the system name on stdout, and
+# exits with 0.  Otherwise, it exits with 1.
 #
 # You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
-#
-# Please send patches to <config-patches@gnu.org>.
-
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD

 me=`echo "$0" | sed -e 's,.*/,,'`

@@ -50,7 +54,9 @@ version="\
 GNU config.guess ($timestamp)

 Originally written by Per Bothner.
-Copyright 1992-2016 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.

 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -132,27 +138,6 @@ UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
 UNAME_SYSTEM=`(uname -s) 2>/dev/null`  || UNAME_SYSTEM=unknown
 UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown

-case "${UNAME_SYSTEM}" in
-Linux|GNU|GNU/*)
-	# If the system lacks a compiler, then just pick glibc.
-	# We could probably try harder.
-	LIBC=gnu
-
-	eval $set_cc_for_build
-	cat <<-EOF > $dummy.c
-	#include <features.h>
-	#if defined(__UCLIBC__)
-	LIBC=uclibc
-	#elif defined(__dietlibc__)
-	LIBC=dietlibc
-	#else
-	LIBC=gnu
-	#endif
-	EOF
-	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
-	;;
-esac
-
 # Note: order is significant - the case branches are not exclusive.

 case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
@@ -168,27 +153,20 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	# Note: NetBSD doesn't particularly care about the vendor
 	# portion of the name.  We always set it to "unknown".
 	sysctl="sysctl -n hw.machine_arch"
-	UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \
-	    /sbin/$sysctl 2>/dev/null || \
-	    /usr/sbin/$sysctl 2>/dev/null || \
-	    echo unknown)`
+	UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
+	    /usr/sbin/$sysctl 2>/dev/null || echo unknown)`
 	case "${UNAME_MACHINE_ARCH}" in
 	    armeb) machine=armeb-unknown ;;
 	    arm*) machine=arm-unknown ;;
 	    sh3el) machine=shl-unknown ;;
 	    sh3eb) machine=sh-unknown ;;
 	    sh5el) machine=sh5le-unknown ;;
-	    earmv*)
-		arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
-		endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'`
-		machine=${arch}${endian}-unknown
-		;;
 	    *) machine=${UNAME_MACHINE_ARCH}-unknown ;;
 	esac
 	# The Operating System including object format, if it has switched
 	# to ELF recently, or will in the future.
 	case "${UNAME_MACHINE_ARCH}" in
-	    arm*|earm*|i386|m68k|ns32k|sh3*|sparc|vax)
+	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
 		eval $set_cc_for_build
 		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
 			| grep -q __ELF__
@@ -204,13 +182,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 		os=netbsd
 		;;
 	esac
-	# Determine ABI tags.
-	case "${UNAME_MACHINE_ARCH}" in
-	    earm*)
-		expr='s/^earmv[0-9]/-eabi/;s/eb$//'
-		abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"`
-		;;
-	esac
 	# The OS release
 	# Debian GNU/NetBSD machines have a different userland, and
 	# thus, need a distinct triplet. However, they do not need
@@ -221,13 +192,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 		release='-gnu'
 		;;
 	    *)
-		release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2`
+		release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
 		;;
 	esac
 	# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
 	# contains redundant information, the shorter form:
 	# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
-	echo "${machine}-${os}${release}${abi}"
+	echo "${machine}-${os}${release}"
 	exit ;;
    *:Bitrig:*:*)
 	UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
@@ -237,10 +208,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
 	echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
 	exit ;;
-    *:LibertyBSD:*:*)
-	UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'`
-	echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE}
-	exit ;;
    *:ekkoBSD:*:*)
 	echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
 	exit ;;
@@ -253,9 +220,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
    *:MirBSD:*:*)
 	echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
 	exit ;;
-    *:Sortix:*:*)
-	echo ${UNAME_MACHINE}-unknown-sortix
-	exit ;;
    alpha:OSF1:*:*)
 	case $UNAME_RELEASE in
 	*4.0)
@@ -272,42 +236,42 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^  The alpha \(.*\) processor.*$/\1/p' | head -n 1`
 	case "$ALPHA_CPU_TYPE" in
 	    "EV4 (21064)")
-		UNAME_MACHINE=alpha ;;
+		UNAME_MACHINE="alpha" ;;
 	    "EV4.5 (21064)")
-		UNAME_MACHINE=alpha ;;
+		UNAME_MACHINE="alpha" ;;
 	    "LCA4 (21066/21068)")
-		UNAME_MACHINE=alpha ;;
+		UNAME_MACHINE="alpha" ;;
 	    "EV5 (21164)")
-		UNAME_MACHINE=alphaev5 ;;
+		UNAME_MACHINE="alphaev5" ;;
 	    "EV5.6 (21164A)")
-		UNAME_MACHINE=alphaev56 ;;
+		UNAME_MACHINE="alphaev56" ;;
 	    "EV5.6 (21164PC)")
-		UNAME_MACHINE=alphapca56 ;;
+		UNAME_MACHINE="alphapca56" ;;
 	    "EV5.7 (21164PC)")
-		UNAME_MACHINE=alphapca57 ;;
+		UNAME_MACHINE="alphapca57" ;;
 	    "EV6 (21264)")
-		UNAME_MACHINE=alphaev6 ;;
+		UNAME_MACHINE="alphaev6" ;;
 	    "EV6.7 (21264A)")
-		UNAME_MACHINE=alphaev67 ;;
+		UNAME_MACHINE="alphaev67" ;;
 	    "EV6.8CB (21264C)")
-		UNAME_MACHINE=alphaev68 ;;
+		UNAME_MACHINE="alphaev68" ;;
 	    "EV6.8AL (21264B)")
-		UNAME_MACHINE=alphaev68 ;;
+		UNAME_MACHINE="alphaev68" ;;
 	    "EV6.8CX (21264D)")
-		UNAME_MACHINE=alphaev68 ;;
+		UNAME_MACHINE="alphaev68" ;;
 	    "EV6.9A (21264/EV69A)")
-		UNAME_MACHINE=alphaev69 ;;
+		UNAME_MACHINE="alphaev69" ;;
 	    "EV7 (21364)")
-		UNAME_MACHINE=alphaev7 ;;
+		UNAME_MACHINE="alphaev7" ;;
 	    "EV7.9 (21364A)")
-		UNAME_MACHINE=alphaev79 ;;
+		UNAME_MACHINE="alphaev79" ;;
 	esac
 	# A Pn.n version is a patched version.
 	# A Vn.n version is a released version.
 	# A Tn.n version is a released field test version.
 	# A Xn.n version is an unreleased experimental baselevel.
 	# 1.2 uses "1.2" for uname -r.
-	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
+	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
 	# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
 	exitcode=$?
 	trap '' 0
@@ -342,7 +306,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
    arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
 	echo arm-acorn-riscix${UNAME_RELEASE}
 	exit ;;
-    arm*:riscos:*:*|arm*:RISCOS:*:*)
+    arm:riscos:*:*|arm:RISCOS:*:*)
 	echo arm-unknown-riscos
 	exit ;;
    SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
@@ -380,16 +344,16 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	exit ;;
    i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
 	eval $set_cc_for_build
-	SUN_ARCH=i386
+	SUN_ARCH="i386"
 	# If there is a compiler, see if it is configured for 64-bit objects.
 	# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
 	# This test works for both compilers.
-	if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
+	if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
 	    if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
-		(CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+		(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
 		grep IS_64BIT_ARCH >/dev/null
 	    then
-		SUN_ARCH=x86_64
+		SUN_ARCH="x86_64"
 	    fi
 	fi
 	echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
@@ -414,7 +378,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	exit ;;
    sun*:*:4.2BSD:*)
 	UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
-	test "x${UNAME_RELEASE}" = x && UNAME_RELEASE=3
+	test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
 	case "`/bin/arch`" in
 	    sun3)
 		echo m68k-sun-sunos${UNAME_RELEASE}
@@ -600,9 +564,8 @@ EOF
 	else
 		IBM_ARCH=powerpc
 	fi
-	if [ -x /usr/bin/lslpp ] ; then
-		IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc |
-			   awk -F: '{ print $3 }' | sed s/[0-9]*$/0/`
+	if [ -x /usr/bin/oslevel ] ; then
+		IBM_REV=`/usr/bin/oslevel`
 	else
 		IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
 	fi
@@ -639,13 +602,13 @@ EOF
 		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
 		    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
 		    case "${sc_cpu_version}" in
-		      523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0
-		      528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1
+		      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+		      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
 		      532)                      # CPU_PA_RISC2_0
 			case "${sc_kernel_bits}" in
-			  32) HP_ARCH=hppa2.0n ;;
-			  64) HP_ARCH=hppa2.0w ;;
-			  '') HP_ARCH=hppa2.0 ;;   # HP-UX 10.20
+			  32) HP_ARCH="hppa2.0n" ;;
+			  64) HP_ARCH="hppa2.0w" ;;
+			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
 			esac ;;
 		    esac
 		fi
@@ -684,11 +647,11 @@ EOF
 		    exit (0);
 		}
 EOF
-		    (CCOPTS="" $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
+		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
 		    test -z "$HP_ARCH" && HP_ARCH=hppa
 		fi ;;
 	esac
-	if [ ${HP_ARCH} = hppa2.0w ]
+	if [ ${HP_ARCH} = "hppa2.0w" ]
 	then
 	    eval $set_cc_for_build

@@ -701,12 +664,12 @@ EOF
 	    # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
 	    # => hppa64-hp-hpux11.23

-	    if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) |
+	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
 		grep -q __LP64__
 	    then
-		HP_ARCH=hppa2.0w
+		HP_ARCH="hppa2.0w"
 	    else
-		HP_ARCH=hppa64
+		HP_ARCH="hppa64"
 	    fi
 	fi
 	echo ${HP_ARCH}-hp-hpux${HPUX_REV}
@@ -811,14 +774,14 @@ EOF
 	echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
 	exit ;;
    F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
-	FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
-	FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
+	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
+	FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
 	FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
 	echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
 	exit ;;
    5000:UNIX_System_V:4.*:*)
-	FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
-	FUJITSU_REL=`echo ${UNAME_RELEASE} | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'`
+	FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+	FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
 	echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
 	exit ;;
    i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
@@ -848,7 +811,7 @@ EOF
    *:MINGW*:*)
 	echo ${UNAME_MACHINE}-pc-mingw32
 	exit ;;
-    *:MSYS*:*)
+    i*:MSYS*:*)
 	echo ${UNAME_MACHINE}-pc-msys
 	exit ;;
    i*:windows32*:*)
@@ -896,21 +859,21 @@ EOF
 	exit ;;
    *:GNU:*:*)
 	# the GNU system
-	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
+	echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
 	exit ;;
    *:GNU/*:*:*)
 	# other systems with GNU libc and userland
-	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
 	exit ;;
    i*86:Minix:*:*)
 	echo ${UNAME_MACHINE}-pc-minix
 	exit ;;
    aarch64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    aarch64_be:Linux:*:*)
 	UNAME_MACHINE=aarch64_be
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    alpha:Linux:*:*)
 	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
@@ -923,60 +886,59 @@ EOF
 	  EV68*) UNAME_MACHINE=alphaev68 ;;
 	esac
 	objdump --private-headers /bin/sh | grep -q ld.so.1
-	if test "$?" = 0 ; then LIBC=gnulibc1 ; fi
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-	exit ;;
-    arc:Linux:*:* | arceb:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
 	exit ;;
    arm*:Linux:*:*)
 	eval $set_cc_for_build
 	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
 	    | grep -q __ARM_EABI__
 	then
-	    echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	    echo ${UNAME_MACHINE}-unknown-linux-gnu
 	else
 	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
 		| grep -q __ARM_PCS_VFP
 	    then
-		echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi
+		echo ${UNAME_MACHINE}-unknown-linux-gnueabi
 	    else
-		echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf
+		echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
 	    fi
 	fi
 	exit ;;
    avr32*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    cris:Linux:*:*)
-	echo ${UNAME_MACHINE}-axis-linux-${LIBC}
+	echo ${UNAME_MACHINE}-axis-linux-gnu
 	exit ;;
    crisv32:Linux:*:*)
-	echo ${UNAME_MACHINE}-axis-linux-${LIBC}
-	exit ;;
-    e2k:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-axis-linux-gnu
 	exit ;;
    frv:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    hexagon:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    i*86:Linux:*:*)
-	echo ${UNAME_MACHINE}-pc-linux-${LIBC}
+	LIBC=gnu
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#ifdef __dietlibc__
+	LIBC=dietlibc
+	#endif
+EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
+	echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
 	exit ;;
    ia64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
-	exit ;;
-    k1om:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    m32r*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    m68*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    mips:Linux:*:* | mips64:Linux:*:*)
 	eval $set_cc_for_build
@@ -995,63 +957,54 @@ EOF
 	#endif
 EOF
 	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
-	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
+	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
 	;;
-    openrisc*:Linux:*:*)
-	echo or1k-unknown-linux-${LIBC}
-	exit ;;
-    or32:Linux:*:* | or1k*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+    or32:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    padre:Linux:*:*)
-	echo sparc-unknown-linux-${LIBC}
+	echo sparc-unknown-linux-gnu
 	exit ;;
    parisc64:Linux:*:* | hppa64:Linux:*:*)
-	echo hppa64-unknown-linux-${LIBC}
+	echo hppa64-unknown-linux-gnu
 	exit ;;
    parisc:Linux:*:* | hppa:Linux:*:*)
 	# Look for CPU level
 	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
-	  PA7*) echo hppa1.1-unknown-linux-${LIBC} ;;
-	  PA8*) echo hppa2.0-unknown-linux-${LIBC} ;;
-	  *)    echo hppa-unknown-linux-${LIBC} ;;
+	  PA7*) echo hppa1.1-unknown-linux-gnu ;;
+	  PA8*) echo hppa2.0-unknown-linux-gnu ;;
+	  *)    echo hppa-unknown-linux-gnu ;;
 	esac
 	exit ;;
    ppc64:Linux:*:*)
-	echo powerpc64-unknown-linux-${LIBC}
+	echo powerpc64-unknown-linux-gnu
 	exit ;;
    ppc:Linux:*:*)
-	echo powerpc-unknown-linux-${LIBC}
-	exit ;;
-    ppc64le:Linux:*:*)
-	echo powerpc64le-unknown-linux-${LIBC}
-	exit ;;
-    ppcle:Linux:*:*)
-	echo powerpcle-unknown-linux-${LIBC}
+	echo powerpc-unknown-linux-gnu
 	exit ;;
    s390:Linux:*:* | s390x:Linux:*:*)
-	echo ${UNAME_MACHINE}-ibm-linux-${LIBC}
+	echo ${UNAME_MACHINE}-ibm-linux
 	exit ;;
    sh64*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    sh*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    sparc:Linux:*:* | sparc64:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    tile*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    vax:Linux:*:*)
-	echo ${UNAME_MACHINE}-dec-linux-${LIBC}
+	echo ${UNAME_MACHINE}-dec-linux-gnu
 	exit ;;
    x86_64:Linux:*:*)
-	echo ${UNAME_MACHINE}-pc-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    xtensa*:Linux:*:*)
-	echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
    i*86:DYNIX/ptx:4*:*)
 	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
@@ -1127,7 +1080,7 @@ EOF
 	# uname -m prints for DJGPP always 'pc', but it prints nothing about
 	# the processor, so we play safe by assuming i586.
 	# Note: whatever this is, it MUST be the same as what config.sub
-	# prints for the "djgpp" host, or else GDB configure will decide that
+	# prints for the "djgpp" host, or else GDB configury will decide that
 	# this is a cross-build.
 	echo i586-pc-msdosdjgpp
 	exit ;;
@@ -1276,9 +1229,6 @@ EOF
    SX-8R:SUPER-UX:*:*)
 	echo sx8r-nec-superux${UNAME_RELEASE}
 	exit ;;
-    SX-ACE:SUPER-UX:*:*)
-	echo sxace-nec-superux${UNAME_RELEASE}
-	exit ;;
    Power*:Rhapsody:*:*)
 	echo powerpc-apple-rhapsody${UNAME_RELEASE}
 	exit ;;
@@ -1287,36 +1237,24 @@ EOF
 	exit ;;
    *:Darwin:*:*)
 	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
+	case $UNAME_PROCESSOR in
+	    i386)
 		eval $set_cc_for_build
-	if test "$UNAME_PROCESSOR" = unknown ; then
-	    UNAME_PROCESSOR=powerpc
-	fi
-	if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then
-	    if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
+		if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
 		  if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
-		    (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
+		      (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
 		      grep IS_64BIT_ARCH >/dev/null
 		  then
-		    case $UNAME_PROCESSOR in
-			i386) UNAME_PROCESSOR=x86_64 ;;
-			powerpc) UNAME_PROCESSOR=powerpc64 ;;
-		    esac
-		fi
-	    fi
-	elif test "$UNAME_PROCESSOR" = i386 ; then
-	    # Avoid executing cc on OS X 10.9, as it ships with a stub
-	    # that puts up a graphical alert prompting to install
-	    # developer tools.  Any system running Mac OS X 10.7 or
-	    # later (Darwin 11 and later) is required to have a 64-bit
-	    # processor. This is not true of the ARM version of Darwin
-	    # that Apple uses in portable devices.
-	    UNAME_PROCESSOR=x86_64
+		      UNAME_PROCESSOR="x86_64"
 		  fi
+		fi ;;
+	    unknown) UNAME_PROCESSOR=powerpc ;;
+	esac
 	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
 	exit ;;
    *:procnto*:*:* | *:QNX:[0123456789]*:*)
 	UNAME_PROCESSOR=`uname -p`
-	if test "$UNAME_PROCESSOR" = x86; then
+	if test "$UNAME_PROCESSOR" = "x86"; then
 		UNAME_PROCESSOR=i386
 		UNAME_MACHINE=pc
 	fi
@@ -1347,7 +1285,7 @@ EOF
 	# "uname -m" is not consistent, so use $cputype instead. 386
 	# is converted to i386 for consistency with other x86
 	# operating systems.
-	if test "$cputype" = 386; then
+	if test "$cputype" = "386"; then
 	    UNAME_MACHINE=i386
 	else
 	    UNAME_MACHINE="$cputype"
@@ -1389,7 +1327,7 @@ EOF
 	echo i386-pc-xenix
 	exit ;;
    i*86:skyos:*:*)
-	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE} | sed -e 's/ .*$//'`
+	echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
 	exit ;;
    i*86:rdos:*:*)
 	echo ${UNAME_MACHINE}-pc-rdos
@@ -1400,11 +1338,156 @@ EOF
    x86_64:VMkernel:*:*)
 	echo ${UNAME_MACHINE}-unknown-esx
 	exit ;;
-    amd64:Isilon\ OneFS:*:*)
-	echo x86_64-unknown-onefs
-	exit ;;
 esac

+eval $set_cc_for_build
+cat >$dummy.c <<EOF
+#ifdef _SEQUENT_
+# include <sys/types.h>
+# include <sys/utsname.h>
+#endif
+main ()
+{
+#if defined (sony)
+#if defined (MIPSEB)
+  /* BFD wants "bsd" instead of "newsos".  Perhaps BFD should be changed,
+     I don't know....  */
+  printf ("mips-sony-bsd\n"); exit (0);
+#else
+#include <sys/param.h>
+  printf ("m68k-sony-newsos%s\n",
+#ifdef NEWSOS4
+	"4"
+#else
+	""
+#endif
+	); exit (0);
+#endif
+#endif
+
+#if defined (__arm) && defined (__acorn) && defined (__unix)
+  printf ("arm-acorn-riscix\n"); exit (0);
+#endif
+
+#if defined (hp300) && !defined (hpux)
+  printf ("m68k-hp-bsd\n"); exit (0);
+#endif
+
+#if defined (NeXT)
+#if !defined (__ARCHITECTURE__)
+#define __ARCHITECTURE__ "m68k"
+#endif
+  int version;
+  version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
+  if (version < 4)
+    printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
+  else
+    printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
+  exit (0);
+#endif
+
+#if defined (MULTIMAX) || defined (n16)
+#if defined (UMAXV)
+  printf ("ns32k-encore-sysv\n"); exit (0);
+#else
+#if defined (CMU)
+  printf ("ns32k-encore-mach\n"); exit (0);
+#else
+  printf ("ns32k-encore-bsd\n"); exit (0);
+#endif
+#endif
+#endif
+
+#if defined (__386BSD__)
+  printf ("i386-pc-bsd\n"); exit (0);
+#endif
+
+#if defined (sequent)
+#if defined (i386)
+  printf ("i386-sequent-dynix\n"); exit (0);
+#endif
+#if defined (ns32000)
+  printf ("ns32k-sequent-dynix\n"); exit (0);
+#endif
+#endif
+
+#if defined (_SEQUENT_)
+    struct utsname un;
+
+    uname(&un);
+
+    if (strncmp(un.version, "V2", 2) == 0) {
+	printf ("i386-sequent-ptx2\n"); exit (0);
+    }
+    if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
+	printf ("i386-sequent-ptx1\n"); exit (0);
+    }
+    printf ("i386-sequent-ptx\n"); exit (0);
+
+#endif
+
+#if defined (vax)
+# if !defined (ultrix)
+#  include <sys/param.h>
+#  if defined (BSD)
+#   if BSD == 43
+      printf ("vax-dec-bsd4.3\n"); exit (0);
+#   else
+#    if BSD == 199006
+      printf ("vax-dec-bsd4.3reno\n"); exit (0);
+#    else
+      printf ("vax-dec-bsd\n"); exit (0);
+#    endif
+#   endif
+#  else
+    printf ("vax-dec-bsd\n"); exit (0);
+#  endif
+# else
+    printf ("vax-dec-ultrix\n"); exit (0);
+# endif
+#endif
+
+#if defined (alliant) && defined (i860)
+  printf ("i860-alliant-bsd\n"); exit (0);
+#endif
+
+  exit (1);
+}
+EOF
+
+$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
+	{ echo "$SYSTEM_NAME"; exit; }
+
+# Apollos put the system type in the environment.
+
+test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
+
+# Convex versions that predate uname can use getsysinfo(1)
+
+if [ -x /usr/convex/getsysinfo ]
+then
+    case `getsysinfo -f cpu_type` in
+    c1*)
+	echo c1-convex-bsd
+	exit ;;
+    c2*)
+	if getsysinfo -f scalar_acc
+	then echo c32-convex-bsd
+	else echo c2-convex-bsd
+	fi
+	exit ;;
+    c34*)
+	echo c34-convex-bsd
+	exit ;;
+    c38*)
+	echo c38-convex-bsd
+	exit ;;
+    c4*)
+	echo c4-convex-bsd
+	exit ;;
+    esac
+fi
+
 cat >&2 <<EOF
 $0: unable to guess system type

@ -1412,9 +1495,9 @@ This script, last modified $timestamp, has failed to recognize
 the operating system you are using. It is advised that you
 download the most up to date version of the config scripts from

-  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
 and
-  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+  http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD

 If the version you run ($0) is already up to date, please
 send the following data and any information you think might be
--- a/lib/ffts/config.sub
+++ b/lib/ffts/config.sub
@ -1,18 +1,24 @@
 #! /bin/sh
 # Configuration validation subroutine script.
-#   Copyright 1992-2016 Free Software Foundation, Inc.
+#   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+#   2011, 2012 Free Software Foundation, Inc.

-timestamp='2016-03-30'
+timestamp='2012-08-18'

-# This file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by
-# the Free Software Foundation; either version 3 of the License, or
+# This file is (in principle) common to ALL GNU software.
+# The presence of a machine in this file suggests that SOME GNU software
+# can handle that machine.  It does not imply ALL GNU software can.
+#
+# This file is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
 # (at your option) any later version.
 #
-# This program is distributed in the hope that it will be useful, but
-# WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-# General Public License for more details.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with this program; if not, see <http://www.gnu.org/licenses/>.
@ -20,12 +26,11 @@ timestamp='2016-03-30'
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
 # configuration script generated by Autoconf, you may include it under
-# the same distribution terms that you use for the rest of that
-# program.  This Exception is an additional permission under section 7
-# of the GNU General Public License, version 3 ("GPLv3").
+# the same distribution terms that you use for the rest of that program.


-# Please send patches to <config-patches@gnu.org>.
+# Please send patches to <config-patches@gnu.org>.  Submit a context
+# diff and a properly formatted GNU ChangeLog entry.
 #
 # Configuration subroutine to validate and canonicalize a configuration type.
 # Supply the specified configuration type as an argument.
@ -33,7 +38,7 @@ timestamp='2016-03-30'
 # Otherwise, we print the canonical config type on stdout and succeed.

 # You can get the latest version of this script from:
-# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD

 # This file is supposed to be the same for all GNU packages
 # and recognize all the CPU types, system types and aliases
@ -53,7 +58,8 @@ timestamp='2016-03-30'
 me=`echo "$0" | sed -e 's,.*/,,'`

 usage="\
-Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
+Usage: $0 [OPTION] CPU-MFR-OPSYS
+       $0 [OPTION] ALIAS

 Canonicalize a configuration name.

@ -67,7 +73,9 @@ Report bugs and patches to <config-patches@gnu.org>."
 version="\
 GNU config.sub ($timestamp)

-Copyright 1992-2016 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.

 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@ -116,7 +124,7 @@ maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
 case $maybe_os in
  nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
  linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
-  knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \
+  knetbsd*-gnu* | netbsd*-gnu* | \
  kopensolaris*-gnu* | \
  storm-chaos* | os2-emx* | rtmk-nova*)
    os=-$maybe_os
@ -148,7 +156,7 @@ case $os in
 	-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
 	-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
 	-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
-	-apple | -axis | -knuth | -cray | -microblaze*)
+	-apple | -axis | -knuth | -cray | -microblaze)
 		os=
 		basic_machine=$1
 		;;
@ -251,25 +259,21 @@ case $basic_machine in
 	| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
 	| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
 	| am33_2.0 \
-	| arc | arceb \
-	| arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
-	| avr | avr32 \
-	| ba \
+	| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
        | be32 | be64 \
 	| bfin \
-	| c4x | c8051 | clipper \
+	| c4x | clipper \
 	| d10v | d30v | dlx | dsp16xx \
-	| e2k | epiphany \
-	| fido | fr30 | frv | ft32 \
+	| epiphany \
+	| fido | fr30 | frv \
 	| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
 	| hexagon \
 	| i370 | i860 | i960 | ia64 \
 	| ip2k | iq2000 \
-	| k1om \
 	| le32 | le64 \
 	| lm32 \
 	| m32c | m32r | m32rle | m68000 | m68k | m88k \
-	| maxq | mb | microblaze | microblazeel | mcore | mep | metag \
+	| maxq | mb | microblaze | mcore | mep | metag \
 	| mips | mipsbe | mipseb | mipsel | mipsle \
 	| mips16 \
 	| mips64 | mips64el \
@ -283,29 +287,26 @@ case $basic_machine in
 	| mips64vr5900 | mips64vr5900el \
 	| mipsisa32 | mipsisa32el \
 	| mipsisa32r2 | mipsisa32r2el \
-	| mipsisa32r6 | mipsisa32r6el \
 	| mipsisa64 | mipsisa64el \
 	| mipsisa64r2 | mipsisa64r2el \
-	| mipsisa64r6 | mipsisa64r6el \
 	| mipsisa64sb1 | mipsisa64sb1el \
 	| mipsisa64sr71k | mipsisa64sr71kel \
-	| mipsr5900 | mipsr5900el \
 	| mipstx39 | mipstx39el \
 	| mn10200 | mn10300 \
 	| moxie \
 	| mt \
 	| msp430 \
 	| nds32 | nds32le | nds32be \
-	| nios | nios2 | nios2eb | nios2el \
+	| nios | nios2 \
 	| ns16k | ns32k \
-	| open8 | or1k | or1knd | or32 \
+	| open8 \
+	| or32 \
 	| pdp10 | pdp11 | pj | pjl \
 	| powerpc | powerpc64 | powerpc64le | powerpcle \
 	| pyramid \
-	| riscv32 | riscv64 \
 	| rl78 | rx \
 	| score \
-	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
+	| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
 	| sh64 | sh64le \
 	| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
 	| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
@ -313,7 +314,6 @@ case $basic_machine in
 	| tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
 	| ubicom32 \
 	| v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
-	| visium \
 	| we32k \
 	| x86 | xc16x | xstormy16 | xtensa \
 	| z8k | z80)
@ -328,10 +328,7 @@ case $basic_machine in
 	c6x)
 		basic_machine=tic6x-unknown
 		;;
-	leon|leon[3-9])
-		basic_machine=sparc-$basic_machine
-		;;
-	m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip)
+	m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip)
 		basic_machine=$basic_machine-unknown
 		os=-none
 		;;
@ -373,29 +370,26 @@ case $basic_machine in
 	| aarch64-* | aarch64_be-* \
 	| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
 	| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
-	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \
+	| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
 	| arm-*  | armbe-* | armle-* | armeb-* | armv*-* \
 	| avr-* | avr32-* \
-	| ba-* \
 	| be32-* | be64-* \
 	| bfin-* | bs2000-* \
 	| c[123]* | c30-* | [cjt]90-* | c4x-* \
-	| c8051-* | clipper-* | craynv-* | cydra-* \
+	| clipper-* | craynv-* | cydra-* \
 	| d10v-* | d30v-* | dlx-* \
-	| e2k-* | elxsi-* \
+	| elxsi-* \
 	| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
 	| h8300-* | h8500-* \
 	| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
 	| hexagon-* \
 	| i*86-* | i860-* | i960-* | ia64-* \
 	| ip2k-* | iq2000-* \
-	| k1om-* \
 	| le32-* | le64-* \
 	| lm32-* \
 	| m32c-* | m32r-* | m32rle-* \
 	| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
-	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* \
-	| microblaze-* | microblazeel-* \
+	| m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
 	| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
 	| mips16-* \
 	| mips64-* | mips64el-* \
@ -409,33 +403,28 @@ case $basic_machine in
 	| mips64vr5900-* | mips64vr5900el-* \
 	| mipsisa32-* | mipsisa32el-* \
 	| mipsisa32r2-* | mipsisa32r2el-* \
-	| mipsisa32r6-* | mipsisa32r6el-* \
 	| mipsisa64-* | mipsisa64el-* \
 	| mipsisa64r2-* | mipsisa64r2el-* \
-	| mipsisa64r6-* | mipsisa64r6el-* \
 	| mipsisa64sb1-* | mipsisa64sb1el-* \
 	| mipsisa64sr71k-* | mipsisa64sr71kel-* \
-	| mipsr5900-* | mipsr5900el-* \
 	| mipstx39-* | mipstx39el-* \
 	| mmix-* \
 	| mt-* \
 	| msp430-* \
 	| nds32-* | nds32le-* | nds32be-* \
-	| nios-* | nios2-* | nios2eb-* | nios2el-* \
+	| nios-* | nios2-* \
 	| none-* | np1-* | ns16k-* | ns32k-* \
 	| open8-* \
-	| or1k*-* \
 	| orion-* \
 	| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
 	| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
 	| pyramid-* \
-	| riscv32-* | riscv64-* \
 	| rl78-* | romp-* | rs6000-* | rx-* \
 	| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
 	| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
 	| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
 	| sparclite-* \
-	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \
+	| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
 	| tahoe-* \
 	| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
 	| tile*-* \
@ -443,7 +432,6 @@ case $basic_machine in
 	| ubicom32-* \
 	| v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
 	| vax-* \
-	| visium-* \
 	| we32k-* \
 	| x86-* | x86_64-* | xc16x-* | xps100-* \
 	| xstormy16-* | xtensa*-* \
@ -520,9 +508,6 @@ case $basic_machine in
 		basic_machine=i386-pc
 		os=-aros
 		;;
-	asmjs)
-		basic_machine=asmjs-unknown
-		;;
 	aux)
 		basic_machine=m68k-apple
 		os=-aux
@ -784,9 +769,6 @@ case $basic_machine in
 		basic_machine=m68k-isi
 		os=-sysv
 		;;
-	leon-*|leon[3-9]-*)
-		basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'`
-		;;
 	m68knommu)
 		basic_machine=m68k-unknown
 		os=-linux
@ -806,7 +788,7 @@ case $basic_machine in
 		basic_machine=ns32k-utek
 		os=-sysv
 		;;
-	microblaze*)
+	microblaze)
 		basic_machine=microblaze-xilinx
 		;;
 	mingw64)
@ -814,7 +796,7 @@ case $basic_machine in
 		os=-mingw64
 		;;
 	mingw32)
-		basic_machine=i686-pc
+		basic_machine=i386-pc
 		os=-mingw32
 		;;
 	mingw32ce)
@ -842,10 +824,6 @@ case $basic_machine in
 		basic_machine=powerpc-unknown
 		os=-morphos
 		;;
-	moxiebox)
-		basic_machine=moxie-unknown
-		os=-moxiebox
-		;;
 	msdos)
 		basic_machine=i386-pc
 		os=-msdos
@ -854,7 +832,7 @@ case $basic_machine in
 		basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
 		;;
 	msys)
-		basic_machine=i686-pc
+		basic_machine=i386-pc
 		os=-msys
 		;;
 	mvs)
@ -1045,11 +1023,7 @@ case $basic_machine in
 		basic_machine=i586-unknown
 		os=-pw32
 		;;
-	rdos | rdos64)
-		basic_machine=x86_64-pc
-		os=-rdos
-		;;
-	rdos32)
+	rdos)
 		basic_machine=i386-pc
 		os=-rdos
 		;;
@ -1376,13 +1350,13 @@ case $os in
 	-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
 	      | -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
 	      | -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
-	      | -sym* | -kopensolaris* | -plan9* \
+	      | -sym* | -kopensolaris* \
 	      | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
-	      | -aos* | -aros* | -cloudabi* | -sortix* \
+	      | -aos* | -aros* \
 	      | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
 	      | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
 	      | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
-	      | -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \
+	      | -bitrig* | -openbsd* | -solidbsd* \
 	      | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
 	      | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
 	      | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
@ -1391,15 +1365,14 @@ case $os in
 	      | -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
 	      | -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
 	      | -linux-newlib* | -linux-musl* | -linux-uclibc* \
-	      | -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \
+	      | -uxpv* | -beos* | -mpeix* | -udk* \
 	      | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
 	      | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
 	      | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
 	      | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
 	      | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
 	      | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
-	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \
-	      | -onefs* | -tirtos*)
+	      | -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
 	# Remember, each alternative MUST END IN *, to match a version number.
 		;;
 	-qnx*)
@ -1523,6 +1496,9 @@ case $os in
 	-aros*)
 		os=-aros
 		;;
+	-kaos*)
+		os=-kaos
+		;;
 	-zvmoe)
 		os=-zvmoe
 		;;
@ -1531,8 +1507,6 @@ case $os in
 		;;
 	-nacl*)
 		;;
-	-ios)
-		;;
 	-none)
 		;;
 	*)
@ -1573,9 +1547,6 @@ case $basic_machine in
 	c4x-* | tic4x-*)
 		os=-coff
 		;;
-	c8051-*)
-		os=-elf
-		;;
 	hexagon-*)
 		os=-elf
 		;;
--- a/lib/ffts/ffts.pc.cmake.in
+++ b/lib/ffts/ffts.pc.cmake.in
@ -1,7 +1,7 @@
 prefix=@CMAKE_INSTALL_PREFIX@
-exec_prefix=${exec_prefix}
-libdir=${libdir}
-includedir=${includedir}
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include

 Name: @CMAKE_PROJECT_NAME@
 Description: fast Fourier transform library
--- a/lib/ffts/include/ffts.h
+++ b/lib/ffts/include/ffts.h
@ -3,6 +3,7 @@
 This file is part of FFTS.

 Copyright (c) 2012, Anthony M. Blake
+ Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
@ -75,6 +76,9 @@ typedef struct _ffts_plan_t ffts_plan_t;
 FFTS_API ffts_plan_t*
 ffts_init_1d(size_t N, int sign);

+FFTS_API ffts_plan_t*
+ffts_init_1d_64f(size_t N, int sign);
+
 FFTS_API ffts_plan_t*
 ffts_init_2d(size_t N1, size_t N2, int sign);

--- a/lib/ffts/src/Makefile.am
+++ b/lib/ffts/src/Makefile.am
@ -2,7 +2,7 @@

 lib_LTLIBRARIES = libffts.la

-libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c ffts_transpose.c ffts_trig.c ffts_static.c
+libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c ffts_transpose.c ffts_trig.c ffts_static.c ffts_chirp_z.c
+# Header that accompanies ffts_chirp_z.c and is included by ffts.c. Listing it
+# in _SOURCES makes "make dist" package it together with the source file.
+libffts_la_SOURCES += ffts_chirp_z.h
 libffts_la_SOURCES += codegen.h codegen_arm.h codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h macros-neon.h macros-sse.h macros.h neon.h neon_float.h patterns.h types.h vfp.h

 if DYNAMIC_DISABLED
@ -14,7 +14,7 @@ endif
 libffts_includedir=$(includedir)/ffts
 libffts_include_HEADERS = ../include/ffts.h

-AM_CFLAGS = -I$(top_srcdir)/include
+AM_CFLAGS = -I$(top_srcdir)/include -DAUTOTOOLS_BUILD=yes

 if HAVE_VFP
 libffts_la_SOURCES += vfp.s 
--- a/lib/ffts/src/codegen.c
+++ b/lib/ffts/src/codegen.c
@ -139,9 +139,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N

 #ifdef HAVE_SSE
    if (sign < 0) {
-        p->constants = sse_constants;
+        p->constants = (const void*) sse_constants;
    } else {
-        p->constants = sse_constants_inv;
+        p->constants = (const void*) sse_constants_inv;
    }
 #endif

--- a/lib/ffts/src/codegen_sse.h
+++ b/lib/ffts/src/codegen_sse.h
@ -488,7 +488,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_sse_movaps_reg_memindex(ins, X64_XMM7,  X64_RDX, offsets[0], X64_RAX, 2);
    x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[2], X64_RAX, 2);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[3], X64_RAX, 2);
@ -507,14 +507,14 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
    x64_sse_movaps_reg_memindex(ins, X64_XMM8,  X64_RDX, offsets[6], X64_RAX, 2);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[7], X64_RAX, 2);
    x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM8);
    x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
    extend--;

    x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
@ -530,7 +530,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
    x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
+    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
@ -538,10 +538,10 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
    x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM8);

-    x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+    x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
    extend--;

-    x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
+    x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
@ -551,7 +551,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)

    x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
@ -580,7 +580,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
    x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
@ -588,7 +588,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
    x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
@ -620,7 +620,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_sse_movaps_reg_memindex(ins, X64_XMM7,  X64_RSI, offsets[0], X64_RAX, 2);
    x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RSI, offsets[2], X64_RAX, 2);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[3], X64_RAX, 2);
@ -640,14 +640,14 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
    x64_sse_movaps_reg_memindex(ins, X64_XMM3,  X64_RSI, offsets[6], X64_RAX, 2);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[7], X64_RAX, 2);
    x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM3);
    x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
    extend--;

    x64_movsxd_reg_memindex(ins, X64_R11, X64_R8, 0, X64_RAX, 2);
@ -663,7 +663,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
    x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
@ -671,10 +671,10 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
    x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM3);

-    x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+	x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
    extend--;

-    x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
+	x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
@ -684,7 +684,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)

    x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM3, 0xB1);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
@ -713,7 +713,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
    x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
@ -721,7 +721,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
    x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
    x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
@ -1157,28 +1157,28 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten

    x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[0], X64_RAX, 2);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2);
    x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[2], X64_RAX, 2);

-    x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+	x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
    extend--;

-    x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+	x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2);
    x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
    x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RDX, offsets[4], X64_RAX, 2);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RDX, offsets[5], X64_RAX, 2);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[6], X64_RAX, 2);
@ -1206,7 +1206,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
    x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
    x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0 ? 8 : 0);
    extend--;

    x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
@ -1218,7 +1218,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
    x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
    x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);

-    x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
+	x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
@ -1257,28 +1257,28 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten

    x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RSI, offsets[0], X64_RAX, 2);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[1], X64_RAX, 2);
    x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[2], X64_RAX, 2);

-    x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
+	x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
    extend--;

-    x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
+	x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RSI, offsets[3], X64_RAX, 2);
    x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
    x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RSI, offsets[4], X64_RAX, 2);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RSI, offsets[5], X64_RAX, 2);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[6], X64_RAX, 2);
@ -1306,7 +1306,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
    x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
    x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);

-    x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
+	x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0 ? 8 : 0);
    extend--;

    x64_movsxd_reg_memindex(ins, X64_R12, X64_R8, 8, X64_RAX, 2);
@ -1318,7 +1318,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
    x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
    x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);

-    x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
+	x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0 ? 8 : 0);
    extend--;

    x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
--- a/lib/ffts/src/ffts.c
+++ b/lib/ffts/src/ffts.c
@ -34,6 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "ffts.h"

 #include "ffts_internal.h"
+#include "ffts_chirp_z.h"
 #include "ffts_static.h"
 #include "ffts_trig.h"
 #include "macros.h"
@ -76,7 +77,8 @@ static const FFTS_ALIGN(64) float w_data[16] = {
 };
 #endif

-static FFTS_INLINE int ffts_allow_execute(void *start, size_t len)
+static FFTS_INLINE int
+ffts_allow_execute(void *start, size_t len)
 {
    int result;

@ -90,7 +92,8 @@ static FFTS_INLINE int ffts_allow_execute(void *start, size_t len)
    return result;
 }

-static FFTS_INLINE int ffts_deny_execute(void *start, size_t len)
+static FFTS_INLINE int
+ffts_deny_execute(void *start, size_t len)
 {
    int result;

@ -104,7 +107,8 @@ static FFTS_INLINE int ffts_deny_execute(void *start, size_t len)
    return result;
 }

-static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length)
+static FFTS_INLINE int
+ffts_flush_instruction_cache(void *start, size_t length)
 {
 #ifdef _WIN32
    return !FlushInstructionCache(GetCurrentProcess(), start, length);
@ -124,7 +128,8 @@ static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length)
 #endif
 }

-static FFTS_INLINE void *ffts_vmem_alloc(size_t length)
+static FFTS_INLINE void*
+ffts_vmem_alloc(size_t length)
 {
 #if __APPLE__
    return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
@ -139,7 +144,8 @@ static FFTS_INLINE void *ffts_vmem_alloc(size_t length)
 #endif
 }

-static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length)
+static FFTS_INLINE void
+ffts_vmem_free(void *addr, size_t length)
 {
 #ifdef _WIN32
    (void) length;
@ -174,7 +180,8 @@ ffts_free(ffts_plan_t *p)
    }
 }

-void ffts_free_1d(ffts_plan_t *p)
+static void
+ffts_free_1d(ffts_plan_t *p)
 {
 #if !defined(DYNAMIC_DISABLED)
    if (p->transform_base) {
@ -188,7 +195,7 @@ void ffts_free_1d(ffts_plan_t *p)
    }

    if (p->ws) {
-        FFTS_FREE(p->ws);
+        ffts_aligned_free(p->ws);
    }

    if (p->is) {
@ -233,7 +240,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
        lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f);
 #endif

-        p->ws = FFTS_MALLOC(lut_size, 32);
+        p->ws = ffts_aligned_malloc(lut_size);
        if (!p->ws) {
            goto cleanup;
        }
@ -253,7 +260,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)

    /* calculate factors */
    m = leaf_N << (n_luts - 2);
-    tmp = FFTS_MALLOC(m * sizeof(ffts_cpx_32f), 32);
+    tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_32f));
+    if (!tmp) {
+        /* Allocation failed: leave through the same error path as the p->ws
+         * allocation instead of handing a null pointer to
+         * ffts_generate_cosine_sine_pow2_32f(). */
+        goto cleanup;
+    }

    ffts_generate_cosine_sine_pow2_32f(tmp, m);

@ -263,7 +270,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
        p->ws_is[i] = w - (ffts_cpx_32f*) p->ws;

        if (!i) {
-            ffts_cpx_32f *w0 = FFTS_MALLOC(n/4 * sizeof(ffts_cpx_32f), 32);
+            ffts_cpx_32f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_32f));
            float *fw0 = (float*) w0;
            float *fw = (float*) w;

@ -300,11 +307,11 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
            w += n/4 * 2;
 #endif

-            FFTS_FREE(w0);
+            ffts_aligned_free(w0);
        } else {
-            ffts_cpx_32f *w0 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
-            ffts_cpx_32f *w1 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
-            ffts_cpx_32f *w2 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
+            ffts_cpx_32f *w0 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
+            ffts_cpx_32f *w1 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
+            ffts_cpx_32f *w2 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));

            float *fw0 = (float*) w0;
            float *fw1 = (float*) w1;
@ -380,9 +387,9 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
            w += n/8 * 3 * 2;
 #endif

-            FFTS_FREE(w0);
-            FFTS_FREE(w1);
-            FFTS_FREE(w2);
+            ffts_aligned_free(w0);
+            ffts_aligned_free(w1);
+            ffts_aligned_free(w2);
        }

        n *= 2;
@ -401,7 +408,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
    }
 #endif

-    FFTS_FREE(tmp);
+    ffts_aligned_free(tmp);

    p->lastlut = w;
    p->n_luts = n_luts;
@ -411,18 +418,166 @@ cleanup:
    return -1;
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Build the double precision twiddle factor lookup tables of a plan.
+ *
+ * p       plan receiving the tables (p->ws), the per-table offsets
+ *         (p->ws_is), the end-of-tables pointer (p->lastlut) and the
+ *         table count (p->n_luts)
+ * N       transform size
+ * leaf_N  leaf transform size
+ * sign    a negative value selects the (-0.0, 0.0) sign mask, any other
+ *         value the (0.0, -0.0) mask applied to the imaginary parts
+ *
+ * Returns 0 on success and -1 on failure. On failure p->ws and p->ws_is may
+ * already be attached to the plan; p->ws is released by ffts_free_1d() when
+ * the caller destroys the plan (presumably p->ws_is as well -- verify).
+ */
+static int
+ffts_generate_luts_64f(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
+{
+    V4DF MULI_SIGN;
+    size_t n_luts;
+    size_t lut_size;
+    ffts_cpx_64f *w;
+    ffts_cpx_64f *tmp = NULL;
+    size_t i, j, m, n;
+    int stride;
+
+    if (sign < 0) {
+        MULI_SIGN = V4DF_LIT4(-0.0, 0.0, -0.0, 0.0);
+    } else {
+        MULI_SIGN = V4DF_LIT4(0.0, -0.0, 0.0, -0.0);
+    }
+
+    /* LUTS */
+    n_luts = ffts_ctzl(N / leaf_N);
+    if (n_luts >= 32) {
+        n_luts = 0;
+    }
+
+    /* the shifts by (n_luts - 2) and (n_luts - 1) below are undefined
+       for fewer than two tables, so reject such sizes up front */
+    if (n_luts < 2) {
+        goto cleanup;
+    }
+
+    /* shift in size_t so that large table counts do not overflow an int */
+    lut_size = leaf_N * ((((size_t) 1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_64f);
+
+    p->ws = ffts_aligned_malloc(lut_size);
+    if (!p->ws) {
+        goto cleanup;
+    }
+
+    p->ws_is = (size_t*) malloc(n_luts * sizeof(*p->ws_is));
+    if (!p->ws_is) {
+        goto cleanup;
+    }
+
+    w = p->ws;
+    n = leaf_N * 2;
+
+    /* calculate factors */
+    m = leaf_N << (n_luts - 2);
+    tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_64f));
+    if (!tmp) {
+        goto cleanup;
+    }
+
+    ffts_generate_cosine_sine_pow2_64f(tmp, m);
+
+    /* generate lookup tables */
+    stride = 1 << (n_luts - 1);
+    for (i = 0; i < n_luts; i++) {
+        /* offset of this table from the start of p->ws, in complex elements */
+        p->ws_is[i] = w - (ffts_cpx_64f*) p->ws;
+
+        if (!i) {
+            ffts_cpx_64f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_64f));
+            double *fw0 = (double*) w0;
+            double *fw = (double*) w;
+
+            if (!w0) {
+                goto cleanup;
+            }
+
+            /* gather every stride-th factor */
+            for (j = 0; j < n/4; j++) {
+                w0[j][0] = tmp[j * stride][0];
+                w0[j][1] = tmp[j * stride][1];
+            }
+
+            /* store duplicated real parts followed by sign-adjusted
+               duplicated imaginary parts, two factors per iteration */
+            for (j = 0; j < n/4; j += 2) {
+                V4DF re, im, temp0;
+                temp0 = V4DF_LD(fw0 + j*2);
+                re = V4DF_DUPLICATE_RE(temp0);
+                im = V4DF_DUPLICATE_IM(temp0);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*4 + 0, re);
+                V4DF_ST(fw + j*4 + 4, im);
+            }
+
+            w += n/4 * 2;
+            ffts_aligned_free(w0);
+        } else {
+            ffts_cpx_64f *w0 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
+            ffts_cpx_64f *w1 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
+            ffts_cpx_64f *w2 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
+
+            double *fw0 = (double*) w0;
+            double *fw1 = (double*) w1;
+            double *fw2 = (double*) w2;
+
+            double *fw = (double*)w;
+
+            /* release whatever was obtained if any scratch buffer is missing */
+            if (!w0 || !w1 || !w2) {
+                if (w0) {
+                    ffts_aligned_free(w0);
+                }
+                if (w1) {
+                    ffts_aligned_free(w1);
+                }
+                if (w2) {
+                    ffts_aligned_free(w2);
+                }
+                goto cleanup;
+            }
+
+            /* gather the three factor sequences used by this table */
+            for (j = 0; j < n/8; j++) {
+                w0[j][0] = tmp[2 * j * stride][0];
+                w0[j][1] = tmp[2 * j * stride][1];
+
+                w1[j][0] = tmp[j * stride][0];
+                w1[j][1] = tmp[j * stride][1];
+
+                w2[j][0] = tmp[(j + (n/8)) * stride][0];
+                w2[j][1] = tmp[(j + (n/8)) * stride][1];
+            }
+
+            /* interleave the three sequences, each as a re block followed
+               by a sign-adjusted im block */
+            for (j = 0; j < n/8; j += 2) {
+                V4DF temp0, temp1, temp2, re, im;
+
+                temp0 = V4DF_LD(fw0 + j*2);
+                re = V4DF_DUPLICATE_RE(temp0);
+                im = V4DF_DUPLICATE_IM(temp0);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*2*6+0, re);
+                V4DF_ST(fw + j*2*6+4, im);
+
+                temp1 = V4DF_LD(fw1 + j*2);
+                re = V4DF_DUPLICATE_RE(temp1);
+                im = V4DF_DUPLICATE_IM(temp1);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*2*6+8 , re);
+                V4DF_ST(fw + j*2*6+12, im);
+
+                temp2 = V4DF_LD(fw2 + j*2);
+                re = V4DF_DUPLICATE_RE(temp2);
+                im = V4DF_DUPLICATE_IM(temp2);
+                im = V4DF_XOR(im, MULI_SIGN);
+                V4DF_ST(fw + j*2*6+16, re);
+                V4DF_ST(fw + j*2*6+20, im);
+            }
+
+            w += n/8 * 3 * 2;
+            ffts_aligned_free(w0);
+            ffts_aligned_free(w1);
+            ffts_aligned_free(w2);
+        }
+
+        n *= 2;
+        stride >>= 1;
+    }
+
+    ffts_aligned_free(tmp);
+
+    p->lastlut = w;
+    p->n_luts = n_luts;
+    return 0;
+
+cleanup:
+    /* only the local scratch buffer is released here; buffers already
+       attached to the plan are left for the plan destructor */
+    if (tmp) {
+        ffts_aligned_free(tmp);
+    }
+    return -1;
+}
+#endif
+
 FFTS_API ffts_plan_t*
 ffts_init_1d(size_t N, int sign)
 {
    const size_t leaf_N = 8;
    ffts_plan_t *p;

-    if (N < 2 || (N & (N - 1)) != 0) {
-        LOG("FFT size must be a power of two\n");
+    if (N < 2) {
+        LOG("FFT size must be greater than 1");
        return NULL;
    }

-    p = calloc(1, sizeof(*p));
+    /* check if size is not a power of two */
+    if (N & (N - 1)) {
+        return ffts_chirp_z_init(N, sign);
+    }
+
+    p = (ffts_plan_t*) calloc(1, sizeof(*p));
    if (!p) {
        return NULL;
    }
@ -537,3 +692,98 @@ cleanup:
    ffts_free_1d(p);
    return NULL;
 }
+
+#ifdef FFTS_DOUBLE
+/*
+ * Create a double precision 1D complex FFT plan.
+ *
+ * N     transform size; must be a power of two and at least 2
+ * sign  negative selects the forward transform, otherwise the backward
+ *       transform (sizes 4 and 8 only recognise -1 and 1)
+ *
+ * Returns the new plan, or NULL when N is not supported or an allocation
+ * fails.
+ */
+FFTS_API ffts_plan_t*
+ffts_init_1d_64f(size_t N, int sign)
+{
+    const size_t leaf_N = 8;
+    ffts_plan_t *p;
+
+    if (N < 2) {
+        LOG("FFT size must be greater than 1");
+        return NULL;
+    }
+
+    /* there is no double precision chirp-z fallback, and both the small
+       kernels and the lookup table generator assume a power of two size */
+    if (N & (N - 1)) {
+        LOG("FFT size must be a power of two");
+        return NULL;
+    }
+
+    p = (ffts_plan_t*) calloc(1, sizeof(*p));
+    if (!p) {
+        return NULL;
+    }
+
+    p->destroy = ffts_free_1d;
+    p->N = N;
+
+    if (N >= 32) {
+        /* generate lookup tables */
+        if (ffts_generate_luts_64f(p, N, leaf_N, sign)) {
+            goto cleanup;
+        }
+
+        p->offsets = ffts_init_offsets(N, leaf_N);
+        if (!p->offsets) {
+            goto cleanup;
+        }
+
+        p->is = ffts_init_is(N, leaf_N, 1);
+        if (!p->is) {
+            goto cleanup;
+        }
+
+        p->i0 = N/leaf_N/3 + 1;
+        p->i1 = p->i2 = N/leaf_N/3;
+        if ((N/leaf_N) % 3 > 1) {
+            p->i1++;
+        }
+
+        p->i0 /= 2;
+        p->i1 /= 2;
+
+        if (sign < 0) {
+            p->transform = ffts_static_transform_f_64f;
+        } else {
+            p->transform = ffts_static_transform_i_64f;
+        }
+    } else {
+        /* small sizes use dedicated kernels */
+        switch (N) {
+        case 2:
+            p->transform = &ffts_small_2_64f;
+            break;
+        case 4:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward4_64f;
+            } else if (sign == 1) {
+                p->transform = &ffts_small_backward4_64f;
+            }
+            break;
+        case 8:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward8_64f;
+            } else if (sign == 1) {
+                p->transform = &ffts_small_backward8_64f;
+            }
+            break;
+        case 16:
+        default:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward16_64f;
+            } else {
+                p->transform = &ffts_small_backward16_64f;
+            }
+            break;
+        }
+    }
+
+    return p;
+
+cleanup:
+    ffts_free_1d(p);
+    return NULL;
+}
+#else
+FFTS_API ffts_plan_t*
+ffts_init_1d_64f(size_t N, int sign)
+{
+    /* disabled */
+    return NULL;
+}
+#endif
--- a/lib/ffts/src/ffts_chirp_z.c
+++ b/lib/ffts/src/ffts_chirp_z.c
@ -0,0 +1,225 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_chirp_z.h"
+
+#include "ffts_internal.h"
+#include "ffts_trig.h"
+
+/*
+*  For more information on algorithms:
+*
+*  L. I. Bluestein, A linear filtering approach to the computation of
+*  the discrete Fourier transform, 1968 NEREM Rec., pp. 218-219
+*
+*  Lawrence R. Rabiner, Ronald W. Schafer, Charles M. Rader,
+*  The Chirp z-Transform Algorithm and Its Application
+*  Bell Sys. Tech. J., vol. 48, pp. 1249-1292, May 1969.
+*
+*  Rick Lyons, Four Ways to Compute an Inverse FFT Using the Forward FFT Algorithm
+*  https://www.dsprelated.com/showarticle/800.php, July 7, 2015
+*/
+
+/* forward declarations */
+static void
+ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out);
+
+static void
+ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out);
+
+/*
+ * Destroy a chirp-z plan: release the A, B and scratch buffers, the nested
+ * power-of-two plan, and finally the plan structure itself.
+ */
+static void
+ffts_chirp_z_free(ffts_plan_t *p)
+{
+    if (p->B) {
+        ffts_aligned_free(p->B);
+    }
+
+    if (p->A) {
+        ffts_aligned_free(p->A);
+    }
+
+    if (p->buf) {
+        ffts_aligned_free(p->buf);
+    }
+
+    /* the nested plan is only present once its creation has succeeded */
+    if (p->plans[0]) {
+        ffts_free(p->plans[0]);
+    }
+
+    free(p);
+}
+
+/*
+ * Create a single precision chirp-z plan for a transform of size N.
+ *
+ * N     transform size (assumed to be greater than 2)
+ * sign  negative selects the forward transform, otherwise the inverse
+ *
+ * The plan owns a nested forward FFT plan of size M, the smallest power of
+ * two with M >= 2*N-1, the chirp sequence A (N complex values), the
+ * transformed and scaled chirp B (M complex values) and a scratch buffer of
+ * 2*M complex values.
+ *
+ * Returns the new plan, or NULL when any allocation or the nested plan
+ * creation fails.
+ */
+ffts_plan_t*
+ffts_chirp_z_init(size_t N, int sign)
+{
+    float *A, *B, reciprocal_M, *tmp;
+    ffts_plan_t *p;
+    size_t i, M;
+
+    FFTS_ASSUME(N > 2);
+
+    /* the single nested plan pointer is stored right behind the plan */
+    p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
+    if (!p)
+        return NULL;
+
+    p->destroy = ffts_chirp_z_free;
+    p->N = N;
+    p->rank = 1;
+    p->plans = (ffts_plan_t**) &p[1];
+
+    if (sign < 0)
+        p->transform = ffts_chirp_z_transform_f_32f;
+    else
+        p->transform = ffts_chirp_z_transform_i_32f;
+
+    /* determine the next power of two such that M >= 2*N-1 */
+    M = ffts_next_power_of_2(2*N-1);
+    p->plans[0] = ffts_init_1d(M, FFTS_FORWARD);
+    if (!p->plans[0])
+        goto cleanup;
+
+    p->A = A = (float*) ffts_aligned_malloc(2 * N * sizeof(float));
+    if (!p->A)
+        goto cleanup;
+
+    p->B = B = (float*) ffts_aligned_malloc(2 * M * sizeof(float));
+    if (!p->B)
+        goto cleanup;
+
+    /* scratch space for two sequences of M complex values */
+    p->buf = tmp = (float*) ffts_aligned_malloc(2 * 2 * M * sizeof(float));
+    if (!p->buf)
+        goto cleanup;
+
+    ffts_generate_chirp_32f((ffts_cpx_32f*) A, N);
+
+    /* scale with reciprocal of length; the sequence is mirrored so that
+       element M - i equals element i */
+    reciprocal_M = 1.0f / M;
+    tmp[0] = A[0] * reciprocal_M;
+    tmp[1] = A[1] * reciprocal_M;
+    for (i = 1; i < N; ++i) {
+        tmp[2 * i + 0] = tmp[2 * (M - i) + 0] = A[2 * i + 0] * reciprocal_M;
+        tmp[2 * i + 1] = tmp[2 * (M - i) + 1] = A[2 * i + 1] * reciprocal_M;
+    }
+
+    /* zero pad */
+    for (; i <= M - N; ++i)
+        tmp[2 * i] = tmp[2 * i + 1] = 0.0f;
+
+    /* FFT */
+    p->plans[0]->transform(p->plans[0], tmp, B);
+    return p;
+
+cleanup:
+    ffts_chirp_z_free(p);
+    return NULL;
+}
+
+/*
+ * Forward chirp-z transform of p->N interleaved complex floats.
+ *
+ * p    chirp-z plan created by ffts_chirp_z_init()
+ * in   input, read as p->N (re, im) float pairs
+ * out  output, written as p->N (re, im) float pairs
+ *
+ * Uses the plan's scratch buffer (p->buf) as two work arrays of M complex
+ * values each, where M is the size of the nested plan.
+ */
+static void
+ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out)
+{
+    const float *A = FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *B = FFTS_ASSUME_ALIGNED_32(p->B);
+    size_t i, M = p->plans[0]->N, N = p->N;
+    float *t1 = (float*) FFTS_ASSUME_ALIGNED_32(p->buf);
+    float *t2 = FFTS_ASSUME_ALIGNED_32(&t1[2 * M]);
+    const float *din = (const float*) in;
+    float *dout = (float*) out;
+
+    /* we know this: ffts_chirp_z_init() assumes N > 2 and picks M >= 2*N-1 */
+    FFTS_ASSUME(M >= 8);
+
+    /* multiply input with conjugated sequence: t1 = din * conj(A) */
+    for (i = 0; i < N; ++i) {
+        t1[2 * i + 0] = din[2 * i + 0] * A[2 * i + 0] + din[2 * i + 1] * A[2 * i + 1];
+        t1[2 * i + 1] = din[2 * i + 1] * A[2 * i + 0] - din[2 * i + 0] * A[2 * i + 1];
+    }
+
+    /* zero pad */
+    for (; i < M; ++i)
+        t1[2 * i] = t1[2 * i + 1] = 0.0f;
+
+    /* convolution using FFT */
+    p->plans[0]->transform(p->plans[0], t1, t2);
+
+    /* complex multiply t2 * B, stored with real and imaginary parts swapped */
+    for (i = 0; i < M; ++i) {
+        t1[2 * i + 0] = t2[2 * i + 1] * B[2 * i + 0] + t2[2 * i + 0] * B[2 * i + 1];
+        t1[2 * i + 1] = t2[2 * i + 0] * B[2 * i + 0] - t2[2 * i + 1] * B[2 * i + 1];
+    }
+
+    /* IFFT using FFT with real and imaginary parts swapped */
+    p->plans[0]->transform(p->plans[0], t1, t2);
+
+    /* swap the parts back and multiply output with conjugated sequence */
+    for (i = 0; i < N; ++i) {
+        dout[2 * i + 0] = t2[2 * i + 1] * A[2 * i + 0] + t2[2 * i + 0] * A[2 * i + 1];
+        dout[2 * i + 1] = t2[2 * i + 0] * A[2 * i + 0] - t2[2 * i + 1] * A[2 * i + 1];
+    }
+}
+
+/*
+ * Inverse chirp-z transform of p->N interleaved complex floats, computed
+ * with the forward FFT by swapping real and imaginary parts (IFFT using FFT
+ * with real and imaginary parts swapped).
+ *
+ * p    chirp-z plan created by ffts_chirp_z_init()
+ * in   input, read as p->N (re, im) float pairs
+ * out  output, written as p->N (re, im) float pairs
+ *
+ * Uses the plan's scratch buffer (p->buf) as two work arrays of M complex
+ * values each, where M is the size of the nested plan.
+ */
+static void
+ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out)
+{
+    const float *A = FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *B = FFTS_ASSUME_ALIGNED_32(p->B);
+    size_t i, M = p->plans[0]->N, N = p->N;
+    float *t1 = (float*) FFTS_ASSUME_ALIGNED_32(p->buf);
+    float *t2 = FFTS_ASSUME_ALIGNED_32(&t1[2 * M]);
+    const float *din = (const float*) in;
+    float *dout = (float*) out;
+
+    /* we know this: ffts_chirp_z_init() assumes N > 2 and picks M >= 2*N-1 */
+    FFTS_ASSUME(M >= 8);
+
+    /* multiply input with the sequence (din * A), stored with real and
+       imaginary parts swapped */
+    for (i = 0; i < N; ++i) {
+        t1[2 * i + 0] = din[2 * i + 1] * A[2 * i + 0] + din[2 * i + 0] * A[2 * i + 1];
+        t1[2 * i + 1] = din[2 * i + 0] * A[2 * i + 0] - din[2 * i + 1] * A[2 * i + 1];
+    }
+
+    /* zero pad */
+    for (; i < M; ++i)
+        t1[2 * i] = t1[2 * i + 1] = 0.0f;
+
+    /* convolution using FFT */
+    p->plans[0]->transform(p->plans[0], t1, t2);
+
+    /* complex multiply t2 * B, stored with real and imaginary parts swapped */
+    for (i = 0; i < M; ++i) {
+        t1[2 * i + 0] = t2[2 * i + 1] * B[2 * i + 0] + t2[2 * i + 0] * B[2 * i + 1];
+        t1[2 * i + 1] = t2[2 * i + 0] * B[2 * i + 0] - t2[2 * i + 1] * B[2 * i + 1];
+    }
+
+    /* IFFT using FFT with real and imaginary parts swapped */
+    p->plans[0]->transform(p->plans[0], t1, t2);
+
+    /* multiply output with the sequence (t2 * A), without swapping */
+    for (i = 0; i < N; ++i) {
+        dout[2 * i + 0] = t2[2 * i + 0] * A[2 * i + 0] - t2[2 * i + 1] * A[2 * i + 1];
+        dout[2 * i + 1] = t2[2 * i + 1] * A[2 * i + 0] + t2[2 * i + 0] * A[2 * i + 1];
+    }
+}
--- a/lib/ffts/src/ffts_chirp_z.h
+++ b/lib/ffts/src/ffts_chirp_z.h
@ -0,0 +1,45 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_CHIRP_Z_H
+#define FFTS_CHIRP_Z_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts.h"
+
+/*
+ * Create a single precision chirp-z plan of size N; a negative sign selects
+ * the forward transform, any other value the inverse. Returns NULL on
+ * failure.
+ */
+ffts_plan_t*
+ffts_chirp_z_init(size_t N, int sign);
+
+#endif /* FFTS_CHIRP_Z_H */
--- a/lib/ffts/src/ffts_cpu.c
+++ b/lib/ffts/src/ffts_cpu.c
@ -0,0 +1,371 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_cpu.h"
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+#include <stdio.h>
+#endif
+
+#if defined(_WIN32)
+#include <intrin.h>
+#include <windows.h>
+#endif
+
+/* TODO: add detection/declaration of these to CMake phase */
+#if !defined(FFTS_CPU_X64)
+#if defined(_M_AMD64) || defined(__amd64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64) || defined(__x86_64__)
+/* 64 bit x86 detected */
+#define FFTS_CPU_X64
+#endif
+#endif
+
+#if !defined(FFTS_CPU_X64) && !defined(FFTS_CPU_X86)
+#if defined(i386) || defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__X86__) || defined(_X86_)
+/* 32 bit x86 detected */
+#define FFTS_CPU_X86
+#endif
+#endif
+
+/* check if build is 32 bit or 64 bit x86 */
+#if defined(FFTS_CPU_X64) || defined(FFTS_CPU_X86)
+
+/* Built and tested on
+CentOS 6.8 2.6.32-642.11.1.el6.x86_64 - gcc version 4.4.7 20120313
+Mac OSX 10.9 - Apple Clang 6.0
+Ubuntu 14.04 LTS 4.2.0-42 x86_64 - gcc version 4.8.4
+Windows XP SP3 - Visual Studio 2005 SP1 x86/x64
+Windows Vista SP2 - Visual Studio 2010 SP1 x86/x64
+Windows 7 Ultimate SP1 - Visual Studio 2015 x86/x64
+Windows 7 Ultimate SP1 - gcc version 4.9.2 (i686-posix-dwarf-rev1)
+Windows 7 Ultimate SP1 - gcc version 4.9.2 (x86_64-posix-seh-rev3)
+Windows 10 Pro - Visual Studio 2017 x86/x64
+*/
+
+/* Visual Studio 2010 SP1 or newer have _xgetbv intrinsic */
+#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219)
+#define FFTS_HAVE_XGETBV
+#endif
+
+#ifndef BIT
+/* the argument is parenthesized so that expressions can be passed safely */
+#define BIT(n) (1u << (n))
+#endif
+
+/* bit masks */
+
+/* cpuid leaf 1, edx */
+#define FFTS_CPU_X86_SSE_BITS    (BIT(0) | BIT(15) | BIT(23) | BIT(24) | BIT(25))
+#define FFTS_CPU_X86_SSE2_BITS   (BIT(26))
+
+/* cpuid leaf 1, ecx */
+#define FFTS_CPU_X86_SSE3_BITS   (BIT(0))
+#define FFTS_CPU_X86_SSSE3_BITS  (BIT(9))
+#define FFTS_CPU_X86_SSE4_1_BITS (BIT(19))
+#define FFTS_CPU_X86_SSE4_2_BITS (BIT(20) | BIT(23))
+#define FFTS_CPU_X86_AVX_BITS    (BIT(26) | BIT(27) | BIT(28))
+
+/* XCR0 state bits required for AVX; equals the 0x6 mask tested in
+   ffts_cpu_detect() */
+#define FFTS_CPU_X86_XCR0_BITS   (BIT(1) | BIT(2))
+
+/* cpuid leaf 7 subleaf 0, ebx */
+#define FFTS_CPU_X86_AVX2_BITS   (BIT(5))
+#define FFTS_CPU_X86_AVX512_BITS (BIT(16))
+
+/* Visual Studio 2008 or older */
+/*
+ * Stand-in for the __cpuidex intrinsic on x64 builds with these compilers.
+ * The argument order (subleaf first) is deliberate, see the comment in the
+ * body; optimization is switched off around the function for the same
+ * reason.
+ */
+#if defined(FFTS_CPU_X64) && defined(_MSC_VER) && _MSC_VER <= 1500
+#pragma optimize("", off)
+static void __fastcall ffts_cpuidex(int subleaf, int regs[4], int leaf)
+{
+    /* x64 uses a four register fast-call calling convention by default and
+       arguments are passed in registers RCX, RDX, R8, and R9. By disabling
+       optimization and passing subleaf as first argument we get __cpuidex
+    */
+    (void) subleaf;
+    __cpuid(regs, leaf);
+}
+#pragma optimize("", on)
+#endif
+
+/*
+ * Execute the cpuid instruction.
+ *
+ * regs     receives eax, ebx, ecx and edx, in that order
+ * leaf     value loaded into eax before the instruction
+ * subleaf  value loaded into ecx before the instruction
+ *
+ * With an unknown compiler all four output values are set to zero.
+ */
+static FFTS_INLINE void ffts_cpuid(int regs[4], int leaf, int subleaf)
+{
+#if defined(_MSC_VER)
+#if defined(FFTS_CPU_X64)
+    /* Visual Studio 2010 or newer */
+#if _MSC_VER > 1500
+    __cpuidex(regs, leaf, subleaf);
+#else
+    ffts_cpuidex(subleaf, regs, leaf);
+#endif
+#else
+    /* 32 bit Visual Studio: inline assembly */
+    __asm {
+        mov eax, leaf
+        mov ecx, subleaf
+        mov esi, regs
+        cpuid
+        mov [esi + 0x0], eax
+        mov [esi + 0x4], ebx
+        mov [esi + 0x8], ecx
+        mov [esi + 0xc], edx
+    }
+#endif
+#elif defined(__GNUC__) && __GNUC__
+#if defined(FFTS_CPU_X64)
+    __asm__ __volatile__(
+        "cpuid\n\t"
+        : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
+        : "a"(leaf), "c"(subleaf));
+#elif defined(__PIC__)
+    /* 32 bit position independent code: ebx is exchanged with another
+       register around cpuid instead of being listed as an output */
+    __asm__ __volatile__(
+        "xchgl %%ebx, %1\n\t"
+        "cpuid          \n\t"
+        "xchgl %%ebx, %1\n\t"
+        : "=a"(regs[0]), "=r"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
+        : "a"(leaf), "c"(subleaf));
+#else
+    __asm__ __volatile__(
+        "cpuid\n\t"
+        : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
+        : "a"(leaf), "c"(subleaf));
+#endif
+#else
+    /* unknown compiler for x86 */
+    regs[0] = regs[1] = regs[2] = regs[3] = 0;
+#endif
+}
+
+/* at least Visual Studio 2010 generates invalid optimized code for _xgetbv */
+#if defined(FFTS_HAVE_XGETBV)
+#pragma optimize("", off)
+#endif
+/*
+ * Return the low 32 bits of extended control register XCR0, or 0 when the
+ * value cannot be obtained (unknown compiler, or GetEnabledXStateFeatures
+ * is not exported by kernel32.dll).
+ */
+static FFTS_INLINE unsigned int ffts_get_xcr0(void)
+{
+#if defined(FFTS_HAVE_XGETBV)
+    return (unsigned int) _xgetbv(0);
+#elif defined(_MSC_VER)
+#if defined(FFTS_CPU_X64)
+    /* emulate xgetbv(0) on Windows 7 SP1 or newer */
+    typedef DWORD64 (WINAPI *PGETENABLEDXSTATEFEATURES)(VOID);
+    PGETENABLEDXSTATEFEATURES pfnGetEnabledXStateFeatures = 
+        (PGETENABLEDXSTATEFEATURES) GetProcAddress(
+        GetModuleHandle(TEXT("kernel32.dll")), "GetEnabledXStateFeatures");
+    return pfnGetEnabledXStateFeatures ? (unsigned int) pfnGetEnabledXStateFeatures() : 0;
+#else
+    /* note that we have to touch the edx register to tell the compiler it is used by the emitted xgetbv */
+    unsigned __int32 hi, lo;
+    __asm {
+        xor ecx, ecx
+        _emit 0x0f
+        _emit 0x01
+        _emit 0xd0
+        mov lo, eax
+        mov hi, edx
+    }
+    return (unsigned int) lo;
+#endif
+#elif defined(__GNUC__) && __GNUC__
+    /* xgetbv emitted as raw bytes with ecx = 0; edx is declared clobbered */
+    unsigned int lo;
+    __asm__ __volatile__(".byte 0x0f, 0x01, 0xd0\n"
+        : "=a"(lo)
+        : "c"(0)
+        : "edx");
+    return lo;
+#else
+    /* unknown x86 compiler */
+    return 0;
+#endif
+}
+#if defined(FFTS_HAVE_XGETBV)
+#pragma optimize("", on)
+#endif
+
+/*
+ * Detect the SIMD instruction sets usable on the running x86 CPU.
+ *
+ * extra_flags  optional; when non-NULL it receives the extra flags, which
+ *              this implementation always leaves at 0
+ *
+ * Returns a bit mask of FFTS_CPU_X86_* flags. Feature levels are tested in
+ * ascending order and detection stops at the first missing one. The result
+ * is kept in function-local statics, so detection runs on the first call
+ * only.
+ */
+int
+ffts_cpu_detect(int *extra_flags)
+{
+    static int cpu_flags = -1;
+    static int cpu_extra_flags = -1;
+    int max_basic_func;
+    int regs[4];
+    unsigned int xcr0;
+
+    if (cpu_flags >= 0) {
+        goto exit;
+    }
+
+    /* initialize */
+    cpu_flags = cpu_extra_flags = 0;
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid check: ");
+#endif
+#if defined(FFTS_CPU_X64)
+    /* cpuid is always supported on x64 */
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("skipped\n");
+#endif
+#else
+#if defined(_MSC_VER)
+    _asm {
+        pushfd
+        pop eax
+        mov ebx,eax
+        xor eax,200000h
+        push eax
+        popfd
+        pushfd
+        pop eax
+        push ebx
+        popfd
+        mov regs[0 * TYPE regs],eax
+        mov regs[1 * TYPE regs],ebx
+    }
+#else
+    __asm__ (
+        "pushfl\n\t"
+        "pop %0\n\t"
+        "movl %0,%1\n\t"
+        "xorl $0x200000,%0\n\t"
+        "pushl %0\n\t"
+        "popfl\n\t"
+        "pushfl\n\t"
+        "popl %0\n\t"
+        "pushl %1\n\t"
+        "popfl\n\t"
+        : "=r" (regs[0]), "=r" (regs[1])
+    );
+#endif
+    /* check CPUID bit (bit 21) in EFLAGS register can be toggled */
+    if (((regs[0] ^ regs[1]) & 0x200000) == 0) {
+#if defined(FFTS_BUILDING_CPU_TEST)
+        printf("not supported\n");
+#endif
+        goto exit;
+    }
+#if defined(FFTS_BUILDING_CPU_TEST)
+        printf("supported\n");
+#endif
+#endif
+
+    /* get the number of basic functions */
+    ffts_cpuid(regs, 0, 0);
+    max_basic_func = regs[0];
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid eax=0, ecx=0: %d\n", max_basic_func);
+#endif
+    if (max_basic_func == 0)
+        goto exit;
+
+    /* get feature flags */
+    ffts_cpuid(regs, 1, 0);
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid eax=1, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
+#endif
+
+#if defined(FFTS_CPU_X64)
+    /* minimum for any x64 */
+    cpu_flags = FFTS_CPU_X86_SSE | FFTS_CPU_X86_SSE2;
+#else
+    /* test if SSE is supported */
+    if ((regs[3] & FFTS_CPU_X86_SSE_BITS) != FFTS_CPU_X86_SSE_BITS)
+        goto exit;
+    cpu_flags = FFTS_CPU_X86_SSE;
+
+    /* test if SSE2 is supported */
+    if (!(regs[3] & FFTS_CPU_X86_SSE2_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE2;
+#endif
+
+    /* test if SSE3 is supported */
+    if (!(regs[2] & FFTS_CPU_X86_SSE3_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE3;
+
+    /* test if SSSE3 is supported */
+    if (!(regs[2] & FFTS_CPU_X86_SSSE3_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSSE3;
+
+    /* test if SSE4.1 is supported */
+    if (!(regs[2] & FFTS_CPU_X86_SSE4_1_BITS))
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE4_1;
+
+    /* test if SSE4.2 is supported */
+    if ((regs[2] & FFTS_CPU_X86_SSE4_2_BITS) != FFTS_CPU_X86_SSE4_2_BITS)
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_SSE4_2;
+
+    /* test if AVX is supported */
+    if ((regs[2] & FFTS_CPU_X86_AVX_BITS) != FFTS_CPU_X86_AVX_BITS)
+        goto exit;
+
+    /* test if 128-bit SSE and 256-bit AVX states (bits 1 and 2) are enabled in XCR0 */
+    xcr0 = ffts_get_xcr0();
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("xcr0: %u\n", xcr0);
+#endif
+    if ((xcr0 & 0x6) != 0x6)
+        goto exit;
+
+    cpu_flags |= FFTS_CPU_X86_AVX;
+
+    /* check that cpuid extended features exist */
+    if (max_basic_func < 7)
+        goto exit;
+
+    /* get extended features */
+    ffts_cpuid(regs, 7, 0);
+
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("cpuid eax=7, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
+#endif
+
+    /* test if AVX2 is supported */
+    if ((regs[1] & FFTS_CPU_X86_AVX2_BITS) != FFTS_CPU_X86_AVX2_BITS)
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_AVX2;
+
+    /* test if AVX512 is supported */
+    if ((regs[1] & FFTS_CPU_X86_AVX512_BITS) != FFTS_CPU_X86_AVX512_BITS)
+        goto exit;
+
+    /* the operating system must also have enabled the opmask, ZMM_Hi256
+       and Hi16_ZMM states (XCR0 bits 5, 6 and 7), otherwise AVX-512
+       instructions fault */
+    if ((xcr0 & 0xe0) != 0xe0)
+        goto exit;
+    cpu_flags |= FFTS_CPU_X86_AVX512;
+
+exit:
+    if (extra_flags) {
+        *extra_flags = cpu_extra_flags;
+    }
+    return cpu_flags;
+}
+#else 
+/*
+ * Fallback for non-x86 builds: no detection is performed. Returns 0 (no
+ * feature flags) and, when extra_flags is non-NULL, stores 0 there so the
+ * caller never reads an uninitialized value.
+ */
+int
+ffts_cpu_detect(int *extra_flags)
+{
+    /* not implemented */
+#if defined(FFTS_BUILDING_CPU_TEST)
+    printf("CPU detection not implemented!!\n");
+#endif
+    if (extra_flags) {
+        *extra_flags = 0;
+    }
+    return 0;
+}
+#endif
--- a/lib/ffts/src/ffts_cpu.h
+++ b/lib/ffts/src/ffts_cpu.h
@ -0,0 +1,54 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_CPU_H
+#define FFTS_CPU_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts_internal.h"
+
+/* feature flags combined into the return value of ffts_cpu_detect() */
+#define FFTS_CPU_X86_SSE    0x001
+#define FFTS_CPU_X86_SSE2   0x002
+#define FFTS_CPU_X86_SSE3   0x004
+#define FFTS_CPU_X86_SSSE3  0x008
+#define FFTS_CPU_X86_SSE4_1 0x010
+#define FFTS_CPU_X86_SSE4_2 0x020
+#define FFTS_CPU_X86_AVX    0x040
+#define FFTS_CPU_X86_AVX2   0x080
+#define FFTS_CPU_X86_AVX512 0x100
+
+/*
+ * Returns a bit mask of the FFTS_CPU_X86_* flags above, or 0 when nothing
+ * could be detected. extra_flags may be NULL.
+ */
+int
+ffts_cpu_detect(int *extra_flags);
+
+#endif /* FFTS_CPU_H */
--- a/lib/ffts/src/ffts_internal.h
+++ b/lib/ffts/src/ffts_internal.h
@ -2,6 +2,7 @@

 This file is part of FFTS -- The Fastest Fourier Transform in the South

+Copyright (c) 2015-2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
 Copyright (c) 2012, The University of Waikato

@ -34,7 +35,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifndef FFTS_INTERNAL_H
 #define FFTS_INTERNAL_H

+#ifdef AUTOTOOLS_BUILD
 #include "config.h"
+#endif
+
 #include "ffts_attributes.h"
 #include "types.h"

@ -42,18 +46,59 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <malloc.h>
 #endif

+/*
+ * The build system defines HAVE_MM_MALLOC_H when <mm_malloc.h> is available.
+ * That header provides _mm_malloc/_mm_free, so advertise them through
+ * HAVE__MM_MALLOC for the aligned allocation helpers below.
+ */
+#ifdef HAVE_MM_MALLOC_H
+#include <mm_malloc.h>
+#ifndef HAVE__MM_MALLOC
+#define HAVE__MM_MALLOC
+#endif
+#endif
+
 #include <stddef.h>

-#ifdef HAVE_STDINT_H
+#ifdef HAVE_INTTYPES_H
+#include <inttypes.h>
+#elif HAVE_STDINT_H
 #include <stdint.h>
+#elif _MSC_VER
+typedef __int32 int32_t;
+typedef __int64 int64_t;
+typedef unsigned __int32 uint32_t;
+typedef unsigned __int64 uint64_t;
+#else
+typedef signed long int	int32_t;
+typedef unsigned long int uint32_t;
+typedef signed long long int int64_t; 
+typedef unsigned long long int uint64_t;
 #endif

 #ifdef HAVE_STDLIB_H
 #include <stdlib.h>
 #endif

+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif
+
 #include <stdio.h>

+#if defined(HAVE_DECL_MEMALIGN) && !HAVE_DECL_MEMALIGN
+extern void *memalign(size_t, size_t);
+#endif
+
+#if defined(HAVE_DECL_POSIX_MEMALIGN) && !HAVE_DECL_POSIX_MEMALIGN
+extern int posix_memalign(void **, size_t, size_t);
+#endif
+
+#if defined(HAVE_DECL_VALLOC) && !HAVE_DECL_VALLOC
+extern void *valloc(size_t);
+#endif
+
+#ifdef _mm_malloc
+#ifndef HAVE__MM_MALLOC
+#define HAVE__MM_MALLOC
+#endif
+#endif
+
 #ifdef ENABLE_LOG
 #ifdef __ANDROID__
 #include <android/log.h>
@ -142,11 +187,9 @@ struct _ffts_plan_t {
     */
    size_t transform_size;

-    /**
-     * Points to the cosnant variables used by
-     * the Assembly Code
-     */
-    void *constants;
+    /* pointer to the constant variable used by SSE for sign change */
+    /* TODO: #ifdef HAVE_SSE */
+    const void *constants;

    // multi-dimensional stuff:
    struct _ffts_plan_t **plans;
@ -171,44 +214,96 @@ struct _ffts_plan_t {
    size_t i2;
 };

-static FFTS_INLINE void *ffts_aligned_malloc(size_t size)
+/*
+ * Allocate `size` bytes for use by the transforms.
+ *
+ * Exactly one allocator is selected at compile time. The _mm_malloc,
+ * posix_memalign, memalign and _aligned_malloc branches explicitly request
+ * 32-byte alignment; the vec_malloc, valloc and malloc branches rely on
+ * whatever alignment those allocators provide.
+ *
+ * Returns NULL when posix_memalign reports failure; the other branches
+ * return the allocator's result unchanged. Memory must be released with
+ * ffts_aligned_free, whose branch order mirrors this function.
+ *
+ * NOTE(review): the Windows branch tests WIN32 where the previous code tested
+ * _WIN32 -- confirm WIN32 is defined by every Windows toolchain in use.
+ */
+static FFTS_INLINE void*
+ffts_aligned_malloc(size_t size)
 {
-#if defined(_WIN32)
-    return _aligned_malloc(size, 32);
+    void *p = NULL;
+
+    /* various ways to allocate aligned memory in order of preference */
+#if defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
+    p = (void*) _mm_malloc(size, 32);
+#elif defined(HAVE_POSIX_MEMALIGN)
+    if (posix_memalign(&p, 32, size))
+        p = NULL;
+#elif defined(HAVE_MEMALIGN)
+    p = memalign(32, size);
+#elif defined(__ALTIVEC__)
+    p = vec_malloc(size);
+#elif defined(_MSC_VER) || defined(WIN32)
+    p = _aligned_malloc(size, 32);
+#elif defined(HAVE_VALLOC)
+    p = valloc(size);
 #else
-    return valloc(size);
+    p = malloc(size);
 #endif
+
+    return p;
 }

-static FFTS_INLINE void ffts_aligned_free(void *p)
+/*
+ * Release memory obtained from ffts_aligned_malloc.
+ *
+ * The preprocessor branches mirror the ones in ffts_aligned_malloc so that
+ * each allocator is paired with its matching release function
+ * (_mm_malloc/_mm_free, vec_malloc/vec_free, _aligned_malloc/_aligned_free,
+ * everything else/free).
+ */
+static FFTS_INLINE
+void ffts_aligned_free(void *p)
 {
-#if defined(_WIN32)
+    /* order must match with ffts_aligned_malloc */
+#if defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
+    _mm_free(p);
+#elif defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_MEMALIGN)
+    free(p);
+#elif defined(__ALTIVEC__)
+    vec_free(p);
+#elif defined(_MSC_VER) || defined(WIN32)
    _aligned_free(p);
 #else
+    /* valloc or malloc */
    free(p);
 #endif
 }

 #if GCC_VERSION_AT_LEAST(3,3)
 #define ffts_ctzl __builtin_ctzl
+
+/*
+ * Return the power of two strictly greater than N, i.e. one bit above the
+ * most significant set bit of N (N = 4 yields 8, N = 5 yields 8).
+ *
+ * The operand width is derived from the builtin's argument type rather than
+ * hard-coded as 32: where the counted type is 64 bits wide, "32 - clz" is
+ * negative for small N and the shift is undefined. The shifted constant is a
+ * size_t so the result is not limited to the width of int.
+ *
+ * N must be non-zero (the builtin is undefined for 0) and the result must be
+ * representable in size_t.
+ */
+static FFTS_INLINE size_t
+ffts_next_power_of_2(size_t N)
+{
+    const int bits = (int) (sizeof(unsigned long long) * __CHAR_BIT__);
+
+    return (size_t) 1 << (bits - __builtin_clzll((unsigned long long) N));
+}
 #elif defined(_MSC_VER)
 #include <intrin.h>
 #ifdef _M_X64
 #pragma intrinsic(_BitScanForward64)
-static __inline unsigned long ffts_ctzl(size_t N)
+/*
+ * 64-bit MSVC replacement for __builtin_ctzl: returns the bit index that
+ * _BitScanForward64 reports for N.
+ * NOTE(review): the intrinsic's return value is ignored, so `count` is only
+ * meaningful when N is non-zero -- confirm callers never pass 0.
+ */
+static FFTS_INLINE unsigned long
+ffts_ctzl(size_t N)
 {
    unsigned long count;
    _BitScanForward64((unsigned long*) &count, N);
    return count;
 }
+
+#pragma intrinsic(_BitScanReverse64)
+/*
+ * 64-bit MSVC variant: returns the power of two one bit above the index
+ * that _BitScanReverse64 reports for N (N = 4 yields 8).
+ * The intrinsic's return value is not checked, so N is expected to be
+ * non-zero.
+ */
+static FFTS_INLINE size_t
+ffts_next_power_of_2(size_t N)
+{
+    unsigned long msb_index;
+
+    _BitScanReverse64(&msb_index, N);
+
+    return 1ULL << (msb_index + 1);
+}
 #else
 #pragma intrinsic(_BitScanForward)
-static __inline unsigned long ffts_ctzl(size_t N)
+/*
+ * 32-bit MSVC replacement for __builtin_ctzl: returns the bit index that
+ * _BitScanForward reports for N.
+ * NOTE(review): the intrinsic's return value is ignored, so `count` is only
+ * meaningful when N is non-zero -- confirm callers never pass 0.
+ */
+static FFTS_INLINE unsigned long
+ffts_ctzl(size_t N)
 {
    unsigned long count;
    _BitScanForward((unsigned long*) &count, N);
    return count;
 }
+
+#pragma intrinsic(_BitScanReverse)
+/*
+ * 32-bit MSVC variant: returns the power of two one bit above the index
+ * that _BitScanReverse reports for N (N = 4 yields 8).
+ *
+ * The shifted constant is a size_t rather than a plain int so that the
+ * largest representable result (bit 31) does not shift into the sign bit of
+ * a signed int. The intrinsic's return value is not checked, so N is
+ * expected to be non-zero.
+ */
+static FFTS_INLINE size_t
+ffts_next_power_of_2(size_t N)
+{
+    unsigned long log_2;
+    _BitScanReverse((unsigned long*)&log_2, N);
+    return (size_t) 1 << (log_2 + 1);
+}
 #endif /* _WIN64 */
 #endif /* _MSC_VER */

--- a/lib/ffts/src/ffts_real.c
+++ b/lib/ffts/src/ffts_real.c
@ -4,7 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South

 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
 Copyright (c) 2012, The University of Waikato
-Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2015 - 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>

 All rights reserved.

@ -33,6 +33,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #include "ffts_real.h"
+#include "ffts_cpu.h"
 #include "ffts_internal.h"
 #include "ffts_trig.h"

@ -46,7 +47,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <pmmintrin.h>
 #elif HAVE_INTRIN_H
 #include <intrin.h>
-#else
+#endif
+
 /* avoid using negative zero as some configurations have problems with those */
 static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = {
    0x80000000, 0, 0x80000000, 0
@ -55,7 +57,6 @@ static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = {
    0, 0x80000000, 0, 0x80000000
 };
 #endif
-#endif

 static void
 ffts_free_1d_real(ffts_plan_t *p)
@ -79,8 +80,9 @@ ffts_free_1d_real(ffts_plan_t *p)
    free(p);
 }

+#ifdef __ARM_NEON__
 static void
-ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
+ffts_execute_1d_real_neon(ffts_plan_t *p, const void *input, void *output)
 {
    float *const FFTS_RESTRICT out =
        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
@ -91,25 +93,19 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
    const float *const FFTS_RESTRICT B =
        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
    const int N = (const int) p->N;
-    int i;
-
-#ifdef __ARM_NEON__
    float *p_buf0 = buf;
    float *p_buf1 = buf + N - 2;
    float *p_out = out;
-#endif
+    int i;

    /* we know this */
    FFTS_ASSUME(N/2 > 0);

    p->plans[0]->transform(p->plans[0], input, buf);

-#ifndef HAVE_SSE
    buf[N + 0] = buf[0];
    buf[N + 1] = buf[1];
-#endif

-#ifdef __ARM_NEON__
    for (i = 0; i < N; i += 4) {
        __asm__ __volatile__ (
            "vld1.32 {q8},  [%[pa]]!\n\t"
@ -151,7 +147,35 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
            : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
        );
    }
-#elif HAVE_SSE3
+
+    out[N + 0] = buf[0] - buf[1];
+    out[N + 1] = 0.0f;
+}
+#endif
+
+#if HAVE_SSE3
+static void
+ffts_execute_1d_real_sse3(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT out =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
+    p->plans[0]->transform(p->plans[0], input, buf);
+
+    buf[N + 0] = buf[0];
+    buf[N + 1] = buf[1];
+
    if (FFTS_UNLIKELY(N <= 8)) {
        __m128 t0 = _mm_load_ps(buf);
        __m128 t1 = _mm_load_ps(buf + N - 4);
@ -235,7 +259,32 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
        }
    }
-#elif HAVE_SSE
+
+    out[N + 0] = buf[0] - buf[1];
+    out[N + 1] = 0.0f;
+}
+#endif
+
+#ifdef HAVE_SSE
+static void
+ffts_execute_1d_real_sse(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT out =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
+    p->plans[0]->transform(p->plans[0], input, buf);
+
    if (FFTS_UNLIKELY(N <= 8)) {
        __m128 c0 = _mm_load_ps((const float*) sign_mask_even);
        __m128 t0 = _mm_load_ps(buf);
@ -327,7 +376,34 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
                _MM_SHUFFLE(2,3,0,1)))));
        }
    }
-#else
+
+    out[N + 0] = buf[0] - buf[1];
+    out[N + 1] = 0.0f;
+}
+#endif
+
+static void
+ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT out =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
+    p->plans[0]->transform(p->plans[0], input, buf);
+
+    buf[N + 0] = buf[0];
+    buf[N + 1] = buf[1];
+
    for (i = 0; i < N/2; i++) {
        out[2*i + 0] =
            buf[    2*i + 0] * A[2*i + 0] - buf[    2*i + 1] * A[2*i + 1] +
@ -336,14 +412,14 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
            buf[    2*i + 1] * A[2*i + 0] + buf[    2*i + 0] * A[2*i + 1] +
            buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
    }
-#endif

    out[N + 0] = buf[0] - buf[1];
    out[N + 1] = 0.0f;
 }

+#ifdef __ARM_NEON__
 static void
-ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
+ffts_execute_1d_real_inv_neon(ffts_plan_t *p, const void *input, void *output)
 {
    float *const FFTS_RESTRICT in =
        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
@ -354,18 +430,14 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
    const float *const FFTS_RESTRICT B =
        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
    const int N = (const int) p->N;
-    int i;
-
-#ifdef __ARM_NEON__
    float *p_buf0 = in;
    float *p_buf1 = in + N - 2;
    float *p_out = buf;
-#endif
+    int i;

    /* we know this */
    FFTS_ASSUME(N/2 > 0);

-#ifdef __ARM_NEON__
    for (i = 0; i < N/2; i += 2) {
        __asm__ __volatile__ (
            "vld1.32 {q8},  [%[pa]]!\n\t"
@ -407,7 +479,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
            : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
        );
    }
-#elif HAVE_SSE3
+
+    p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+#if HAVE_SSE3
+static void
+ffts_execute_1d_real_inv_sse3(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT in =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
    if (FFTS_UNLIKELY(N <= 8)) {
        __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
        __m128 t1 = _mm_load_ps(in);
@ -492,7 +586,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
        }
    }
-#elif HAVE_SSE
+
+    p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+#if HAVE_SSE
+static void
+ffts_execute_1d_real_inv_sse(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT in =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
    if (FFTS_UNLIKELY(N <= 8)) {
        __m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
        __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
@ -585,7 +701,28 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
                _mm_xor_ps(t4, c0))));
        }
    }
-#else
+
+    p->plans[0]->transform(p->plans[0], buf, output);
+}
+#endif
+
+static void
+ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT in =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);
+
    for (i = 0; i < N/2; i++) {
        buf[2*i + 0] =
            in[    2*i + 0] * A[2*i + 0] + in[    2*i + 1] * A[2*i + 1] +
@ -594,7 +731,6 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
            in[    2*i + 1] * A[2*i + 0] - in[    2*i + 0] * A[2*i + 1] -
            in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
    }
-#endif

    p->plans[0]->transform(p->plans[0], buf, output);
 }
@ -602,18 +738,35 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
 FFTS_API ffts_plan_t*
 ffts_init_1d_real(size_t N, int sign)
 {
+#ifndef __ARM_NEON__
+    int cpu_flags = ffts_cpu_detect(NULL);
+#endif
    ffts_plan_t *p;
+    int invert = 0;

    p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
    if (!p) {
        return NULL;
    }

-    if (sign < 0) {
-        p->transform = &ffts_execute_1d_real;
-    } else {
-        p->transform = &ffts_execute_1d_real_inv;
+    /*
+     * Select the transform kernel. sign < 0 selects the forward kernel,
+     * otherwise the inverse one. NEON builds pick the NEON pair at compile
+     * time; other builds consult the detected CPU flags and fall back to the
+     * portable C kernels. Only the SSE3 kernels set `invert`.
+     */
+#ifdef __ARM_NEON__
+    p->transform = (sign < 0) ? &ffts_execute_1d_real_neon : &ffts_execute_1d_real_inv_neon;
+#else
+#ifdef HAVE_SSE3
+    if (cpu_flags & FFTS_CPU_X86_SSE3) {
+        p->transform = (sign < 0) ? &ffts_execute_1d_real_sse3 : &ffts_execute_1d_real_inv_sse3;
+        invert = 1;
+    } else
+#endif
+#ifdef HAVE_SSE
+    if (cpu_flags & FFTS_CPU_X86_SSE) {
+        p->transform = (sign < 0) ? &ffts_execute_1d_real_sse : &ffts_execute_1d_real_inv_sse;
+    } else
+#endif
+    {
+        p->transform = (sign < 0) ? &ffts_execute_1d_real : &ffts_execute_1d_real_inv;
    }
+#endif

    p->destroy = &ffts_free_1d_real;
    p->N       = N;
@ -640,12 +793,7 @@ ffts_init_1d_real(size_t N, int sign)
        goto cleanup;
    }

-#ifdef HAVE_SSE3
-    ffts_generate_table_1d_real_32f(p, sign, 1);
-#else
-    ffts_generate_table_1d_real_32f(p, sign, 0);
-#endif
-
+    ffts_generate_table_1d_real_32f(p, sign, invert);
    return p;

 cleanup:
--- a/lib/ffts/src/ffts_static.c
+++ b/lib/ffts/src/ffts_static.c
@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South

 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
 Copyright (c) 2012, The University of Waikato
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>

 All rights reserved.

@ -258,6 +259,29 @@ static const FFTS_ALIGN(16) double ffts_constants_inv_64f[16] = {
    -0.7071067811865475244008443621048490392848359376884740
 };

+#ifdef FFTS_DOUBLE
+/*
+ * In-place combine of four double-precision vectors (counterpart of
+ * V4SF_K_0). With s = r2 + r3 and d = V4DF_IMULI(inv, r2 - r3):
+ *   r0 <- r0 + s,  r2 <- r0 - s,  r1 <- r1 - d,  r3 <- r1 + d
+ * where the right-hand sides use the values held on entry.
+ */
+static FFTS_INLINE void
+V4DF_K_0(int inv,
+         V4DF *r0,
+         V4DF *r1,
+         V4DF *r2,
+         V4DF *r3)
+{
+    /* snapshot the inputs before any output is written */
+    const V4DF in0 = *r0;
+    const V4DF in1 = *r1;
+    const V4DF sum23 = V4DF_ADD(*r2, *r3);
+    const V4DF imul23 = V4DF_IMULI(inv, V4DF_SUB(*r2, *r3));
+
+    *r0 = V4DF_ADD(in0, sum23);
+    *r2 = V4DF_SUB(in0, sum23);
+    *r1 = V4DF_SUB(in1, imul23);
+    *r3 = V4DF_ADD(in1, imul23);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_K_0(int inv,
         V4SF *r0,
@ -279,6 +303,31 @@ V4SF_K_0(int inv,
    *r3 = V4SF_ADD(t1, t3);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Load one vector from each of i0..i3 and emit pairwise sums and
+ * differences (counterpart of V4SF_L_2):
+ *   r0 = i0 + i1,  r1 = i0 - i1,  r2 = i2 + i3,  r3 = i2 - i3
+ */
+static FFTS_INLINE void
+V4DF_L_2(const double *FFTS_RESTRICT i0,
+         const double *FFTS_RESTRICT i1,
+         const double *FFTS_RESTRICT i2,
+         const double *FFTS_RESTRICT i3,
+         V4DF *r0,
+         V4DF *r1,
+         V4DF *r2,
+         V4DF *r3)
+{
+    /* all loads happen before the first store */
+    const V4DF a = V4DF_LD(i0);
+    const V4DF b = V4DF_LD(i1);
+    const V4DF c = V4DF_LD(i2);
+    const V4DF d = V4DF_LD(i3);
+
+    *r0 = V4DF_ADD(a, b);
+    *r1 = V4DF_SUB(a, b);
+    *r2 = V4DF_ADD(c, d);
+    *r3 = V4DF_SUB(c, d);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_L_2(const float *FFTS_RESTRICT i0,
         const float *FFTS_RESTRICT i1,
@ -302,6 +351,37 @@ V4SF_L_2(const float *FFTS_RESTRICT i0,
    *r3 = V4SF_SUB(t2, t3);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Load one vector from each of i0..i3 and combine them (counterpart of
+ * V4SF_L_4). With s01 = i0 + i1, d01 = i0 - i1, s23 = i2 + i3 and
+ * m23 = V4DF_IMULI(inv, i2 - i3):
+ *   r0 = s01 + s23,  r2 = s01 - s23,  r1 = d01 - m23,  r3 = d01 + m23
+ */
+static FFTS_INLINE void
+V4DF_L_4(int inv,
+         const double *FFTS_RESTRICT i0,
+         const double *FFTS_RESTRICT i1,
+         const double *FFTS_RESTRICT i2,
+         const double *FFTS_RESTRICT i3,
+         V4DF *r0,
+         V4DF *r1,
+         V4DF *r2,
+         V4DF *r3)
+{
+    const V4DF a = V4DF_LD(i0);
+    const V4DF b = V4DF_LD(i1);
+    const V4DF c = V4DF_LD(i2);
+    const V4DF d = V4DF_LD(i3);
+
+    const V4DF sum_ab = V4DF_ADD(a, b);
+    const V4DF dif_ab = V4DF_SUB(a, b);
+    const V4DF sum_cd = V4DF_ADD(c, d);
+    const V4DF mul_cd = V4DF_IMULI(inv, V4DF_SUB(c, d));
+
+    *r0 = V4DF_ADD(sum_ab, sum_cd);
+    *r2 = V4DF_SUB(sum_ab, sum_cd);
+    *r1 = V4DF_SUB(dif_ab, mul_cd);
+    *r3 = V4DF_ADD(dif_ab, mul_cd);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_L_4(int inv,
         const float *FFTS_RESTRICT i0,
@ -331,6 +411,36 @@ V4SF_L_4(int inv,
    *r3 = V4SF_ADD(t5, t7);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision counterpart of V4SF_LEAF_EE.
+ *
+ * Reads eight vectors from `in` at the offsets is[0..7], combines them with
+ * V4DF_L_4 / V4DF_L_2 / V4DF_K_0 / V4DF_K_N / V4DF_TX2 and stores four
+ * vectors each at out + os[0] and out + os[1] (element offsets 0, 4, 8, 12).
+ * `inv` selects the inverse constant table and is forwarded to the kernels.
+ */
+static FFTS_INLINE void
+V4DF_LEAF_EE(double *const FFTS_RESTRICT out,
+             const ptrdiff_t *FFTS_RESTRICT os,
+             const double    *FFTS_RESTRICT in,
+             const ptrdiff_t *FFTS_RESTRICT is,
+             int inv)
+{
+    const double *FFTS_RESTRICT table = ffts_constants_64f;
+    double *dst0 = out + os[0];
+    double *dst1 = out + os[1];
+    V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+
+    if (inv) {
+        table = ffts_constants_inv_64f;
+    }
+
+    V4DF_L_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+    V4DF_L_2(in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7);
+
+    V4DF_K_0(inv, &r0, &r2, &r4, &r6);
+    V4DF_K_N(inv, V4DF_LD(table + 0), V4DF_LD(table + 4), &r1, &r3, &r5, &r7);
+
+    V4DF_TX2(&r0, &r1);
+    V4DF_TX2(&r2, &r3);
+    V4DF_TX2(&r4, &r5);
+    V4DF_TX2(&r6, &r7);
+
+    V4DF_S_4(r0, r2, r4, r6, dst0 + 0, dst0 + 4, dst0 + 8, dst0 + 12);
+    V4DF_S_4(r1, r3, r5, r7, dst1 + 0, dst1 + 4, dst1 + 8, dst1 + 12);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_LEAF_EE(float *const FFTS_RESTRICT out,
             const ptrdiff_t *FFTS_RESTRICT os,
@ -359,6 +469,36 @@ V4SF_LEAF_EE(float *const FFTS_RESTRICT out,
    V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision counterpart of V4SF_LEAF_EE2.
+ *
+ * Same kernel sequence as V4DF_LEAF_EE, but the inputs are taken in a
+ * different order: V4DF_L_4 reads is[6], is[7], is[4], is[5] and V4DF_L_2
+ * reads is[0], is[1], is[3], is[2]. Results go to out + os[0] and
+ * out + os[1] (element offsets 0, 4, 8, 12).
+ */
+static FFTS_INLINE void
+V4DF_LEAF_EE2(double *const FFTS_RESTRICT out,
+              const ptrdiff_t *FFTS_RESTRICT os,
+              const double *FFTS_RESTRICT in,
+              const ptrdiff_t *FFTS_RESTRICT is,
+              int inv)
+{
+    const double *FFTS_RESTRICT table = ffts_constants_64f;
+    double *dst0 = out + os[0];
+    double *dst1 = out + os[1];
+    V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+
+    if (inv) {
+        table = ffts_constants_inv_64f;
+    }
+
+    V4DF_L_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r0, &r1, &r2, &r3);
+    V4DF_L_2(in + is[0], in + is[1], in + is[3], in + is[2], &r4, &r5, &r6, &r7);
+
+    V4DF_K_0(inv, &r0, &r2, &r4, &r6);
+    V4DF_K_N(inv, V4DF_LD(table + 0), V4DF_LD(table + 4), &r1, &r3, &r5, &r7);
+
+    V4DF_TX2(&r0, &r1);
+    V4DF_TX2(&r2, &r3);
+    V4DF_TX2(&r4, &r5);
+    V4DF_TX2(&r6, &r7);
+
+    V4DF_S_4(r0, r2, r4, r6, dst0 + 0, dst0 + 4, dst0 + 8, dst0 + 12);
+    V4DF_S_4(r1, r3, r5, r7, dst1 + 0, dst1 + 4, dst1 + 8, dst1 + 12);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_LEAF_EE2(float *const FFTS_RESTRICT out,
              const ptrdiff_t *FFTS_RESTRICT os,
@ -387,6 +527,30 @@ V4SF_LEAF_EE2(float *const FFTS_RESTRICT out,
    V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision counterpart of V4SF_LEAF_EO.
+ *
+ * Loads is[0..3] through V4DF_L_4_4 and is[4..7] through V4DF_L_2_4.
+ * r2, r3, r7, r6 are stored to out + os[1] as loaded; r0, r1, r4, r5 pass
+ * through V4DF_K_N (constants at table + 8 and table + 12) before being
+ * stored to out + os[0].
+ */
+static FFTS_INLINE void
+V4DF_LEAF_EO(double *const FFTS_RESTRICT out,
+             const ptrdiff_t *FFTS_RESTRICT os,
+             const double *FFTS_RESTRICT in,
+             const ptrdiff_t *FFTS_RESTRICT is,
+             int inv)
+{
+    const double *FFTS_RESTRICT table = ffts_constants_64f;
+    double *dst0 = out + os[0];
+    double *dst1 = out + os[1];
+    V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+
+    if (inv) {
+        table = ffts_constants_inv_64f;
+    }
+
+    V4DF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+    V4DF_L_2_4(inv, in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7);
+
+    V4DF_S_4(r2, r3, r7, r6, dst1 + 0, dst1 + 4, dst1 + 8, dst1 + 12);
+
+    V4DF_K_N(inv, V4DF_LD(table + 8), V4DF_LD(table + 12), &r0, &r1, &r4, &r5);
+    V4DF_S_4(r0, r1, r4, r5, dst0 + 0, dst0 + 4, dst0 + 8, dst0 + 12);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_LEAF_EO(float *const FFTS_RESTRICT out,
             const ptrdiff_t *FFTS_RESTRICT os,
@ -409,6 +573,30 @@ V4SF_LEAF_EO(float *const FFTS_RESTRICT out,
    V4SF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision counterpart of V4SF_LEAF_OE.
+ *
+ * Loads is[0..3] through V4DF_L_4_2 and is[6], is[7], is[4], is[5] through
+ * V4DF_L_4_4. r0, r1, r4, r5 are stored to out + os[0] as loaded;
+ * r6, r7, r2, r3 pass through V4DF_K_N (constants at table + 8 and
+ * table + 12) before being stored to out + os[1].
+ */
+static FFTS_INLINE void
+V4DF_LEAF_OE(double *const FFTS_RESTRICT out,
+             const ptrdiff_t *FFTS_RESTRICT os,
+             const double *FFTS_RESTRICT in,
+             const ptrdiff_t *FFTS_RESTRICT is,
+             int inv)
+{
+    const double *FFTS_RESTRICT table = ffts_constants_64f;
+    double *dst0 = out + os[0];
+    double *dst1 = out + os[1];
+    V4DF r0, r1, r2, r3, r4, r5, r6, r7;
+
+    if (inv) {
+        table = ffts_constants_inv_64f;
+    }
+
+    V4DF_L_4_2(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+    V4DF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7);
+
+    V4DF_S_4(r0, r1, r4, r5, dst0 + 0, dst0 + 4, dst0 + 8, dst0 + 12);
+
+    V4DF_K_N(inv, V4DF_LD(table + 8), V4DF_LD(table + 12), &r6, &r7, &r2, &r3);
+    V4DF_S_4(r6, r7, r2, r3, dst1 + 0, dst1 + 4, dst1 + 8, dst1 + 12);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_LEAF_OE(float *const FFTS_RESTRICT out,
             const ptrdiff_t *FFTS_RESTRICT os,
@ -431,6 +619,27 @@ V4SF_LEAF_OE(float *const FFTS_RESTRICT out,
    V4SF_S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision counterpart of V4SF_LEAF_OO.
+ *
+ * Two V4DF_L_4_4 loads (is[0..3], then is[6], is[7], is[4], is[5]) whose
+ * results are stored directly: r0, r1, r4, r5 to out + os[0] and
+ * r2, r3, r6, r7 to out + os[1] (element offsets 0, 4, 8, 12).
+ */
+static FFTS_INLINE void
+V4DF_LEAF_OO(double *const FFTS_RESTRICT out,
+             const ptrdiff_t *FFTS_RESTRICT os,
+             const double *FFTS_RESTRICT in,
+             const ptrdiff_t *FFTS_RESTRICT is,
+             int inv)
+{
+    double *dst0 = out + os[0];
+    double *dst1 = out + os[1];
+    V4DF r0, r1, r2, r3;
+    V4DF r4, r5, r6, r7;
+
+    V4DF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
+    V4DF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7);
+
+    V4DF_S_4(r0, r1, r4, r5, dst0 + 0, dst0 + 4, dst0 + 8, dst0 + 12);
+    V4DF_S_4(r2, r3, r6, r7, dst1 + 0, dst1 + 4, dst1 + 8, dst1 + 12);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_LEAF_OO(float *const FFTS_RESTRICT out,
             const ptrdiff_t *FFTS_RESTRICT os,
@ -450,6 +659,34 @@ V4SF_LEAF_OO(float *const FFTS_RESTRICT out,
    V4SF_S_4(r2, r3, r6, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision counterpart of V4SF_X_4.
+ *
+ * Runs N/8 iterations. Each one loads four vectors from `data` at element
+ * offsets 0, 2*N/4, 4*N/4 and 6*N/4, applies V4DF_K_N with the constants at
+ * LUT and LUT + 4, stores the results back in place, then advances `data`
+ * by 4 and `LUT` by 8 elements.
+ */
+static FFTS_INLINE void
+V4DF_X_4(int inv,
+         double *FFTS_RESTRICT data,
+         size_t N,
+         const double *FFTS_RESTRICT LUT)
+{
+    /* element offsets of the second, third and fourth stream */
+    const size_t off1 = 2*N/4;
+    const size_t off2 = 4*N/4;
+    const size_t off3 = 6*N/4;
+    size_t remaining;
+
+    for (remaining = N/8; remaining > 0; --remaining) {
+        V4DF r0 = V4DF_LD(data);
+        V4DF r1 = V4DF_LD(data + off1);
+        V4DF r2 = V4DF_LD(data + off2);
+        V4DF r3 = V4DF_LD(data + off3);
+
+        V4DF_K_N(inv, V4DF_LD(LUT), V4DF_LD(LUT + 4), &r0, &r1, &r2, &r3);
+
+        V4DF_ST(data       , r0);
+        V4DF_ST(data + off1, r1);
+        V4DF_ST(data + off2, r2);
+        V4DF_ST(data + off3, r3);
+
+        data += 4;
+        LUT += 8;
+    }
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_X_4(int inv,
         float *FFTS_RESTRICT data,
@ -536,6 +773,68 @@ V4SF_X_8(int inv,
    }
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision counterpart of V4SF_X_8.
+ *
+ * Walks eight streams that start at data0 + k*N/4 (k = 0..7) for N/16
+ * iterations. Each iteration applies V4DF_K_N three times -- to
+ * (r0, r1, r2, r3), (r0, r2, r4, r6) and (r1, r3, r5, r7) -- using six
+ * constant vectors from LUT, writes all eight vectors back in place, and
+ * advances every stream by 4 elements and LUT by 24.
+ */
+static FFTS_INLINE void
+V4DF_X_8(int inv,
+         double *FFTS_RESTRICT data0,
+         size_t N,
+         const double *FFTS_RESTRICT LUT)
+{
+    double *s1 = data0 + 1*N/4;
+    double *s2 = data0 + 2*N/4;
+    double *s3 = data0 + 3*N/4;
+    double *s4 = data0 + 4*N/4;
+    double *s5 = data0 + 5*N/4;
+    double *s6 = data0 + 6*N/4;
+    double *s7 = data0 + 7*N/4;
+    size_t remaining;
+
+    for (remaining = N/16; remaining > 0; --remaining) {
+        V4DF r0 = V4DF_LD(data0);
+        V4DF r1 = V4DF_LD(s1);
+        V4DF r2 = V4DF_LD(s2);
+        V4DF r3 = V4DF_LD(s3);
+        V4DF r4, r5, r6, r7;
+
+        V4DF_K_N(inv, V4DF_LD(LUT), V4DF_LD(LUT + 4), &r0, &r1, &r2, &r3);
+
+        r4 = V4DF_LD(s4);
+        r6 = V4DF_LD(s6);
+        V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r0, &r2, &r4, &r6);
+
+        r5 = V4DF_LD(s5);
+        r7 = V4DF_LD(s7);
+        V4DF_K_N(inv, V4DF_LD(LUT + 16), V4DF_LD(LUT + 20), &r1, &r3, &r5, &r7);
+
+        /* write every stream back before stepping the cursors */
+        V4DF_ST(data0, r0);
+        V4DF_ST(s1, r1);
+        V4DF_ST(s2, r2);
+        V4DF_ST(s3, r3);
+        V4DF_ST(s4, r4);
+        V4DF_ST(s5, r5);
+        V4DF_ST(s6, r6);
+        V4DF_ST(s7, r7);
+
+        data0 += 4;
+        s1 += 4;
+        s2 += 4;
+        s3 += 4;
+        s4 += 4;
+        s5 += 4;
+        s6 += 4;
+        s7 += 4;
+
+        LUT += 24;
+    }
+}
+#endif
+
 static FFTS_INLINE void
 ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out,
                              const float *FFTS_RESTRICT in,
@ -569,6 +868,41 @@ ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out,
    }
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision counterpart of ffts_static_firstpass_odd_32f.
+ *
+ * Applies the leaf kernels in sequence: p->i0 times V4DF_LEAF_EE, p->i1
+ * times V4DF_LEAF_OO, one V4DF_LEAF_OE, then p->i1 times V4DF_LEAF_EE2.
+ * After every leaf the input cursor moves on by 4 doubles and the output
+ * offset cursor by 2 entries; the input offset table p->is is shared by
+ * all leaves.
+ */
+static FFTS_INLINE void
+ffts_static_firstpass_odd_64f(double *const FFTS_RESTRICT out,
+                              const double *FFTS_RESTRICT in,
+                              const ffts_plan_t *FFTS_RESTRICT p,
+                              int inv)
+{
+    const size_t count_ee = p->i0;
+    const size_t count_oo = p->i1;
+    const ptrdiff_t *is = (const ptrdiff_t*) p->is;
+    const ptrdiff_t *os = (const ptrdiff_t*) p->offsets;
+    size_t n;
+
+    for (n = 0; n < count_ee; n++) {
+        V4DF_LEAF_EE(out, os, in, is, inv);
+        in += 4;
+        os += 2;
+    }
+
+    for (n = 0; n < count_oo; n++) {
+        V4DF_LEAF_OO(out, os, in, is, inv);
+        in += 4;
+        os += 2;
+    }
+
+    /* single odd/even leaf between the OO and EE2 runs */
+    V4DF_LEAF_OE(out, os, in, is, inv);
+    in += 4;
+    os += 2;
+
+    for (n = 0; n < count_oo; n++) {
+        V4DF_LEAF_EE2(out, os, in, is, inv);
+        in += 4;
+        os += 2;
+    }
+}
+#endif
+
 void
 ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out)
 {
@ -789,23 +1123,23 @@ ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out)
    V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Small double-precision transform kernel (inv = 0); only built when
+ * FFTS_DOUBLE is defined.
+ *
+ * Loads vectors from `in` at element offsets 0, 8, 4 and 12 through
+ * V4DF_L_4_2, applies V4DF_K_N with the first two vectors of
+ * ffts_constants_small_64f, and stores the result to `out` at element
+ * offsets 0, 4, 8 and 12. The plan argument is not used.
+ * NOTE(review): presumably the 8-point forward case, judging by the name --
+ * confirm against the plan setup code.
+ */
 void
 ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out)
 {
+    const double *FFTS_RESTRICT lut = ffts_constants_small_64f;
    const double *din = (const double*) in;
    double *dout = (double*) out;
-//  V4SF r0_1, r2_3, r4_5, r6_7;
-//  double *LUT8 = (double*) p->ws + p->ws_is[0];
+    V4DF r0_1, r2_3, r4_5, r6_7;
+    
+    /* unreferenced parameter */
    (void) p;
-    (void) din;
-    (void) dout;

-#if MACROS_READY
-    L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
-    K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
-#endif
+    V4DF_L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
+    V4DF_K_N(0, V4DF_LD(lut), V4DF_LD(lut + 4), &r0_1, &r2_3, &r4_5, &r6_7);
+    V4DF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
 }
+#endif

 void
 ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out)
@ -823,24 +1157,23 @@ ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out)
    V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Small double-precision transform kernel (inv = 1); only built when
+ * FFTS_DOUBLE is defined.
+ *
+ * Loads vectors from `in` at element offsets 0, 8, 4 and 12 through
+ * V4DF_L_4_2, applies V4DF_K_N with the first two vectors of
+ * ffts_constants_small_inv_64f, and stores the result to `out` at element
+ * offsets 0, 4, 8 and 12. The plan argument is not used.
+ * NOTE(review): presumably the 8-point inverse case, judging by the name --
+ * confirm against the plan setup code.
+ */
 void
 ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out)
 {
+    const double *FFTS_RESTRICT lut = ffts_constants_small_inv_64f;
    const double *din = (const double*) in;
    double *dout = (double*) out;
-//  V4SF r0_1, r2_3, r4_5, r6_7;
-//  double *LUT8 = (double*) p->ws + p->ws_is[0];
-    (void) p;
-    (void) din;
-    (void) dout;
+    V4DF r0_1, r2_3, r4_5, r6_7;

+    /* unreferenced parameter */
+    (void) p;

-#if MACROS_READY
-    L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
-    K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
-#endif
+    V4DF_L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
+    V4DF_K_N(1, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7);
+    V4DF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
 }
+#endif

 void
 ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out)
@ -862,27 +1195,27 @@ ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out)
    V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Small double-precision transform kernel (inv = 0); only built when
+ * FFTS_DOUBLE is defined.
+ *
+ * Loads eight vectors from `in` (V4DF_L_4_4 on offsets 0, 16, 8, 24 and
+ * V4DF_L_2_4 on offsets 4, 20, 28, 12), applies V4DF_K_N three times with
+ * six constant vectors from ffts_constants_small_64f, and stores the results
+ * to `out` at element offsets 0, 8, 16, 24 and 4, 12, 20, 28. The plan
+ * argument is not used.
+ * NOTE(review): presumably the 16-point forward case, judging by the name --
+ * confirm against the plan setup code.
+ */
 void
 ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out)
 {
+    const double *FFTS_RESTRICT lut = ffts_constants_small_64f;
    const double *din = (const double*) in;
    double *dout = (double*) out;
-//  double *LUT8 = (double*) p->ws;
-//  V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+    V4DF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+
+    /* unreferenced parameter */
    (void) p;
-    (void) din;
-    (void) dout;
-
-#ifdef MACROS_READY
-    L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
-    L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
-    K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    K_N(0, VLD(LUT8+8), VLD(LUT8+12), &r0_1, &r4_5, &r8_9, &r12_13);
-    S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
-    K_N(0, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15);
-    S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
-#endif
+
+    V4DF_L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
+    V4DF_L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
+    V4DF_K_N(0, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7);
+    V4DF_K_N(0, V4DF_LD(lut+8), V4DF_LD(lut+12), &r0_1, &r4_5, &r8_9, &r12_13);
+    V4DF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
+    V4DF_K_N(0, V4DF_LD(lut+16), V4DF_LD(lut+20), &r2_3, &r6_7, &r10_11, &r14_15);
+    V4DF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
 }
+#endif

 void
 ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out)
@ -904,27 +1237,27 @@ ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out)
    V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * 16-point backward transform on double-precision data (every kernel is
+ * called with a leading 1). Reads 32 doubles from in (din+0 .. din+31) and
+ * writes 32 doubles to out, taking its constants from
+ * ffts_constants_small_inv_64f. The plan argument is not used.
+ */
 void
 ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out)
 {
+    const double *FFTS_RESTRICT lut = ffts_constants_small_inv_64f;
    const double *din = (const double*) in;
    double *dout = (double*) out;
-//  double *LUT8 = (double*) p->ws;
-//  V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+    V4DF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
+    
+    /* unreferenced parameter */
    (void) p;
-    (void) din;
-    (void) dout;
-
-#ifdef MACROS_READY
-    L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
-    L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
-    K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    K_N(1, VLD(LUT8+8), VLD(LUT8+12),&r0_1, &r4_5, &r8_9, &r12_13);
-    S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
-    K_N(1, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15);
-    S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
-#endif
+
+    /* load stage: note the last two outputs of L_2_4 are r14_15, r12_13 */
+    V4DF_L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
+    V4DF_L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
+    V4DF_K_N(1, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7);
+    V4DF_K_N(1, V4DF_LD(lut+8), V4DF_LD(lut+12), &r0_1, &r4_5, &r8_9, &r12_13);
+    V4DF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
+    V4DF_K_N(1, V4DF_LD(lut+16), V4DF_LD(lut+20), &r2_3, &r6_7, &r10_11, &r14_15);
+    V4DF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
 }
+#endif

 static FFTS_INLINE void
 ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out,
@ -959,6 +1292,41 @@ ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out,
    }
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision first pass used when log2 of the transform size is even.
+ * Runs the leaf kernels in a fixed sequence -- p->i0 calls of V4DF_LEAF_EE,
+ * one V4DF_LEAF_EO, then p->i1 calls of V4DF_LEAF_OO followed by p->i1 calls
+ * of V4DF_LEAF_EE2 -- advancing the input by 4 doubles and the offset table
+ * by 2 entries after every call.
+ */
+static FFTS_INLINE void
+ffts_static_firstpass_even_64f(double *FFTS_RESTRICT out,
+                               const double *FFTS_RESTRICT in,
+                               const ffts_plan_t *FFTS_RESTRICT p,
+                               int inv)
+{
+    const size_t ee_calls = p->i0;
+    const size_t oo_calls = p->i1;
+    const ptrdiff_t *is = (const ptrdiff_t*) p->is;
+    const ptrdiff_t *os = (const ptrdiff_t*) p->offsets;
+    size_t left;
+
+    left = ee_calls;
+    while (left != 0) {
+        V4DF_LEAF_EE(out, os, in, is, inv);
+        in += 4;
+        os += 2;
+        left--;
+    }
+
+    /* exactly one even/odd leaf sits between the EE and OO runs */
+    V4DF_LEAF_EO(out, os, in, is, inv);
+    in += 4;
+    os += 2;
+
+    left = oo_calls;
+    while (left != 0) {
+        V4DF_LEAF_OO(out, os, in, is, inv);
+        in += 4;
+        os += 2;
+        left--;
+    }
+
+    /* the trailing EE2 run has the same length as the OO run */
+    left = oo_calls;
+    while (left != 0) {
+        V4DF_LEAF_EE2(out, os, in, is, inv);
+        in += 4;
+        os += 2;
+        left--;
+    }
+}
+#endif
+
 static void
 ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
 {
@ -1035,6 +1403,47 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
 #endif
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Recursive combine step of the double-precision static transform; every
+ * kernel is called with a leading 0. Sizes 64 and 128 are unrolled, larger
+ * sizes recurse into five sub-blocks before a final V4DF_X_8 pass, and the
+ * only remaining size is expected to be 32 (asserted).
+ */
+static void
+ffts_static_rec_f_64f(const ffts_plan_t *p, double *data, size_t N)
+{
+    const double *ws = (const double*) p->ws;
+
+    if (N == 64) {
+        V4DF_X_4(0, data, 16, ws);
+        V4DF_X_4(0, data + 64, 16, ws);
+        V4DF_X_4(0, data + 96, 16, ws);
+
+        V4DF_X_8(0, data, 64, ws + (p->ws_is[2] << 1));
+        return;
+    }
+
+    if (N == 128) {
+        const double *ws1 = ws + (p->ws_is[1] << 1);
+
+        V4DF_X_8(0, data, 32, ws1);
+        V4DF_X_4(0, data + 64, 16, ws);
+        V4DF_X_4(0, data + 96, 16, ws);
+        V4DF_X_8(0, data + 128, 32, ws1);
+        V4DF_X_8(0, data + 192, 32, ws1);
+
+        V4DF_X_8(0, data, 128, ws + (p->ws_is[3] << 1));
+        return;
+    }
+
+    if (N > 128) {
+        const size_t half = N >> 1;
+        const size_t quarter = N >> 2;
+        const size_t eighth = N >> 3;
+
+        ffts_static_rec_f_64f(p, data, quarter);
+        ffts_static_rec_f_64f(p, data + half, eighth);
+        ffts_static_rec_f_64f(p, data + half + quarter, eighth);
+        ffts_static_rec_f_64f(p, data + N, quarter);
+        ffts_static_rec_f_64f(p, data + N + half, quarter);
+
+        V4DF_X_8(0, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1));
+        return;
+    }
+
+    assert(N == 32);
+    V4DF_X_8(0, data, 32, ws + (p->ws_is[1] << 1));
+}
+#endif
+
 static void
 ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
 {
@ -1111,6 +1520,47 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
 #endif
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Recursive combine step of the double-precision static transform; every
+ * kernel is called with a leading 1. Sizes 64 and 128 are unrolled, larger
+ * sizes recurse into five sub-blocks before a final V4DF_X_8 pass, and the
+ * only remaining size is expected to be 32 (asserted).
+ */
+static void
+ffts_static_rec_i_64f(const ffts_plan_t *p, double *data, size_t N)
+{
+    const double *ws = (const double*) p->ws;
+
+    if (N == 64) {
+        V4DF_X_4(1, data, 16, ws);
+        V4DF_X_4(1, data + 64, 16, ws);
+        V4DF_X_4(1, data + 96, 16, ws);
+
+        V4DF_X_8(1, data, 64, ws + (p->ws_is[2] << 1));
+        return;
+    }
+
+    if (N == 128) {
+        const double *ws1 = ws + (p->ws_is[1] << 1);
+
+        V4DF_X_8(1, data, 32, ws1);
+        V4DF_X_4(1, data + 64, 16, ws);
+        V4DF_X_4(1, data + 96, 16, ws);
+        V4DF_X_8(1, data + 128, 32, ws1);
+        V4DF_X_8(1, data + 192, 32, ws1);
+
+        V4DF_X_8(1, data, 128, ws + (p->ws_is[3] << 1));
+        return;
+    }
+
+    if (N > 128) {
+        const size_t half = N >> 1;
+        const size_t quarter = N >> 2;
+        const size_t eighth = N >> 3;
+
+        ffts_static_rec_i_64f(p, data, quarter);
+        ffts_static_rec_i_64f(p, data + half, eighth);
+        ffts_static_rec_i_64f(p, data + half + quarter, eighth);
+        ffts_static_rec_i_64f(p, data + N, quarter);
+        ffts_static_rec_i_64f(p, data + N + half, quarter);
+
+        V4DF_X_8(1, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1));
+        return;
+    }
+
+    assert(N == 32);
+    V4DF_X_8(1, data, 32, ws + (p->ws_is[1] << 1));
+}
+#endif
+
 void
 ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
 {
@ -1172,6 +1622,26 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
 #endif
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision static transform with direction flag 0: runs the odd or
+ * even first pass from in to out, chosen by the parity of ffts_ctzl(p->N),
+ * then finishes with ffts_static_rec_f_64f on the output buffer.
+ */
+void
+ffts_static_transform_f_64f(ffts_plan_t *p, const void *in, void *out)
+{
+    const size_t N = p->N;
+    const int N_log_2 = ffts_ctzl(N);
+    const double *src = (const double*) in;
+    double *dst = (double*) out;
+
+    if ((N_log_2 & 1) == 0) {
+        ffts_static_firstpass_even_64f(dst, src, p, 0);
+    } else {
+        ffts_static_firstpass_odd_64f(dst, src, p, 0);
+    }
+
+    ffts_static_rec_f_64f(p, dst, N);
+}
+#endif
+
 void
 ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
 {
@ -1232,3 +1702,23 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
    ffts_static_rec_i_32f(p, dout, N);
 #endif
 }
+
+#ifdef FFTS_DOUBLE
+/*
+ * Double-precision static transform with direction flag 1: runs the odd or
+ * even first pass from in to out, chosen by the parity of ffts_ctzl(p->N),
+ * then finishes with ffts_static_rec_i_64f on the output buffer.
+ */
+void
+ffts_static_transform_i_64f(ffts_plan_t *p, const void *in, void *out)
+{
+    const size_t N = p->N;
+    const int N_log_2 = ffts_ctzl(N);
+    const double *src = (const double*) in;
+    double *dst = (double*) out;
+
+    if ((N_log_2 & 1) == 0) {
+        ffts_static_firstpass_even_64f(dst, src, p, 1);
+    } else {
+        ffts_static_firstpass_odd_64f(dst, src, p, 1);
+    }
+
+    ffts_static_rec_i_64f(p, dst, N);
+}
+#endif
--- a/lib/ffts/src/ffts_static.h
+++ b/lib/ffts/src/ffts_static.h
@ -43,49 +43,73 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 void
 ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out);

+#ifdef FFTS_DOUBLE
 void
 ffts_small_2_64f(ffts_plan_t *p, const void *in, void *out);
+#endif

 void
 ffts_small_forward4_32f(ffts_plan_t *p, const void *in, void *out);

+#ifdef FFTS_DOUBLE
 void
 ffts_small_forward4_64f(ffts_plan_t *p, const void *in, void *out);
+#endif

 void
 ffts_small_backward4_32f(ffts_plan_t *p, const void *in, void *out);

+#ifdef FFTS_DOUBLE
 void
 ffts_small_backward4_64f(ffts_plan_t *p, const void *in, void *out);
+#endif

 void
 ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out);

+#ifdef FFTS_DOUBLE
 void
 ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out);
+#endif

 void
 ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out);

+#ifdef FFTS_DOUBLE
 void
 ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out);
+#endif

 void
 ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out);

+#ifdef FFTS_DOUBLE
 void
 ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out);
+#endif

 void
 ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out);

+#ifdef FFTS_DOUBLE
 void
 ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out);
+#endif

 void
 ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out);

+#ifdef FFTS_DOUBLE
+void
+ffts_static_transform_f_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
+
 void
 ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out);

+#ifdef FFTS_DOUBLE
+void
+ffts_static_transform_i_64f(ffts_plan_t *p, const void *in, void *out);
+#endif
+
 #endif /* FFTS_STATIC_H */
--- a/lib/ffts/src/ffts_trig.c
+++ b/lib/ffts/src/ffts_trig.c
--- a/lib/ffts/src/ffts_trig.h
+++ b/lib/ffts/src/ffts_trig.h
@ -2,7 +2,7 @@

 This file is part of FFTS -- The Fastest Fourier Transform in the South

-Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2015-2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>

 All rights reserved.

@ -39,8 +39,16 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "ffts_internal.h"

+/* calculate cos(pi * n / d) and sin(pi * n / d) with maximum error less than 1 ULP, average ~0.5 ULP */
 int
-ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, int table_size);
+ffts_cexp_32f(size_t n, size_t d, float *output);
+
+int
+ffts_generate_chirp_32f(ffts_cpx_32f *const table, size_t table_size);
+
+/* generate cosine and sine tables with maximum error less than 1 ULP, average ~0.5 ULP */
+int
+ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, size_t table_size);

 int
 ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size);
--- a/lib/ffts/src/macros-alpha.h
+++ b/lib/ffts/src/macros-alpha.h
@ -58,9 +58,6 @@ typedef union {
    uint32_t u[4];
 } V4SF;

-#define FFTS_MALLOC(d,a) (malloc(d))
-#define FFTS_FREE(d) (free(d))
-
 static FFTS_ALWAYS_INLINE V4SF
 V4SF_LIT4(float f3, float f2, float f1, float f0)
 {
--- a/lib/ffts/src/macros-altivec.h
+++ b/lib/ffts/src/macros-altivec.h
@ -4,6 +4,7 @@
  
 Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> 
 Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2019, Timothy Pearson <tpearson@raptorengineering.com>
 
 All rights reserved.

@ -39,99 +40,89 @@

 #define restrict

-typedef vector float V;
+typedef vector float V4SF;
 typedef vector unsigned char VUC;

-#ifdef __apple__
-#define FFTS_MALLOC(d,a) vec_malloc(d)
-#define FFTS_FREE(d) vec_free(d)
-#else
-/* It appears vec_malloc() and friends are not implemented on Linux */
-#include <malloc.h>
-#define FFTS_MALLOC(d,a) memalign(16,d)
-#define FFTS_FREE(d) free(d)
-#endif
-
-#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3})
+#define V4SF_LIT4(f0,f1,f2,f3) ((V4SF){f0, f1, f2, f3})

-#define VADD(x,y) vec_add(x,y)
-#define VSUB(x,y) vec_sub(x,y)
-#define VMUL(x,y) vec_madd(x,y,(V){0})
-#define VMULADD(x,y,z) vec_madd(x,y,z)
-#define VNMULSUB(x,y,z) vec_nmsub(x,y,z)
-#define VXOR(x,y) vec_xor((x),(y))
-#define VSWAPPAIRS(x)						\
+#define V4SF_ADD(x,y) vec_add(x,y)
+#define V4SF_SUB(x,y) vec_sub(x,y)
+#define V4SF_MUL(x,y) vec_madd(x,y,(V4SF){0})
+#define V4SF_MULADD(x,y,z) vec_madd(x,y,z)
+#define V4SF_NMULSUB(x,y,z) vec_nmsub(x,y,z)
+#define V4SF_XOR(x,y) vec_xor((x),(y))
+#define V4SF_SWAPPAIRS(x)					\
    vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03,	\
 		       0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b})

-#define VBLEND(x,y)						\
+#define V4SF_BLEND(x,y)						\
    vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,	\
 		       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})

-#define VUNPACKHI(x,y)						\
+#define V4SF_UNPACK_HI(x,y)					\
    vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,	\
 		       0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})

-#define VUNPACKLO(x,y)						\
+#define V4SF_UNPACK_LO(x,y)					\
    vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07,	\
 		       0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17})

-#define VDUPRE(x)						\
+#define V4SF_DUPLICATE_RE(x)					\
    vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03,	\
 		       0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b})

-#define VDUPIM(x)						\
+#define V4SF_DUPLICATE_IM(x)					\
    vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07,	\
 		       0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f})


+/* Element-wise re * d - im * swap_pairs(d). */
-static inline V IMUL(V d, V re, V im)
+static inline V4SF V4SF_IMUL(V4SF d, V4SF re, V4SF im)
 {
-    im = VMUL(im, VSWAPPAIRS(d));
-    re = VMUL(re, d);
-    return VSUB(re, im);  
+    im = V4SF_MUL(im, V4SF_SWAPPAIRS(d));
+    re = V4SF_MUL(re, d);
+    return V4SF_SUB(re, im);  
 }


+/*
+ * Element-wise re * d + im * swap_pairs(d); the final multiply and add are
+ * issued as a single vec_madd through V4SF_MULADD.
+ */
-static inline V IMULJ(V d, V re, V im)
+static inline V4SF V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
 {
-    im = VMUL(im, VSWAPPAIRS(d));
-    return VMULADD(re, d, im);
+    im = V4SF_MUL(im, V4SF_SWAPPAIRS(d));
+    return V4SF_MULADD(re, d, im);
 }

+/*
+ * Flips sign bits of x by XOR with a -0.0f mask: elements 0 and 2 when inv
+ * is non-zero, elements 1 and 3 otherwise. The two variants below build the
+ * same mask; the second only splits the expression into a temporary.
+ */
 #ifndef __GNUC__
 /* gcc (4.6 and 4.7) ICEs on this code! */
-static inline V MULI(int inv, V x)
+static inline V4SF MULI(int inv, V4SF x)
 {
-    return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f));
+    return V4SF_XOR(x, inv ? V4SF_LIT4(-0.0f,0.0f,-0.0f,0.0f) : V4SF_LIT4(0.0f,-0.0f,0.0f,-0.0f));
 }
 #else
 /* but compiles this fine... */
-static inline V MULI(int inv, V x)
+static inline V4SF MULI(int inv, V4SF x)
 {
-    V t;
-    t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f);
-    return VXOR(x, t);
+    V4SF t;
+    t = inv ? V4SF_LIT4(-0.0f,0.0f,-0.0f,0.0f) : V4SF_LIT4(0.0f,-0.0f,0.0f,-0.0f);
+    return V4SF_XOR(x, t);
 }
 #endif


+/* Applies MULI(inv, x) and then swaps the two floats of each pair. */
-static inline V IMULI(int inv, V x)
+static inline V4SF V4SF_IMULI(int inv, V4SF x)
 {
-    return VSWAPPAIRS(MULI(inv, x));
+    return V4SF_SWAPPAIRS(MULI(inv, x));
 }


+/*
+ * Loads one vector by dereferencing s as a V4SF pointer.
+ * NOTE(review): presumably s must be aligned for a vector access -- confirm
+ * against callers.
+ */
-static inline V VLD(const void *s)
+static inline V4SF V4SF_LD(const void *s)
 {
-    V *d = (V *)s;
+    V4SF *d = (V4SF *)s;
    return *d;
 }


+/*
+ * Stores one vector by assigning through d as a V4SF pointer.
+ * NOTE(review): presumably d must be aligned for a vector access -- confirm
+ * against callers.
+ */
-static inline void VST(void *d, V s)
+static inline void V4SF_ST(void *d, V4SF s)
 {
-    V *r = (V *)d;
+    V4SF *r = (V4SF *)d;
    *r = s;
 }
 #endif
--- a/lib/ffts/src/macros-neon.h
+++ b/lib/ffts/src/macros-neon.h
@ -39,9 +39,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <stdlib.h>
 #endif

-#define FFTS_MALLOC(d,a) (valloc(d))
-#define FFTS_FREE(d) (free(d))
-
 typedef float32x4_t   V4SF;
 typedef float32x4x2_t V4SF2;

--- a/lib/ffts/src/macros-sse.h
+++ b/lib/ffts/src/macros-sse.h
@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South

 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
 Copyright (c) 2012, The University of Waikato
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>

 All rights reserved.

@ -40,9 +41,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include <xmmintrin.h>

-#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
-#define FFTS_FREE(d) (_mm_free(d))
-
 typedef __m128 V4SF;

 #define V4SF_ADD  _mm_add_ps
@ -56,8 +54,9 @@ typedef __m128 V4SF;
 #define V4SF_SWAP_PAIRS(x) \
    (_mm_shuffle_ps(x, x, _MM_SHUFFLE(2,3,0,1)))

+/* note: order is swapped */
 #define V4SF_UNPACK_HI(x,y) \
-    (_mm_shuffle_ps(x, y, _MM_SHUFFLE(3,2,3,2)))
+    (_mm_movehl_ps(y, x))

 #define V4SF_UNPACK_LO(x,y) \
    (_mm_movelh_ps(x, y))
@ -97,4 +96,220 @@ V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
    return V4SF_ADD(re, im);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Scalar stand-in for a vector of four doubles: two (r, i) pairs held as
+ * plain doubles (presumably the real and imaginary parts of two complex
+ * values). The u[] member views the same storage as eight 32-bit words and
+ * is what V4DF_XOR operates on.
+ */
+typedef union {
+    struct {
+        double r1;
+        double i1;
+        double r2;
+        double i2;
+    } r;
+    uint32_t u[8];
+} V4DF;
+
+/*
+ * Builds a V4DF from four literals. Arguments are given highest field
+ * first: f0 lands in r1, f1 in i1, f2 in r2 and f3 in i2.
+ */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_LIT4(double f3, double f2, double f1, double f0)
+{
+    V4DF lit;
+
+    lit.r.i2 = f3;
+    lit.r.r2 = f2;
+    lit.r.i1 = f1;
+    lit.r.r1 = f0;
+
+    return lit;
+}
+
+/* Field-wise sum x + y of the four doubles. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_ADD(V4DF x, V4DF y)
+{
+    V4DF sum = x;
+
+    sum.r.r1 += y.r.r1;
+    sum.r.i1 += y.r.i1;
+    sum.r.r2 += y.r.r2;
+    sum.r.i2 += y.r.i2;
+
+    return sum;
+}
+
+/* Field-wise difference x - y of the four doubles. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_SUB(V4DF x, V4DF y)
+{
+    V4DF diff = x;
+
+    diff.r.r1 -= y.r.r1;
+    diff.r.i1 -= y.r.i1;
+    diff.r.r2 -= y.r.r2;
+    diff.r.i2 -= y.r.i2;
+
+    return diff;
+}
+
+/* Field-wise product x * y of the four doubles (not a complex multiply). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_MUL(V4DF x, V4DF y)
+{
+    V4DF prod = x;
+
+    prod.r.r1 *= y.r.r1;
+    prod.r.i1 *= y.r.i1;
+    prod.r.r2 *= y.r.r2;
+    prod.r.i2 *= y.r.i2;
+
+    return prod;
+}
+
+/* Bitwise XOR of x and y, taken over the eight 32-bit words of the u view. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_XOR(V4DF x, V4DF y)
+{
+    V4DF z;
+    int word;
+
+    for (word = 0; word < 8; ++word) {
+        z.u[word] = x.u[word] ^ y.u[word];
+    }
+
+    return z;
+}
+
+/* Exchanges the r and i field inside each of the two pairs. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_SWAP_PAIRS(V4DF x)
+{
+    V4DF swapped;
+
+    swapped.r.i1 = x.r.r1;
+    swapped.r.r1 = x.r.i1;
+    swapped.r.i2 = x.r.r2;
+    swapped.r.r2 = x.r.i2;
+
+    return swapped;
+}
+
+/* Takes the first pair (r1, i1) from x and the second pair (r2, i2) from y. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_BLEND(V4DF x, V4DF y)
+{
+    V4DF mixed = y;
+
+    mixed.r.r1 = x.r.r1;
+    mixed.r.i1 = x.r.i1;
+
+    return mixed;
+}
+
+/* Result is (second pair of x, second pair of y). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_UNPACK_HI(V4DF x, V4DF y)
+{
+    V4DF hi = y;
+
+    hi.r.r1 = x.r.r2;
+    hi.r.i1 = x.r.i2;
+
+    return hi;
+}
+
+/* Result is (first pair of x, first pair of y). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_UNPACK_LO(V4DF x, V4DF y)
+{
+    V4DF lo = x;
+
+    lo.r.r2 = y.r.r1;
+    lo.r.i2 = y.r.i1;
+
+    return lo;
+}
+
+/* Copies each pair's r field over its i field: (r1, r1, r2, r2). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_DUPLICATE_RE(V4DF x)
+{
+    V4DF dup = x;
+
+    dup.r.i1 = x.r.r1;
+    dup.r.i2 = x.r.r2;
+
+    return dup;
+}
+
+/* Copies each pair's i field over its r field: (i1, i1, i2, i2). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_DUPLICATE_IM(V4DF x)
+{
+    V4DF dup = x;
+
+    dup.r.r1 = x.r.i1;
+    dup.r.r2 = x.r.i2;
+
+    return dup;
+}
+
+/* Field-wise re * d - im * swap_pairs(d). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMUL(V4DF d, V4DF re, V4DF im)
+{
+    const V4DF re_part = V4DF_MUL(re, d);
+    const V4DF im_part = V4DF_MUL(im, V4DF_SWAP_PAIRS(d));
+
+    return V4DF_SUB(re_part, im_part);
+}
+
+/* Field-wise re * d + im * swap_pairs(d). */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMULJ(V4DF d, V4DF re, V4DF im)
+{
+    const V4DF re_part = V4DF_MUL(re, d);
+    const V4DF im_part = V4DF_MUL(im, V4DF_SWAP_PAIRS(d));
+
+    return V4DF_ADD(re_part, im_part);
+}
+
+/*
+ * Negates half of the fields: the r fields when inv is non-zero, the i
+ * fields otherwise. The other two fields pass through unchanged.
+ */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_MULI(int inv, V4DF x)
+{
+    V4DF z = x;
+
+    if (inv) {
+        z.r.r1 = -x.r.r1;
+        z.r.r2 = -x.r.r2;
+    } else {
+        z.r.i1 = -x.r.i1;
+        z.r.i2 = -x.r.i2;
+    }
+
+    return z;
+}
+
+/* Applies V4DF_MULI(inv, x) and then swaps r and i within each pair. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_IMULI(int inv, V4DF x)
+{
+    const V4DF negated = V4DF_MULI(inv, x);
+
+    return V4DF_SWAP_PAIRS(negated);
+}
+
+/* Loads sizeof(V4DF) bytes from s with memcpy and returns them as a V4DF. */
+static FFTS_ALWAYS_INLINE V4DF
+V4DF_LD(const void *s)
+{
+    V4DF loaded;
+
+    memcpy(&loaded, s, sizeof(V4DF));
+
+    return loaded;
+}
+
+/*
+ * Stores the contents of s to d. Uses memcpy, mirroring V4DF_LD, so the
+ * destination is written as raw bytes: it needs no particular alignment and
+ * is never accessed through a V4DF lvalue.
+ */
+static FFTS_ALWAYS_INLINE void
+V4DF_ST(void *d, V4DF s)
+{
+    memcpy(d, &s, sizeof(s));
+}
+#endif
+
 #endif /* FFTS_MACROS_SSE_H */
--- a/lib/ffts/src/macros.h
+++ b/lib/ffts/src/macros.h
@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South

 Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
 Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>

 All rights reserved.

@ -41,14 +42,29 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef HAVE_NEON
 #include "macros-neon.h"
 #elif HAVE_SSE
+#ifdef HAVE_AVX
+#include "macros-avx.h"
+#else
 #include "macros-sse.h"
+#endif
-// NOTE: AltiVec support disabled until updated to provide new V4SF variable type
+// NOTE: AltiVec support re-enabled now that macros-altivec.h provides the V4SF variable type
-//#elif __powerpc__
-//#include "macros-altivec.h"
+#elif __powerpc__
+#include "macros-altivec.h"
 #else
 #include "macros-alpha.h"
 #endif

+#ifdef FFTS_DOUBLE
+/*
+ * In-place 2x2 transpose of pairs: afterwards *a holds the first pairs of
+ * the original *a and *b, and *b holds their second pairs.
+ */
+static FFTS_INLINE void
+V4DF_TX2(V4DF *a, V4DF *b)
+{
+    const V4DF first = *a;
+    const V4DF second = *b;
+
+    *a = V4DF_UNPACK_LO(first, second);
+    *b = V4DF_UNPACK_HI(first, second);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_TX2(V4SF *a, V4SF *b)
 {
@ -58,6 +74,34 @@ V4SF_TX2(V4SF *a, V4SF *b)
    *b = t1;
 }

+#ifdef FFTS_DOUBLE
+/*
+ * In-place butterfly over four vectors. *r2 and *r3 are combined with the
+ * (re, im) pair through V4DF_IMUL and V4DF_IMULJ. The sum of the two
+ * products is added to / subtracted from the old *r0 (giving *r0 and *r2);
+ * their difference, passed through V4DF_IMULI(inv, ...), is subtracted from
+ * / added to the old *r1 (giving *r1 and *r3).
+ */
+static FFTS_INLINE void
+V4DF_K_N(int inv,
+         V4DF re,
+         V4DF im,
+         V4DF *r0,
+         V4DF *r1,
+         V4DF *r2,
+         V4DF *r3)
+{
+    V4DF uk, uk2, zk_p, zk_n, zk, zk_d;
+
+    /* snapshot the inputs that are overwritten below */
+    uk  = *r0;
+    uk2 = *r1;
+
+    zk_p = V4DF_IMUL(*r2, re, im);
+    zk_n = V4DF_IMULJ(*r3, re, im);
+
+    zk   = V4DF_ADD(zk_p, zk_n);
+    zk_d = V4DF_IMULI(inv, V4DF_SUB(zk_p, zk_n));
+
+    *r2 = V4DF_SUB(uk, zk);
+    *r0 = V4DF_ADD(uk, zk);
+    *r3 = V4DF_ADD(uk2, zk_d);
+    *r1 = V4DF_SUB(uk2, zk_d);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_K_N(int inv,
         V4SF re,
@ -84,6 +128,45 @@ V4SF_K_N(int inv,
    *r1 = V4SF_SUB(uk2, zk_d);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Loads four vectors from i0..i3 and forms i0 +/- i1 and i2 +/- i3. The
+ * first pairs of those sums and differences are written straight to *r0 and
+ * *r1. The second pairs go through one more add/subtract level (with
+ * V4DF_IMULI applied to i0 - i1) and are written to *r3 and *r2, in that
+ * order.
+ */
+static FFTS_INLINE void
+V4DF_L_2_4(int inv,
+           const double *FFTS_RESTRICT i0,
+           const double *FFTS_RESTRICT i1,
+           const double *FFTS_RESTRICT i2,
+           const double *FFTS_RESTRICT i3,
+           V4DF *r0,
+           V4DF *r1,
+           V4DF *r2,
+           V4DF *r3)
+{
+    V4DF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4DF_LD(i0);
+    t1 = V4DF_LD(i1);
+    t2 = V4DF_LD(i2);
+    t3 = V4DF_LD(i3);
+
+    t4 = V4DF_ADD(t0, t1);
+    t5 = V4DF_SUB(t0, t1);
+    t6 = V4DF_ADD(t2, t3);
+    t7 = V4DF_SUB(t2, t3);
+
+    /* must happen before t5 is overwritten by the V4DF_IMULI below */
+    *r0 = V4DF_UNPACK_LO(t4, t5);
+    *r1 = V4DF_UNPACK_LO(t6, t7);
+
+    t5 = V4DF_IMULI(inv, t5);
+
+    /* t0..t3 are reused here; the loaded inputs are no longer needed */
+    t0 = V4DF_ADD(t6, t4);
+    t2 = V4DF_SUB(t6, t4);
+    t1 = V4DF_SUB(t7, t5);
+    t3 = V4DF_ADD(t7, t5);
+
+    *r3 = V4DF_UNPACK_HI(t0, t1);
+    *r2 = V4DF_UNPACK_HI(t2, t3);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_L_2_4(int inv,
           const float *FFTS_RESTRICT i0,
@ -121,6 +204,46 @@ V4SF_L_2_4(int inv,
    *r2 = V4SF_UNPACK_HI(t2, t3);
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Loads four vectors from i0..i3, runs a two-level add/subtract network
+ * over them (with V4DF_IMULI applied to i2 - i3), then transposes the
+ * results pairwise with V4DF_TX2. Note the store order: the four results
+ * go to *r0, *r2, *r1, *r3.
+ */
+static FFTS_INLINE void
+V4DF_L_4_4(int inv,
+           const double *FFTS_RESTRICT i0,
+           const double *FFTS_RESTRICT i1,
+           const double *FFTS_RESTRICT i2,
+           const double *FFTS_RESTRICT i3,
+           V4DF *r0,
+           V4DF *r1,
+           V4DF *r2,
+           V4DF *r3)
+{
+    V4DF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4DF_LD(i0);
+    t1 = V4DF_LD(i1);
+    t2 = V4DF_LD(i2);
+    t3 = V4DF_LD(i3);
+
+    t4 = V4DF_ADD(t0, t1);
+    t5 = V4DF_SUB(t0, t1);
+    t6 = V4DF_ADD(t2, t3);
+
+    t7 = V4DF_IMULI(inv, V4DF_SUB(t2, t3));
+
+    /* t0..t3 are reused here; the loaded inputs are no longer needed */
+    t0 = V4DF_ADD(t4, t6);
+    t2 = V4DF_SUB(t4, t6);
+    t1 = V4DF_SUB(t5, t7);
+    t3 = V4DF_ADD(t5, t7);
+
+    V4DF_TX2(&t0, &t1);
+    V4DF_TX2(&t2, &t3);
+
+    /* t1 goes to r2 and t2 to r1 */
+    *r0 = t0;
+    *r2 = t1;
+    *r1 = t2;
+    *r3 = t3;
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_L_4_4(int inv,
           const float *FFTS_RESTRICT i0,
@ -159,6 +282,48 @@ V4SF_L_4_4(int inv,
    *r3 = t3;
 }

+#ifdef FFTS_DOUBLE
+/*
+ * Loads four vectors from i0..i3, first exchanging the second pairs of the
+ * i2 and i3 vectors via V4DF_BLEND. The second pairs of the first-level
+ * sums/differences are written to *r2 and *r3; the first pairs go through
+ * one more add/subtract level (with V4DF_IMULI applied to the blended
+ * difference) and are written to *r0 and *r1.
+ */
+static FFTS_INLINE void
+V4DF_L_4_2(int inv,
+           const double *FFTS_RESTRICT i0,
+           const double *FFTS_RESTRICT i1,
+           const double *FFTS_RESTRICT i2,
+           const double *FFTS_RESTRICT i3,
+           V4DF *r0,
+           V4DF *r1,
+           V4DF *r2,
+           V4DF *r3)
+{
+    V4DF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4DF_LD(i0);
+    t1 = V4DF_LD(i1);
+    t6 = V4DF_LD(i2);
+    t7 = V4DF_LD(i3);
+
+    /* t2 = (first pair of i2, second pair of i3); t3 is the mirror image */
+    t2 = V4DF_BLEND(t6, t7);
+    t3 = V4DF_BLEND(t7, t6);
+
+    t4 = V4DF_ADD(t0, t1);
+    t5 = V4DF_SUB(t0, t1);
+    t6 = V4DF_ADD(t2, t3);
+    t7 = V4DF_SUB(t2, t3);
+
+    /* must happen before t7 is overwritten by the V4DF_IMULI below */
+    *r2 = V4DF_UNPACK_HI(t4, t5);
+    *r3 = V4DF_UNPACK_HI(t6, t7);
+
+    t7 = V4DF_IMULI(inv, t7);
+
+    t0 = V4DF_ADD(t4, t6);
+    t2 = V4DF_SUB(t4, t6);
+    t1 = V4DF_SUB(t5, t7);
+    t3 = V4DF_ADD(t5, t7);
+
+    *r0 = V4DF_UNPACK_LO(t0, t1);
+    *r1 = V4DF_UNPACK_LO(t2, t3);
+}
+#endif
+
 static FFTS_INLINE void
 V4SF_L_4_2(int inv,
           const float *FFTS_RESTRICT i0,
@ -199,6 +364,9 @@ V4SF_L_4_2(int inv,
    *r1 = V4SF_UNPACK_LO(t2, t3);
 }

+/*
+ * Stores four V4DF values: r0 to o0, r1 to o1, r2 to o2 and r3 to o3.
+ * Wrapped in do { } while (0) so the four stores behave as one statement,
+ * e.g. under an unbraced if/else; call sites supply the trailing semicolon.
+ */
+#define V4DF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
+    do { \
+        V4DF_ST(o0, r0); V4DF_ST(o1, r1); \
+        V4DF_ST(o2, r2); V4DF_ST(o3, r3); \
+    } while (0)
+
 #define V4SF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
    V4SF_ST(o0, r0); V4SF_ST(o1, r1); V4SF_ST(o2, r2); V4SF_ST(o3, r3);