Update FFTS to the latest upstream version

Taken from https://github.com/linkotec/ffts

Fixes ppc64el support and a handful of other bugs
master
Timothy Pearson 1 month ago
parent c40a208abb
commit 2ef6dba872

@ -7,7 +7,7 @@ set(FFTS_MAJOR 0)
# Minor and micro components of the FFTS version number.
set(FFTS_MINOR 9)
set(FFTS_MICRO 0)
# FFTS_VERSION is assigned twice in this listing; the later assignment
# (the one without the "ffts-" prefix) is the value that stays in effect.
set(FFTS_VERSION "ffts-${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}")
set(FFTS_VERSION "${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}")
# Use the cmake/ directory next to this file as the module search path
# (this replaces, rather than extends, any previous CMAKE_MODULE_PATH).
set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
# Turn on the global USE_FOLDERS property.
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
@ -22,6 +22,16 @@ set(INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/include/ffts)
# lib/ directory underneath the install prefix.
set(LIB_INSTALL_DIR ${CMAKE_INSTALL_PREFIX}/lib)

# ---- common options ----

# Testing aid only: AVX switch, off by default.
option(ENABLE_AVX "Enables AVX instructions." OFF)

# Testing aid only: double precision switch, off by default.
option(ENABLE_DOUBLE "Enables double precision" OFF)

# NEON switch, off by default.
option(ENABLE_NEON "Enables the use of NEON instructions." OFF)
@ -48,24 +58,36 @@ option(ENABLE_STATIC
include(CheckCSourceCompiles)
include(CheckCSourceRuns)
include(CheckFunctionExists)
include(CheckIncludeFile)
include(CheckSymbolExists)
# Ensure defined when building FFTS (as opposed to using it from
# another project). Used to export functions from Windows DLL.
add_definitions(-DFFTS_BUILD)
# check existence of various headers
# Each probe stores its outcome in the HAVE_<NAME>_H variable named as the
# second argument.
# NOTE(review): malloc.h, stdint.h, stdlib.h, string.h, sys/mman.h and
# unistd.h are probed twice with the same result variable in this listing;
# only inttypes.h and mm_malloc.h appear once. Presumably the repeated
# probes are redundant -- confirm before removing either run.
check_include_file(malloc.h HAVE_MALLOC_H)
check_include_file(stdint.h HAVE_STDINT_H)
check_include_file(stdlib.h HAVE_STDLIB_H)
check_include_file(string.h HAVE_STRING_H)
check_include_file(sys/mman.h HAVE_SYS_MMAN_H)
check_include_file(unistd.h HAVE_UNISTD_H)
check_include_file(inttypes.h HAVE_INTTYPES_H)
check_include_file(malloc.h HAVE_MALLOC_H)
check_include_file(mm_malloc.h HAVE_MM_MALLOC_H)
check_include_file(stdint.h HAVE_STDINT_H)
check_include_file(stdlib.h HAVE_STDLIB_H)
check_include_file(string.h HAVE_STRING_H)
check_include_file(sys/mman.h HAVE_SYS_MMAN_H)
check_include_file(unistd.h HAVE_UNISTD_H)
# Pass every header probe that succeeded on to the compiler as a
# -DHAVE_<NAME>_H definition, in the order listed.
foreach(header_flag
    HAVE_INTTYPES_H
    HAVE_MALLOC_H
    HAVE_MM_MALLOC_H
    HAVE_STDINT_H)
  if(${header_flag})
    add_definitions(-D${header_flag})
  endif()
endforeach()
@ -86,6 +108,50 @@ if(HAVE_UNISTD_H)
add_definitions(-DHAVE_UNISTD_H)
endif(HAVE_UNISTD_H)
# Probe the named headers for declarations of the allocation routines.
check_symbol_exists(memalign malloc.h HAVE_DECL_MEMALIGN)
check_symbol_exists(posix_memalign stdlib.h HAVE_DECL_POSIX_MEMALIGN)
check_symbol_exists(valloc stdlib.h HAVE_DECL_VALLOC)
check_symbol_exists(_mm_malloc malloc.h HAVE_DECL__MM_MALLOC)

# Publish each declaration that was found as a -DHAVE_DECL_<NAME> definition.
foreach(decl_flag
    HAVE_DECL_MEMALIGN
    HAVE_DECL_POSIX_MEMALIGN
    HAVE_DECL_VALLOC
    HAVE_DECL__MM_MALLOC)
  if(${decl_flag})
    add_definitions(-D${decl_flag})
  endif()
endforeach()
# Probe for the allocation routines as linkable functions.
check_function_exists(memalign HAVE_MEMALIGN)
check_function_exists(posix_memalign HAVE_POSIX_MEMALIGN)
check_function_exists(valloc HAVE_VALLOC)
check_function_exists(_mm_malloc HAVE__MM_MALLOC)

# Publish each function that was found as a -DHAVE_<NAME> definition.
foreach(func_flag
    HAVE_MEMALIGN
    HAVE_POSIX_MEMALIGN
    HAVE_VALLOC
    HAVE__MM_MALLOC)
  if(${func_flag})
    add_definitions(-D${func_flag})
  endif()
endforeach()
# backup flags
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
@ -246,6 +312,14 @@ if(NOT CMAKE_CROSSCOMPILING)
# When HAVE_XMMINTRIN_H is set (presumably by a probe for xmmintrin.h made
# outside this excerpt), define HAVE_SSE and record the current
# CMAKE_REQUIRED_FLAGS as the saved baseline.
if(HAVE_XMMINTRIN_H)
  add_definitions(-DHAVE_SSE)
  set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})

  # TODO: relocate -- these opt-in switches do not belong in this branch.
  if(ENABLE_AVX)
    add_definitions(-DHAVE_AVX)
  endif()
  if(ENABLE_DOUBLE)
    add_definitions(-DFFTS_DOUBLE)
  endif()
endif()
# enable SSE2 code generation
@ -351,6 +425,10 @@ set(FFTS_HEADERS
set(FFTS_SOURCES
src/ffts_attributes.h
src/ffts.c
src/ffts_chirp_z.c
src/ffts_chirp_z.h
src/ffts_cpu.c
src/ffts_cpu.h
src/ffts_internal.h
src/ffts_nd.c
src/ffts_nd.h
@ -369,6 +447,17 @@ set(FFTS_SOURCES
src/types.h
)
# Dynamic code is kept only when the system processor name matches one of
# the x86-64 spellings; otherwise warn and switch it off.
if(NOT DISABLE_DYNAMIC_CODE)
  if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
    message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.")
    set(DISABLE_DYNAMIC_CODE ON)
  else()
    list(APPEND FFTS_SOURCES src/codegen_sse.h)
  endif()
endif()
if(ENABLE_NEON)
list(APPEND FFTS_SOURCES
src/neon.s
@ -393,19 +482,9 @@ elseif(HAVE_XMMINTRIN_H)
add_definitions(-DHAVE_SSE)
list(APPEND FFTS_SOURCES
src/macros-avx.h
src/macros-sse.h
)
if(NOT DISABLE_DYNAMIC_CODE)
if(CMAKE_SIZEOF_VOID_P EQUAL 8)
list(APPEND FFTS_SOURCES
src/codegen_sse.h
)
else()
message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.")
set(DISABLE_DYNAMIC_CODE ON)
endif(CMAKE_SIZEOF_VOID_P EQUAL 8)
endif(NOT DISABLE_DYNAMIC_CODE)
endif(ENABLE_NEON)
if(DISABLE_DYNAMIC_CODE)
@ -452,6 +531,41 @@ if(ENABLE_STATIC)
endif(ENABLE_STATIC)
if(ENABLE_STATIC OR ENABLE_SHARED)
# Optional dependencies of ffts_trig_test: the MPFR header and library
# (searched for with the install directories as extra hints) and OpenMP.
find_path(MPFR_INCLUDES
  NAMES mpfr.h
  PATHS ${INCLUDE_INSTALL_DIR}
)
find_library(MPFR_LIBRARIES mpfr PATHS ${LIB_INSTALL_DIR})
find_package(OpenMP)

# Advertise MPFR only when BOTH the header and the library were located.
# Defining HAVE_MPFR_H with a header but no library to link would
# presumably leave the test with unresolved MPFR symbols at link time.
if(MPFR_INCLUDES AND MPFR_LIBRARIES)
  add_definitions(-DHAVE_MPFR_H)
  include_directories(${MPFR_INCLUDES})
endif()

add_executable(ffts_trig_test
  tests/trig_test.c
)
target_link_libraries(ffts_trig_test ffts)

# Link MPFR under the same condition that defines HAVE_MPFR_H, so the
# definition and the link line can never disagree.
if(MPFR_INCLUDES AND MPFR_LIBRARIES)
  target_link_libraries(ffts_trig_test ${MPFR_LIBRARIES})
endif()

# With OpenMP available, compile the test with the OpenMP C flags. On MSVC
# only the OpenMP linker flags go on the link line; every other toolchain
# also repeats the compile flags there.
if(OPENMP_FOUND)
  if(MSVC)
    set_target_properties(ffts_trig_test PROPERTIES
      COMPILE_FLAGS "${OpenMP_C_FLAGS}"
      LINK_FLAGS "${OpenMP_EXE_LINKER_FLAGS}"
    )
  else()
    set_target_properties(ffts_trig_test PROPERTIES
      COMPILE_FLAGS "${OpenMP_C_FLAGS}"
      LINK_FLAGS "${OpenMP_C_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}"
    )
  endif()
endif()
add_executable(ffts_test
tests/test.c
)
@ -467,6 +581,14 @@ if(ENABLE_STATIC OR ENABLE_SHARED)
ffts
${FFTS_EXTRA_LIBRARIES}
)
# ffts_cpu_test is built straight from the ffts_cpu sources plus its test
# driver, with FFTS_BUILDING_CPU_TEST defined for this target only. No
# library is linked to it in this block.
add_executable(ffts_cpu_test src/ffts_cpu.c src/ffts_cpu.h tests/cpu_test.c)
set_property(TARGET ffts_cpu_test
  PROPERTY COMPILE_DEFINITIONS FFTS_BUILDING_CPU_TEST
)
endif(ENABLE_STATIC OR ENABLE_SHARED)
# generate packageconfig file

@ -1,12 +1,14 @@
#! /bin/sh
# Attempt to guess a canonical system name.
# Copyright 1992-2016 Free Software Foundation, Inc.
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
# 2011, 2012 Free Software Foundation, Inc.
timestamp='2016-04-02'
timestamp='2012-08-14'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
@ -20,17 +22,19 @@ timestamp='2016-04-02'
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that
# program. This Exception is an additional permission under section 7
# of the GNU General Public License, version 3 ("GPLv3").
# the same distribution terms that you use for the rest of that program.
# Originally written by Per Bothner. Please send patches (context
# diff format) to <config-patches@gnu.org> and include a ChangeLog
# entry.
#
# Originally written by Per Bothner; maintained since 2000 by Ben Elliston.
# This script attempts to guess a canonical system name similar to
# config.sub. If it succeeds, it prints the system name on stdout, and
# exits with 0. Otherwise, it exits with 1.
#
# You can get the latest version of this script from:
# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
#
# Please send patches to <config-patches@gnu.org>.
# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
me=`echo "$0" | sed -e 's,.*/,,'`
@ -50,7 +54,9 @@ version="\
GNU config.guess ($timestamp)
Originally written by Per Bothner.
Copyright 1992-2016 Free Software Foundation, Inc.
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@ -132,27 +138,6 @@ UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown
UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown
UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
case "${UNAME_SYSTEM}" in
Linux|GNU|GNU/*)
# If the system lacks a compiler, then just pick glibc.
# We could probably try harder.
LIBC=gnu
eval $set_cc_for_build
cat <<-EOF > $dummy.c
#include <features.h>
#if defined(__UCLIBC__)
LIBC=uclibc
#elif defined(__dietlibc__)
LIBC=dietlibc
#else
LIBC=gnu
#endif
EOF
eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC' | sed 's, ,,g'`
;;
esac
# Note: order is significant - the case branches are not exclusive.
case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
@ -168,27 +153,20 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
# Note: NetBSD doesn't particularly care about the vendor
# portion of the name. We always set it to "unknown".
sysctl="sysctl -n hw.machine_arch"
UNAME_MACHINE_ARCH=`(uname -p 2>/dev/null || \
/sbin/$sysctl 2>/dev/null || \
/usr/sbin/$sysctl 2>/dev/null || \
echo unknown)`
UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \
/usr/sbin/$sysctl 2>/dev/null || echo unknown)`
case "${UNAME_MACHINE_ARCH}" in
armeb) machine=armeb-unknown ;;
arm*) machine=arm-unknown ;;
sh3el) machine=shl-unknown ;;
sh3eb) machine=sh-unknown ;;
sh5el) machine=sh5le-unknown ;;
earmv*)
arch=`echo ${UNAME_MACHINE_ARCH} | sed -e 's,^e\(armv[0-9]\).*$,\1,'`
endian=`echo ${UNAME_MACHINE_ARCH} | sed -ne 's,^.*\(eb\)$,\1,p'`
machine=${arch}${endian}-unknown
;;
*) machine=${UNAME_MACHINE_ARCH}-unknown ;;
esac
# The Operating System including object format, if it has switched
# to ELF recently, or will in the future.
case "${UNAME_MACHINE_ARCH}" in
arm*|earm*|i386|m68k|ns32k|sh3*|sparc|vax)
arm*|i386|m68k|ns32k|sh3*|sparc|vax)
eval $set_cc_for_build
if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
| grep -q __ELF__
@ -204,13 +182,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
os=netbsd
;;
esac
# Determine ABI tags.
case "${UNAME_MACHINE_ARCH}" in
earm*)
expr='s/^earmv[0-9]/-eabi/;s/eb$//'
abi=`echo ${UNAME_MACHINE_ARCH} | sed -e "$expr"`
;;
esac
# The OS release
# Debian GNU/NetBSD machines have a different userland, and
# thus, need a distinct triplet. However, they do not need
@ -221,13 +192,13 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
release='-gnu'
;;
*)
release=`echo ${UNAME_RELEASE} | sed -e 's/[-_].*//' | cut -d. -f1,2`
release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'`
;;
esac
# Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM:
# contains redundant information, the shorter form:
# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used.
echo "${machine}-${os}${release}${abi}"
echo "${machine}-${os}${release}"
exit ;;
*:Bitrig:*:*)
UNAME_MACHINE_ARCH=`arch | sed 's/Bitrig.//'`
@ -237,10 +208,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'`
echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE}
exit ;;
*:LibertyBSD:*:*)
UNAME_MACHINE_ARCH=`arch | sed 's/^.*BSD\.//'`
echo ${UNAME_MACHINE_ARCH}-unknown-libertybsd${UNAME_RELEASE}
exit ;;
*:ekkoBSD:*:*)
echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE}
exit ;;
@ -253,9 +220,6 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
*:MirBSD:*:*)
echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE}
exit ;;
*:Sortix:*:*)
echo ${UNAME_MACHINE}-unknown-sortix
exit ;;
alpha:OSF1:*:*)
case $UNAME_RELEASE in
*4.0)
@ -272,42 +236,42 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1`
case "$ALPHA_CPU_TYPE" in
"EV4 (21064)")
UNAME_MACHINE=alpha ;;
UNAME_MACHINE="alpha" ;;
"EV4.5 (21064)")
UNAME_MACHINE=alpha ;;
UNAME_MACHINE="alpha" ;;
"LCA4 (21066/21068)")
UNAME_MACHINE=alpha ;;
UNAME_MACHINE="alpha" ;;
"EV5 (21164)")
UNAME_MACHINE=alphaev5 ;;
UNAME_MACHINE="alphaev5" ;;
"EV5.6 (21164A)")
UNAME_MACHINE=alphaev56 ;;
UNAME_MACHINE="alphaev56" ;;
"EV5.6 (21164PC)")
UNAME_MACHINE=alphapca56 ;;
UNAME_MACHINE="alphapca56" ;;
"EV5.7 (21164PC)")
UNAME_MACHINE=alphapca57 ;;
UNAME_MACHINE="alphapca57" ;;
"EV6 (21264)")
UNAME_MACHINE=alphaev6 ;;
UNAME_MACHINE="alphaev6" ;;
"EV6.7 (21264A)")
UNAME_MACHINE=alphaev67 ;;
UNAME_MACHINE="alphaev67" ;;
"EV6.8CB (21264C)")
UNAME_MACHINE=alphaev68 ;;
UNAME_MACHINE="alphaev68" ;;
"EV6.8AL (21264B)")
UNAME_MACHINE=alphaev68 ;;
UNAME_MACHINE="alphaev68" ;;
"EV6.8CX (21264D)")
UNAME_MACHINE=alphaev68 ;;
UNAME_MACHINE="alphaev68" ;;
"EV6.9A (21264/EV69A)")
UNAME_MACHINE=alphaev69 ;;
UNAME_MACHINE="alphaev69" ;;
"EV7 (21364)")
UNAME_MACHINE=alphaev7 ;;
UNAME_MACHINE="alphaev7" ;;
"EV7.9 (21364A)")
UNAME_MACHINE=alphaev79 ;;
UNAME_MACHINE="alphaev79" ;;
esac
# A Pn.n version is a patched version.
# A Vn.n version is a released version.
# A Tn.n version is a released field test version.
# A Xn.n version is an unreleased experimental baselevel.
# 1.2 uses "1.2" for uname -r.
echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
exitcode=$?
trap '' 0
@ -342,7 +306,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
echo arm-acorn-riscix${UNAME_RELEASE}
exit ;;
arm*:riscos:*:*|arm*:RISCOS:*:*)
arm:riscos:*:*|arm:RISCOS:*:*)
echo arm-unknown-riscos
exit ;;
SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*)
@ -380,16 +344,16 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
exit ;;
i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
eval $set_cc_for_build
SUN_ARCH=i386
SUN_ARCH="i386"
# If there is a compiler, see if it is configured for 64-bit objects.
# Note that the Sun cc does not turn __LP64__ into 1 like gcc does.
# This test works for both compilers.
if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
if (echo '#ifdef __amd64'; echo IS_64BIT_ARCH; echo '#endif') | \
(CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
grep IS_64BIT_ARCH >/dev/null
then
SUN_ARCH=x86_64
SUN_ARCH="x86_64"
fi
fi
echo ${SUN_ARCH}-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
@ -414,7 +378,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
exit ;;
sun*:*:4.2BSD:*)
UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null`
test "x${UNAME_RELEASE}" = x && UNAME_RELEASE=3
test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3
case "`/bin/arch`" in
sun3)
echo m68k-sun-sunos${UNAME_RELEASE}
@ -600,9 +564,8 @@ EOF
else
IBM_ARCH=powerpc
fi
if [ -x /usr/bin/lslpp ] ; then
IBM_REV=`/usr/bin/lslpp -Lqc bos.rte.libc |
awk -F: '{ print $3 }' | sed s/[0-9]*$/0/`
if [ -x /usr/bin/oslevel ] ; then
IBM_REV=`/usr/bin/oslevel`
else
IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE}
fi
@ -639,13 +602,13 @@ EOF
sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
case "${sc_cpu_version}" in
523) HP_ARCH=hppa1.0 ;; # CPU_PA_RISC1_0
528) HP_ARCH=hppa1.1 ;; # CPU_PA_RISC1_1
523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
532) # CPU_PA_RISC2_0
case "${sc_kernel_bits}" in
32) HP_ARCH=hppa2.0n ;;
64) HP_ARCH=hppa2.0w ;;
'') HP_ARCH=hppa2.0 ;; # HP-UX 10.20
32) HP_ARCH="hppa2.0n" ;;
64) HP_ARCH="hppa2.0w" ;;
'') HP_ARCH="hppa2.0" ;; # HP-UX 10.20
esac ;;
esac
fi
@ -684,11 +647,11 @@ EOF
exit (0);
}
EOF
(CCOPTS="" $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
(CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
test -z "$HP_ARCH" && HP_ARCH=hppa
fi ;;
esac
if [ ${HP_ARCH} = hppa2.0w ]
if [ ${HP_ARCH} = "hppa2.0w" ]
then
eval $set_cc_for_build
@ -701,12 +664,12 @@ EOF
# $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess
# => hppa64-hp-hpux11.23
if echo __LP64__ | (CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) |
if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
grep -q __LP64__
then
HP_ARCH=hppa2.0w
HP_ARCH="hppa2.0w"
else
HP_ARCH=hppa64
HP_ARCH="hppa64"
fi
fi
echo ${HP_ARCH}-hp-hpux${HPUX_REV}
@ -811,14 +774,14 @@ EOF
echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
exit ;;
F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
FUJITSU_PROC=`uname -m | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz`
FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
exit ;;
5000:UNIX_System_V:4.*:*)
FUJITSU_SYS=`uname -p | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/\///'`
FUJITSU_REL=`echo ${UNAME_RELEASE} | tr ABCDEFGHIJKLMNOPQRSTUVWXYZ abcdefghijklmnopqrstuvwxyz | sed -e 's/ /_/'`
FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
exit ;;
i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
@ -848,7 +811,7 @@ EOF
*:MINGW*:*)
echo ${UNAME_MACHINE}-pc-mingw32
exit ;;
*:MSYS*:*)
i*:MSYS*:*)
echo ${UNAME_MACHINE}-pc-msys
exit ;;
i*:windows32*:*)
@ -896,21 +859,21 @@ EOF
exit ;;
*:GNU:*:*)
# the GNU system
echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-${LIBC}`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'`
exit ;;
*:GNU/*:*:*)
# other systems with GNU libc and userland
echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr "[:upper:]" "[:lower:]"``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-${LIBC}
echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu
exit ;;
i*86:Minix:*:*)
echo ${UNAME_MACHINE}-pc-minix
exit ;;
aarch64:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
aarch64_be:Linux:*:*)
UNAME_MACHINE=aarch64_be
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
alpha:Linux:*:*)
case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
@ -923,60 +886,59 @@ EOF
EV68*) UNAME_MACHINE=alphaev68 ;;
esac
objdump --private-headers /bin/sh | grep -q ld.so.1
if test "$?" = 0 ; then LIBC=gnulibc1 ; fi
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
exit ;;
arc:Linux:*:* | arceb:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
exit ;;
arm*:Linux:*:*)
eval $set_cc_for_build
if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
| grep -q __ARM_EABI__
then
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
else
if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
| grep -q __ARM_PCS_VFP
then
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabi
echo ${UNAME_MACHINE}-unknown-linux-gnueabi
else
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}eabihf
echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
fi
fi
exit ;;
avr32*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
cris:Linux:*:*)
echo ${UNAME_MACHINE}-axis-linux-${LIBC}
echo ${UNAME_MACHINE}-axis-linux-gnu
exit ;;
crisv32:Linux:*:*)
echo ${UNAME_MACHINE}-axis-linux-${LIBC}
exit ;;
e2k:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-axis-linux-gnu
exit ;;
frv:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
hexagon:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
i*86:Linux:*:*)
echo ${UNAME_MACHINE}-pc-linux-${LIBC}
LIBC=gnu
eval $set_cc_for_build
sed 's/^ //' << EOF >$dummy.c
#ifdef __dietlibc__
LIBC=dietlibc
#endif
EOF
eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
exit ;;
ia64:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
exit ;;
k1om:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
m32r*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
m68*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
mips:Linux:*:* | mips64:Linux:*:*)
eval $set_cc_for_build
@ -995,63 +957,54 @@ EOF
#endif
EOF
eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
test x"${CPU}" != x && { echo "${CPU}-unknown-linux-${LIBC}"; exit; }
test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
;;
openrisc*:Linux:*:*)
echo or1k-unknown-linux-${LIBC}
exit ;;
or32:Linux:*:* | or1k*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
or32:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
padre:Linux:*:*)
echo sparc-unknown-linux-${LIBC}
echo sparc-unknown-linux-gnu
exit ;;
parisc64:Linux:*:* | hppa64:Linux:*:*)
echo hppa64-unknown-linux-${LIBC}
echo hppa64-unknown-linux-gnu
exit ;;
parisc:Linux:*:* | hppa:Linux:*:*)
# Look for CPU level
case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
PA7*) echo hppa1.1-unknown-linux-${LIBC} ;;
PA8*) echo hppa2.0-unknown-linux-${LIBC} ;;
*) echo hppa-unknown-linux-${LIBC} ;;
PA7*) echo hppa1.1-unknown-linux-gnu ;;
PA8*) echo hppa2.0-unknown-linux-gnu ;;
*) echo hppa-unknown-linux-gnu ;;
esac
exit ;;
ppc64:Linux:*:*)
echo powerpc64-unknown-linux-${LIBC}
echo powerpc64-unknown-linux-gnu
exit ;;
ppc:Linux:*:*)
echo powerpc-unknown-linux-${LIBC}
exit ;;
ppc64le:Linux:*:*)
echo powerpc64le-unknown-linux-${LIBC}
exit ;;
ppcle:Linux:*:*)
echo powerpcle-unknown-linux-${LIBC}
echo powerpc-unknown-linux-gnu
exit ;;
s390:Linux:*:* | s390x:Linux:*:*)
echo ${UNAME_MACHINE}-ibm-linux-${LIBC}
echo ${UNAME_MACHINE}-ibm-linux
exit ;;
sh64*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
sh*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
sparc:Linux:*:* | sparc64:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
tile*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
vax:Linux:*:*)
echo ${UNAME_MACHINE}-dec-linux-${LIBC}
echo ${UNAME_MACHINE}-dec-linux-gnu
exit ;;
x86_64:Linux:*:*)
echo ${UNAME_MACHINE}-pc-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
xtensa*:Linux:*:*)
echo ${UNAME_MACHINE}-unknown-linux-${LIBC}
echo ${UNAME_MACHINE}-unknown-linux-gnu
exit ;;
i*86:DYNIX/ptx:4*:*)
# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
@ -1127,7 +1080,7 @@ EOF
# uname -m prints for DJGPP always 'pc', but it prints nothing about
# the processor, so we play safe by assuming i586.
# Note: whatever this is, it MUST be the same as what config.sub
# prints for the "djgpp" host, or else GDB configure will decide that
# prints for the "djgpp" host, or else GDB configury will decide that
# this is a cross-build.
echo i586-pc-msdosdjgpp
exit ;;
@ -1276,9 +1229,6 @@ EOF
SX-8R:SUPER-UX:*:*)
echo sx8r-nec-superux${UNAME_RELEASE}
exit ;;
SX-ACE:SUPER-UX:*:*)
echo sxace-nec-superux${UNAME_RELEASE}
exit ;;
Power*:Rhapsody:*:*)
echo powerpc-apple-rhapsody${UNAME_RELEASE}
exit ;;
@ -1287,36 +1237,24 @@ EOF
exit ;;
*:Darwin:*:*)
UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
eval $set_cc_for_build
if test "$UNAME_PROCESSOR" = unknown ; then
UNAME_PROCESSOR=powerpc
fi
if test `echo "$UNAME_RELEASE" | sed -e 's/\..*//'` -le 10 ; then
if [ "$CC_FOR_BUILD" != no_compiler_found ]; then
if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
(CCOPTS="" $CC_FOR_BUILD -E - 2>/dev/null) | \
grep IS_64BIT_ARCH >/dev/null
then
case $UNAME_PROCESSOR in
i386) UNAME_PROCESSOR=x86_64 ;;
powerpc) UNAME_PROCESSOR=powerpc64 ;;
esac
fi
fi
elif test "$UNAME_PROCESSOR" = i386 ; then
# Avoid executing cc on OS X 10.9, as it ships with a stub
# that puts up a graphical alert prompting to install
# developer tools. Any system running Mac OS X 10.7 or
# later (Darwin 11 and later) is required to have a 64-bit
# processor. This is not true of the ARM version of Darwin
# that Apple uses in portable devices.
UNAME_PROCESSOR=x86_64
fi
case $UNAME_PROCESSOR in
i386)
eval $set_cc_for_build
if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
(CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
grep IS_64BIT_ARCH >/dev/null
then
UNAME_PROCESSOR="x86_64"
fi
fi ;;
unknown) UNAME_PROCESSOR=powerpc ;;
esac
echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
exit ;;
*:procnto*:*:* | *:QNX:[0123456789]*:*)
UNAME_PROCESSOR=`uname -p`
if test "$UNAME_PROCESSOR" = x86; then
if test "$UNAME_PROCESSOR" = "x86"; then
UNAME_PROCESSOR=i386
UNAME_MACHINE=pc
fi
@ -1347,7 +1285,7 @@ EOF
# "uname -m" is not consistent, so use $cputype instead. 386
# is converted to i386 for consistency with other x86
# operating systems.
if test "$cputype" = 386; then
if test "$cputype" = "386"; then
UNAME_MACHINE=i386
else
UNAME_MACHINE="$cputype"
@ -1389,7 +1327,7 @@ EOF
echo i386-pc-xenix
exit ;;
i*86:skyos:*:*)
echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE} | sed -e 's/ .*$//'`
echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//'
exit ;;
i*86:rdos:*:*)
echo ${UNAME_MACHINE}-pc-rdos
@ -1400,11 +1338,156 @@ EOF
x86_64:VMkernel:*:*)
echo ${UNAME_MACHINE}-unknown-esx
exit ;;
amd64:Isilon\ OneFS:*:*)
echo x86_64-unknown-onefs
exit ;;
esac
# Fallback reached when the case statement above did not exit: write a small
# C program to $dummy.c whose preprocessor conditionals pick a system name
# from compiler-predefined macros, then compile and run it.
# $set_cc_for_build is defined outside this excerpt; presumably it selects
# CC_FOR_BUILD and sets up $dummy -- confirm against the top of the script.
# The here-document delimiter is unquoted, so the backquoted hostinfo
# pipeline in the NeXT branch is run by the shell while the file is being
# written and its output is pasted into the C source.
eval $set_cc_for_build
cat >$dummy.c <<EOF
#ifdef _SEQUENT_
# include <sys/types.h>
# include <sys/utsname.h>
#endif
main ()
{
#if defined (sony)
#if defined (MIPSEB)
/* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed,
I don't know.... */
printf ("mips-sony-bsd\n"); exit (0);
#else
#include <sys/param.h>
printf ("m68k-sony-newsos%s\n",
#ifdef NEWSOS4
"4"
#else
""
#endif
); exit (0);
#endif
#endif
#if defined (__arm) && defined (__acorn) && defined (__unix)
printf ("arm-acorn-riscix\n"); exit (0);
#endif
#if defined (hp300) && !defined (hpux)
printf ("m68k-hp-bsd\n"); exit (0);
#endif
#if defined (NeXT)
#if !defined (__ARCHITECTURE__)
#define __ARCHITECTURE__ "m68k"
#endif
int version;
version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`;
if (version < 4)
printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version);
else
printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version);
exit (0);
#endif
#if defined (MULTIMAX) || defined (n16)
#if defined (UMAXV)
printf ("ns32k-encore-sysv\n"); exit (0);
#else
#if defined (CMU)
printf ("ns32k-encore-mach\n"); exit (0);
#else
printf ("ns32k-encore-bsd\n"); exit (0);
#endif
#endif
#endif
#if defined (__386BSD__)
printf ("i386-pc-bsd\n"); exit (0);
#endif
#if defined (sequent)
#if defined (i386)
printf ("i386-sequent-dynix\n"); exit (0);
#endif
#if defined (ns32000)
printf ("ns32k-sequent-dynix\n"); exit (0);
#endif
#endif
#if defined (_SEQUENT_)
struct utsname un;
uname(&un);
if (strncmp(un.version, "V2", 2) == 0) {
printf ("i386-sequent-ptx2\n"); exit (0);
}
if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? */
printf ("i386-sequent-ptx1\n"); exit (0);
}
printf ("i386-sequent-ptx\n"); exit (0);
#endif
#if defined (vax)
# if !defined (ultrix)
# include <sys/param.h>
# if defined (BSD)
# if BSD == 43
printf ("vax-dec-bsd4.3\n"); exit (0);
# else
# if BSD == 199006
printf ("vax-dec-bsd4.3reno\n"); exit (0);
# else
printf ("vax-dec-bsd\n"); exit (0);
# endif
# endif
# else
printf ("vax-dec-bsd\n"); exit (0);
# endif
# else
printf ("vax-dec-ultrix\n"); exit (0);
# endif
#endif
#if defined (alliant) && defined (i860)
printf ("i860-alliant-bsd\n"); exit (0);
#endif
exit (1);
}
EOF
# Print the probe's answer and stop only if the program both compiled and
# exited with status 0; the program exits 1 when no conditional matched,
# which breaks the && chain and lets the script continue.
$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` &&
{ echo "$SYSTEM_NAME"; exit; }
# Apollos put the system type in the environment.
# If the /usr/apollo directory exists, build the name from the ISP and
# SYSTYPE environment variables and stop.
test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; }
# Convex versions that predate uname can use getsysinfo(1)
# The cpu_type reported by getsysinfo selects the Convex model name; a type
# matching none of the patterns falls through to the code after this block.
# NOTE(review): the guard tests /usr/convex/getsysinfo, but the calls below
# invoke getsysinfo through PATH -- presumably /usr/convex is on PATH; confirm.
if [ -x /usr/convex/getsysinfo ]
then
case `getsysinfo -f cpu_type` in
c1*)
echo c1-convex-bsd
exit ;;
c2*)
# A successful scalar_acc query selects the c32 name instead of c2.
if getsysinfo -f scalar_acc
then echo c32-convex-bsd
else echo c2-convex-bsd
fi
exit ;;
c34*)
echo c34-convex-bsd
exit ;;
c38*)
echo c38-convex-bsd
exit ;;
c4*)
echo c4-convex-bsd
exit ;;
esac
fi
cat >&2 <<EOF
$0: unable to guess system type
@ -1412,9 +1495,9 @@ This script, last modified $timestamp, has failed to recognize
the operating system you are using. It is advised that you
download the most up to date version of the config scripts from
http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess
http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
and
http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
If the version you run ($0) is already up to date, please
send the following data and any information you think might be

139
lib/ffts/config.sub vendored

@ -1,18 +1,24 @@
#! /bin/sh
# Configuration validation subroutine script.
# Copyright 1992-2016 Free Software Foundation, Inc.
# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
# 2011, 2012 Free Software Foundation, Inc.
timestamp='2016-03-30'
timestamp='2012-08-18'
# This file is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 3 of the License, or
# This file is (in principle) common to ALL GNU software.
# The presence of a machine in this file suggests that SOME GNU software
# can handle that machine. It does not imply ALL GNU software can.
#
# This file is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, see <http://www.gnu.org/licenses/>.
@ -20,12 +26,11 @@ timestamp='2016-03-30'
# As a special exception to the GNU General Public License, if you
# distribute this file as part of a program that contains a
# configuration script generated by Autoconf, you may include it under
# the same distribution terms that you use for the rest of that
# program. This Exception is an additional permission under section 7
# of the GNU General Public License, version 3 ("GPLv3").
# the same distribution terms that you use for the rest of that program.
# Please send patches to <config-patches@gnu.org>.
# Please send patches to <config-patches@gnu.org>. Submit a context
# diff and a properly formatted GNU ChangeLog entry.
#
# Configuration subroutine to validate and canonicalize a configuration type.
# Supply the specified configuration type as an argument.
@ -33,7 +38,7 @@ timestamp='2016-03-30'
# Otherwise, we print the canonical config type on stdout and succeed.
# You can get the latest version of this script from:
# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub
# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.sub;hb=HEAD
# This file is supposed to be the same for all GNU packages
# and recognize all the CPU types, system types and aliases
@ -53,7 +58,8 @@ timestamp='2016-03-30'
me=`echo "$0" | sed -e 's,.*/,,'`
usage="\
Usage: $0 [OPTION] CPU-MFR-OPSYS or ALIAS
Usage: $0 [OPTION] CPU-MFR-OPSYS
$0 [OPTION] ALIAS
Canonicalize a configuration name.
@ -67,7 +73,9 @@ Report bugs and patches to <config-patches@gnu.org>."
version="\
GNU config.sub ($timestamp)
Copyright 1992-2016 Free Software Foundation, Inc.
Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
Free Software Foundation, Inc.
This is free software; see the source for copying conditions. There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@ -116,7 +124,7 @@ maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'`
case $maybe_os in
nto-qnx* | linux-gnu* | linux-android* | linux-dietlibc | linux-newlib* | \
linux-musl* | linux-uclibc* | uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | \
knetbsd*-gnu* | netbsd*-gnu* | netbsd*-eabi* | \
knetbsd*-gnu* | netbsd*-gnu* | \
kopensolaris*-gnu* | \
storm-chaos* | os2-emx* | rtmk-nova*)
os=-$maybe_os
@ -148,7 +156,7 @@ case $os in
-convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\
-c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \
-harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \
-apple | -axis | -knuth | -cray | -microblaze*)
-apple | -axis | -knuth | -cray | -microblaze)
os=
basic_machine=$1
;;
@ -251,25 +259,21 @@ case $basic_machine in
| alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \
| alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \
| am33_2.0 \
| arc | arceb \
| arm | arm[bl]e | arme[lb] | armv[2-8] | armv[3-8][lb] | armv7[arm] \
| avr | avr32 \
| ba \
| be32 | be64 \
| arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \
| be32 | be64 \
| bfin \
| c4x | c8051 | clipper \
| c4x | clipper \
| d10v | d30v | dlx | dsp16xx \
| e2k | epiphany \
| fido | fr30 | frv | ft32 \
| epiphany \
| fido | fr30 | frv \
| h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \
| hexagon \
| i370 | i860 | i960 | ia64 \
| ip2k | iq2000 \
| k1om \
| le32 | le64 \
| lm32 \
| m32c | m32r | m32rle | m68000 | m68k | m88k \
| maxq | mb | microblaze | microblazeel | mcore | mep | metag \
| maxq | mb | microblaze | mcore | mep | metag \
| mips | mipsbe | mipseb | mipsel | mipsle \
| mips16 \
| mips64 | mips64el \
@ -283,29 +287,26 @@ case $basic_machine in
| mips64vr5900 | mips64vr5900el \
| mipsisa32 | mipsisa32el \
| mipsisa32r2 | mipsisa32r2el \
| mipsisa32r6 | mipsisa32r6el \
| mipsisa64 | mipsisa64el \
| mipsisa64r2 | mipsisa64r2el \
| mipsisa64r6 | mipsisa64r6el \
| mipsisa64sb1 | mipsisa64sb1el \
| mipsisa64sr71k | mipsisa64sr71kel \
| mipsr5900 | mipsr5900el \
| mipstx39 | mipstx39el \
| mn10200 | mn10300 \
| moxie \
| mt \
| msp430 \
| nds32 | nds32le | nds32be \
| nios | nios2 | nios2eb | nios2el \
| nios | nios2 \
| ns16k | ns32k \
| open8 | or1k | or1knd | or32 \
| open8 \
| or32 \
| pdp10 | pdp11 | pj | pjl \
| powerpc | powerpc64 | powerpc64le | powerpcle \
| pyramid \
| riscv32 | riscv64 \
| rl78 | rx \
| score \
| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[234]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
| sh | sh[1234] | sh[24]a | sh[24]aeb | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \
| sh64 | sh64le \
| sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \
| sparcv8 | sparcv9 | sparcv9b | sparcv9v \
@ -313,7 +314,6 @@ case $basic_machine in
| tahoe | tic4x | tic54x | tic55x | tic6x | tic80 | tron \
| ubicom32 \
| v850 | v850e | v850e1 | v850e2 | v850es | v850e2v3 \
| visium \
| we32k \
| x86 | xc16x | xstormy16 | xtensa \
| z8k | z80)
@ -328,10 +328,7 @@ case $basic_machine in
c6x)
basic_machine=tic6x-unknown
;;
leon|leon[3-9])
basic_machine=sparc-$basic_machine
;;
m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | nvptx | picochip)
m6811 | m68hc11 | m6812 | m68hc12 | m68hcs12x | picochip)
basic_machine=$basic_machine-unknown
os=-none
;;
@ -373,29 +370,26 @@ case $basic_machine in
| aarch64-* | aarch64_be-* \
| alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \
| alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \
| alphapca5[67]-* | alpha64pca5[67]-* | arc-* | arceb-* \
| alphapca5[67]-* | alpha64pca5[67]-* | arc-* \
| arm-* | armbe-* | armle-* | armeb-* | armv*-* \
| avr-* | avr32-* \
| ba-* \
| be32-* | be64-* \
| bfin-* | bs2000-* \
| c[123]* | c30-* | [cjt]90-* | c4x-* \
| c8051-* | clipper-* | craynv-* | cydra-* \
| clipper-* | craynv-* | cydra-* \
| d10v-* | d30v-* | dlx-* \
| e2k-* | elxsi-* \
| elxsi-* \
| f30[01]-* | f700-* | fido-* | fr30-* | frv-* | fx80-* \
| h8300-* | h8500-* \
| hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \
| hexagon-* \
| i*86-* | i860-* | i960-* | ia64-* \
| ip2k-* | iq2000-* \
| k1om-* \
| le32-* | le64-* \
| lm32-* \
| m32c-* | m32r-* | m32rle-* \
| m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \
| m88110-* | m88k-* | maxq-* | mcore-* | metag-* \
| microblaze-* | microblazeel-* \
| m88110-* | m88k-* | maxq-* | mcore-* | metag-* | microblaze-* \
| mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \
| mips16-* \
| mips64-* | mips64el-* \
@ -409,33 +403,28 @@ case $basic_machine in
| mips64vr5900-* | mips64vr5900el-* \
| mipsisa32-* | mipsisa32el-* \
| mipsisa32r2-* | mipsisa32r2el-* \
| mipsisa32r6-* | mipsisa32r6el-* \
| mipsisa64-* | mipsisa64el-* \
| mipsisa64r2-* | mipsisa64r2el-* \
| mipsisa64r6-* | mipsisa64r6el-* \
| mipsisa64sb1-* | mipsisa64sb1el-* \
| mipsisa64sr71k-* | mipsisa64sr71kel-* \
| mipsr5900-* | mipsr5900el-* \
| mipstx39-* | mipstx39el-* \
| mmix-* \
| mt-* \
| msp430-* \
| nds32-* | nds32le-* | nds32be-* \
| nios-* | nios2-* | nios2eb-* | nios2el-* \
| nios-* | nios2-* \
| none-* | np1-* | ns16k-* | ns32k-* \
| open8-* \
| or1k*-* \
| orion-* \
| pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \
| powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* \
| pyramid-* \
| riscv32-* | riscv64-* \
| rl78-* | romp-* | rs6000-* | rx-* \
| sh-* | sh[1234]-* | sh[24]a-* | sh[24]aeb-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \
| shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \
| sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \
| sparclite-* \
| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx*-* \
| sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | sv1-* | sx?-* \
| tahoe-* \
| tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \
| tile*-* \
@ -443,7 +432,6 @@ case $basic_machine in
| ubicom32-* \
| v850-* | v850e-* | v850e1-* | v850es-* | v850e2-* | v850e2v3-* \
| vax-* \
| visium-* \
| we32k-* \
| x86-* | x86_64-* | xc16x-* | xps100-* \
| xstormy16-* | xtensa*-* \
@ -520,9 +508,6 @@ case $basic_machine in
basic_machine=i386-pc
os=-aros
;;
asmjs)
basic_machine=asmjs-unknown
;;
aux)
basic_machine=m68k-apple
os=-aux
@ -784,9 +769,6 @@ case $basic_machine in
basic_machine=m68k-isi
os=-sysv
;;
leon-*|leon[3-9]-*)
basic_machine=sparc-`echo $basic_machine | sed 's/-.*//'`
;;
m68knommu)
basic_machine=m68k-unknown
os=-linux
@ -806,7 +788,7 @@ case $basic_machine in
basic_machine=ns32k-utek
os=-sysv
;;
microblaze*)
microblaze)
basic_machine=microblaze-xilinx
;;
mingw64)
@ -814,7 +796,7 @@ case $basic_machine in
os=-mingw64
;;
mingw32)
basic_machine=i686-pc
basic_machine=i386-pc
os=-mingw32
;;
mingw32ce)
@ -842,10 +824,6 @@ case $basic_machine in
basic_machine=powerpc-unknown
os=-morphos
;;
moxiebox)
basic_machine=moxie-unknown
os=-moxiebox
;;
msdos)
basic_machine=i386-pc
os=-msdos
@ -854,7 +832,7 @@ case $basic_machine in
basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'`
;;
msys)
basic_machine=i686-pc
basic_machine=i386-pc
os=-msys
;;
mvs)
@ -1045,11 +1023,7 @@ case $basic_machine in
basic_machine=i586-unknown
os=-pw32
;;
rdos | rdos64)
basic_machine=x86_64-pc
os=-rdos
;;
rdos32)
rdos)
basic_machine=i386-pc
os=-rdos
;;
@ -1376,13 +1350,13 @@ case $os in
-gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \
| -*vms* | -sco* | -esix* | -isc* | -aix* | -cnk* | -sunos | -sunos[34]*\
| -hpux* | -unos* | -osf* | -luna* | -dgux* | -auroraux* | -solaris* \
| -sym* | -kopensolaris* | -plan9* \
| -sym* | -kopensolaris* \
| -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \
| -aos* | -aros* | -cloudabi* | -sortix* \
| -aos* | -aros* \
| -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \
| -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \
| -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \
| -bitrig* | -openbsd* | -solidbsd* | -libertybsd* \
| -bitrig* | -openbsd* | -solidbsd* \
| -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \
| -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \
| -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \
@ -1391,15 +1365,14 @@ case $os in
| -cygwin* | -msys* | -pe* | -psos* | -moss* | -proelf* | -rtems* \
| -mingw32* | -mingw64* | -linux-gnu* | -linux-android* \
| -linux-newlib* | -linux-musl* | -linux-uclibc* \
| -uxpv* | -beos* | -mpeix* | -udk* | -moxiebox* \
| -uxpv* | -beos* | -mpeix* | -udk* \
| -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \
| -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \
| -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \
| -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \
| -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \
| -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \
| -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es* \
| -onefs* | -tirtos*)
| -skyos* | -haiku* | -rdos* | -toppers* | -drops* | -es*)
# Remember, each alternative MUST END IN *, to match a version number.
;;
-qnx*)
@ -1523,6 +1496,9 @@ case $os in
-aros*)
os=-aros
;;
-kaos*)
os=-kaos
;;
-zvmoe)
os=-zvmoe
;;
@ -1531,8 +1507,6 @@ case $os in
;;
-nacl*)
;;
-ios)
;;
-none)
;;
*)
@ -1573,9 +1547,6 @@ case $basic_machine in
c4x-* | tic4x-*)
os=-coff
;;
c8051-*)
os=-elf
;;
hexagon-*)
os=-elf
;;

@ -1,7 +1,7 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${exec_prefix}
libdir=${libdir}
includedir=${includedir}
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include
Name: @CMAKE_PROJECT_NAME@
Description: fast Fourier transform library

@ -3,6 +3,7 @@
This file is part of FFTS.
Copyright (c) 2012, Anthony M. Blake
Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -75,6 +76,9 @@ typedef struct _ffts_plan_t ffts_plan_t;
FFTS_API ffts_plan_t*
ffts_init_1d(size_t N, int sign);
FFTS_API ffts_plan_t*
ffts_init_1d_64f(size_t N, int sign);
FFTS_API ffts_plan_t*
ffts_init_2d(size_t N1, size_t N2, int sign);

@ -2,7 +2,7 @@
lib_LTLIBRARIES = libffts.la
libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c ffts_transpose.c ffts_trig.c ffts_static.c
libffts_la_SOURCES = ffts.c ffts_nd.c ffts_real.c ffts_real_nd.c ffts_transpose.c ffts_trig.c ffts_static.c ffts_chirp_z.c
libffts_la_SOURCES += codegen.h codegen_arm.h codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h macros-neon.h macros-sse.h macros.h neon.h neon_float.h patterns.h types.h vfp.h
if DYNAMIC_DISABLED
@ -14,7 +14,7 @@ endif
libffts_includedir=$(includedir)/ffts
libffts_include_HEADERS = ../include/ffts.h
AM_CFLAGS = -I$(top_srcdir)/include
AM_CFLAGS = -I$(top_srcdir)/include -DAUTOTOOLS_BUILD=yes
if HAVE_VFP
libffts_la_SOURCES += vfp.s

@ -139,9 +139,9 @@ transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N
#ifdef HAVE_SSE
if (sign < 0) {
p->constants = sse_constants;
p->constants = (const void*) sse_constants;
} else {
p->constants = sse_constants_inv;
p->constants = (const void*) sse_constants_inv;
}
#endif

@ -488,7 +488,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[0], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RDX, offsets[2], X64_RAX, 2);
x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[3], X64_RAX, 2);
@ -507,14 +507,14 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[6], X64_RAX, 2);
x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[7], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM8);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_movsxd_reg_memindex(ins, X64_R10, X64_R9, 0, X64_RAX, 2);
@ -530,7 +530,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0 ? 8 : 0);
extend--;
x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
@ -538,10 +538,10 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM8);
x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
@ -551,7 +551,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_shufps_reg_reg_imm(ins, X64_XMM8, X64_XMM8, 0xB1);
x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
@ -580,7 +580,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);
x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
@ -588,7 +588,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);
x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0 ? 8 : 0);
extend--;
x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
@ -620,7 +620,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[0], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM12, X64_RSI, offsets[2], X64_RAX, 2);
x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[3], X64_RAX, 2);
@ -640,14 +640,14 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
x64_sse_movaps_reg_memindex(ins, X64_XMM3, X64_RSI, offsets[6], X64_RAX, 2);
x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[7], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM15, X64_XMM3);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM12, X64_XMM12, 0xB1);
x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_movsxd_reg_memindex(ins, X64_R11, X64_R8, 0, X64_RAX, 2);
@ -663,7 +663,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movaps_reg_reg(ins, X64_XMM1, X64_XMM9);
x64_sse_movaps_reg_reg(ins, X64_XMM11, X64_XMM12);
x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM5, extend > 0 ? 8 : 0);
extend--;
x64_sse_mulps_reg_reg(ins, X64_XMM12, X64_XMM10);
@ -671,10 +671,10 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_addps_reg_reg(ins, X64_XMM1, X64_XMM15);
x64_sse_mulps_reg_reg(ins, X64_XMM11, X64_XMM3);
x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
x64_sse_addps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0);
x64_sse_subps_reg_reg_size(ins, X64_XMM5, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_shufps_reg_reg_imm(ins, X64_XMM10, X64_XMM10, 0xB1);
@ -684,7 +684,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_shufps_reg_reg_imm(ins, X64_XMM3, X64_XMM3, 0xB1);
x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM1, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_mulps_reg_reg(ins, X64_XMM10, X64_XMM0);
@ -713,7 +713,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_alu_reg_imm_size(ins, X86_ADD, X64_RAX, 4, 8);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM2, X64_XMM4, 0xEE);
x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM4, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg(ins, X64_XMM7, X64_XMM12);
@ -721,7 +721,7 @@ generate_leaf_ee(insns_t **fp, uint32_t *offsets, int extend)
x64_sse_movlhps_reg_reg(ins, X64_XMM4, X64_XMM7);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM1, X64_XMM7, 0xEE);
x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM5, extend > 0 ? 8 : 0);
extend--;
x64_sse_movlhps_reg_reg(ins, X64_XMM7, X64_XMM13);
@ -1157,28 +1157,28 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RDX, offsets[0], X64_RAX, 2);
x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RDX, offsets[1], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RDX, offsets[2], X64_RAX, 2);
x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RDX, offsets[3], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RDX, offsets[4], X64_RAX, 2);
x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM5, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RDX, offsets[5], X64_RAX, 2);
x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RDX, offsets[6], X64_RAX, 2);
@ -1206,7 +1206,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_movsxd_reg_memindex(ins, X64_R11, X64_R9, 8, X64_RAX, 2);
@ -1218,7 +1218,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);
@ -1257,28 +1257,28 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_memindex(ins, X64_XMM4, X64_RSI, offsets[0], X64_RAX, 2);
x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM6, X64_XMM4, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM7, X64_RSI, offsets[1], X64_RAX, 2);
x64_sse_movaps_reg_memindex(ins, X64_XMM10, X64_RSI, offsets[2], X64_RAX, 2);
x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0);
x64_sse_addps_reg_reg_size(ins, X64_XMM6, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0);
x64_sse_subps_reg_reg_size(ins, X64_XMM4, X64_XMM7, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM8, X64_RSI, offsets[3], X64_RAX, 2);
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM10);
x64_sse_movaps_reg_memindex(ins, X64_XMM1, X64_RSI, offsets[4], X64_RAX, 2);
x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM3, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM11, X64_RSI, offsets[5], X64_RAX, 2);
x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM2, X64_XMM1, extend > 0 ? 8 : 0);
extend--;
x64_sse_movaps_reg_memindex(ins, X64_XMM14, X64_RSI, offsets[6], X64_RAX, 2);
@ -1306,7 +1306,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_reg(ins, X64_XMM9, X64_XMM2);
x64_sse_shufps_reg_reg_imm(ins, X64_XMM14, X64_XMM14, 0xB1);
x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0);
x64_sse_movaps_reg_reg_size(ins, X64_XMM7, X64_XMM6, extend > 0 ? 8 : 0);
extend--;
x64_movsxd_reg_memindex(ins, X64_R12, X64_R8, 8, X64_RAX, 2);
@ -1318,7 +1318,7 @@ generate_leaf_oo(insns_t **fp, uint32_t loop_count, uint32_t *offsets, int exten
x64_sse_movaps_reg_reg(ins, X64_XMM13, X64_XMM1);
x64_sse_movaps_reg_reg(ins, X64_XMM8, X64_XMM2);
x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0);
x64_sse_movlhps_reg_reg_size(ins, X64_XMM7, X64_XMM4, extend > 0 ? 8 : 0);
extend--;
x64_sse_subps_reg_reg(ins, X64_XMM13, X64_XMM14);

@ -34,6 +34,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ffts.h"
#include "ffts_internal.h"
#include "ffts_chirp_z.h"
#include "ffts_static.h"
#include "ffts_trig.h"
#include "macros.h"
@ -76,7 +77,8 @@ static const FFTS_ALIGN(64) float w_data[16] = {
};
#endif
static FFTS_INLINE int ffts_allow_execute(void *start, size_t len)
static FFTS_INLINE int
ffts_allow_execute(void *start, size_t len)
{
int result;
@ -90,7 +92,8 @@ static FFTS_INLINE int ffts_allow_execute(void *start, size_t len)
return result;
}
static FFTS_INLINE int ffts_deny_execute(void *start, size_t len)
static FFTS_INLINE int
ffts_deny_execute(void *start, size_t len)
{
int result;
@ -104,7 +107,8 @@ static FFTS_INLINE int ffts_deny_execute(void *start, size_t len)
return result;
}
static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length)
static FFTS_INLINE int
ffts_flush_instruction_cache(void *start, size_t length)
{
#ifdef _WIN32
return !FlushInstructionCache(GetCurrentProcess(), start, length);
@ -124,7 +128,8 @@ static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length)
#endif
}
static FFTS_INLINE void *ffts_vmem_alloc(size_t length)
static FFTS_INLINE void*
ffts_vmem_alloc(size_t length)
{
#if __APPLE__
return mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
@ -139,7 +144,8 @@ static FFTS_INLINE void *ffts_vmem_alloc(size_t length)
#endif
}
static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length)
static FFTS_INLINE void
ffts_vmem_free(void *addr, size_t length)
{
#ifdef _WIN32
(void) length;
@ -174,7 +180,8 @@ ffts_free(ffts_plan_t *p)
}
}
void ffts_free_1d(ffts_plan_t *p)
static void
ffts_free_1d(ffts_plan_t *p)
{
#if !defined(DYNAMIC_DISABLED)
if (p->transform_base) {
@ -188,7 +195,7 @@ void ffts_free_1d(ffts_plan_t *p)
}
if (p->ws) {
FFTS_FREE(p->ws);
ffts_aligned_free(p->ws);
}
if (p->is) {
@ -233,7 +240,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f);
#endif
p->ws = FFTS_MALLOC(lut_size, 32);
p->ws = ffts_aligned_malloc(lut_size);
if (!p->ws) {
goto cleanup;
}
@ -253,7 +260,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
/* calculate factors */
m = leaf_N << (n_luts - 2);
tmp = FFTS_MALLOC(m * sizeof(ffts_cpx_32f), 32);
tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_32f));
ffts_generate_cosine_sine_pow2_32f(tmp, m);
@ -263,7 +270,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
p->ws_is[i] = w - (ffts_cpx_32f*) p->ws;
if (!i) {
ffts_cpx_32f *w0 = FFTS_MALLOC(n/4 * sizeof(ffts_cpx_32f), 32);
ffts_cpx_32f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_32f));
float *fw0 = (float*) w0;
float *fw = (float*) w;
@ -300,11 +307,11 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
w += n/4 * 2;
#endif
FFTS_FREE(w0);
ffts_aligned_free(w0);
} else {
ffts_cpx_32f *w0 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
ffts_cpx_32f *w1 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
ffts_cpx_32f *w2 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
ffts_cpx_32f *w0 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
ffts_cpx_32f *w1 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
ffts_cpx_32f *w2 = (ffts_cpx_32f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_32f));
float *fw0 = (float*) w0;
float *fw1 = (float*) w1;
@ -380,9 +387,9 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
w += n/8 * 3 * 2;
#endif
FFTS_FREE(w0);
FFTS_FREE(w1);
FFTS_FREE(w2);
ffts_aligned_free(w0);
ffts_aligned_free(w1);
ffts_aligned_free(w2);
}
n *= 2;
@ -401,7 +408,7 @@ ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
}
#endif
FFTS_FREE(tmp);
ffts_aligned_free(tmp);
p->lastlut = w;
p->n_luts = n_luts;
@ -411,18 +418,166 @@ cleanup:
return -1;
}
#ifdef FFTS_DOUBLE
/*
 * Build the double-precision twiddle lookup tables for plan `p`.
 *
 * p       plan to fill in; on success p->ws, p->ws_is, p->lastlut and
 *         p->n_luts are set.
 * N       transform size.
 * leaf_N  leaf size; the number of tables is ctz(N / leaf_N).
 * sign    negative selects one sign mask for the imaginary parts,
 *         non-negative the other.
 *
 * Returns 0 on success and -1 on allocation failure. On failure, buffers
 * already stored in the plan (p->ws, p->ws_is) are left in place for the
 * caller to release; all temporary buffers are freed here.
 *
 * NOTE(review): the shifts by (n_luts - 2) and (n_luts - 1) below are only
 * well-defined for n_luts >= 2. Confirm every caller guarantees that
 * N / leaf_N is a power of two >= 4.
 */
static int
ffts_generate_luts_64f(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
{
    V4DF MULI_SIGN;
    size_t n_luts;
    ffts_cpx_64f *w;
    ffts_cpx_64f *tmp;
    size_t i, j, m, n;
    int stride;

    /* sign mask XOR-ed into the duplicated imaginary parts */
    if (sign < 0) {
        MULI_SIGN = V4DF_LIT4(-0.0, 0.0, -0.0, 0.0);
    } else {
        MULI_SIGN = V4DF_LIT4(0.0, -0.0, 0.0, -0.0);
    }

    /* LUTS */
    n_luts = ffts_ctzl(N / leaf_N);
    if (n_luts >= 32) {
        n_luts = 0;
    }

    if (n_luts) {
        size_t lut_size;

        lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_64f);

        p->ws = ffts_aligned_malloc(lut_size);
        if (!p->ws) {
            goto cleanup;
        }

        p->ws_is = (size_t*) malloc(n_luts * sizeof(*p->ws_is));
        if (!p->ws_is) {
            goto cleanup;
        }
    }

    w = p->ws;
    n = leaf_N * 2;

    /* calculate factors */
    m = leaf_N << (n_luts - 2);
    tmp = ffts_aligned_malloc(m * sizeof(ffts_cpx_64f));
    if (!tmp) {
        /* previously dereferenced unchecked by the generator below */
        goto cleanup;
    }

    ffts_generate_cosine_sine_pow2_64f(tmp, m);

    /* generate lookup tables */
    stride = 1 << (n_luts - 1);
    for (i = 0; i < n_luts; i++) {
        /* offset of table i, in complex elements, from the start of p->ws */
        p->ws_is[i] = w - (ffts_cpx_64f*) p->ws;

        if (!i) {
            ffts_cpx_64f *w0 = ffts_aligned_malloc(n/4 * sizeof(ffts_cpx_64f));
            double *fw0 = (double*) w0;
            double *fw = (double*) w;

            if (!w0) {
                ffts_aligned_free(tmp);
                goto cleanup;
            }

            /* gather every stride-th factor */
            for (j = 0; j < n/4; j++) {
                w0[j][0] = tmp[j * stride][0];
                w0[j][1] = tmp[j * stride][1];
            }

            /* split each pair of factors into a real and a signed imaginary vector */
            for (j = 0; j < n/4; j += 2) {
                V4DF re, im, temp0;

                temp0 = V4DF_LD(fw0 + j*2);
                re = V4DF_DUPLICATE_RE(temp0);
                im = V4DF_DUPLICATE_IM(temp0);
                im = V4DF_XOR(im, MULI_SIGN);
                V4DF_ST(fw + j*4 + 0, re);
                V4DF_ST(fw + j*4 + 4, im);
            }

            w += n/4 * 2;
            ffts_aligned_free(w0);
        } else {
            ffts_cpx_64f *w0 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
            ffts_cpx_64f *w1 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
            ffts_cpx_64f *w2 = (ffts_cpx_64f*) ffts_aligned_malloc(n/8 * sizeof(ffts_cpx_64f));
            double *fw0 = (double*) w0;
            double *fw1 = (double*) w1;
            double *fw2 = (double*) w2;
            double *fw = (double*)w;

            if (!w0 || !w1 || !w2) {
                /* release whichever scratch buffers were obtained */
                if (w0) {
                    ffts_aligned_free(w0);
                }
                if (w1) {
                    ffts_aligned_free(w1);
                }
                if (w2) {
                    ffts_aligned_free(w2);
                }
                ffts_aligned_free(tmp);
                goto cleanup;
            }

            /* gather the three factor sequences used by this table */
            for (j = 0; j < n/8; j++) {
                w0[j][0] = tmp[2 * j * stride][0];
                w0[j][1] = tmp[2 * j * stride][1];
                w1[j][0] = tmp[j * stride][0];
                w1[j][1] = tmp[j * stride][1];
                w2[j][0] = tmp[(j + (n/8)) * stride][0];
                w2[j][1] = tmp[(j + (n/8)) * stride][1];
            }

            /* interleave re/im vectors of w0, w1 and w2 into the table */
            for (j = 0; j < n/8; j += 2) {
                V4DF temp0, temp1, temp2, re, im;

                temp0 = V4DF_LD(fw0 + j*2);
                re = V4DF_DUPLICATE_RE(temp0);
                im = V4DF_DUPLICATE_IM(temp0);
                im = V4DF_XOR(im, MULI_SIGN);
                V4DF_ST(fw + j*2*6+0, re);
                V4DF_ST(fw + j*2*6+4, im);

                temp1 = V4DF_LD(fw1 + j*2);
                re = V4DF_DUPLICATE_RE(temp1);
                im = V4DF_DUPLICATE_IM(temp1);
                im = V4DF_XOR(im, MULI_SIGN);
                V4DF_ST(fw + j*2*6+8 , re);
                V4DF_ST(fw + j*2*6+12, im);

                temp2 = V4DF_LD(fw2 + j*2);
                re = V4DF_DUPLICATE_RE(temp2);
                im = V4DF_DUPLICATE_IM(temp2);
                im = V4DF_XOR(im, MULI_SIGN);
                V4DF_ST(fw + j*2*6+16, re);
                V4DF_ST(fw + j*2*6+20, im);
            }

            w += n/8 * 3 * 2;
            ffts_aligned_free(w0);
            ffts_aligned_free(w1);
            ffts_aligned_free(w2);
        }

        /* next table: twice the size, half the stride */
        n *= 2;
        stride >>= 1;
    }

    ffts_aligned_free(tmp);

    p->lastlut = w;
    p->n_luts = n_luts;
    return 0;

cleanup:
    return -1;
}
#endif
FFTS_API ffts_plan_t*
ffts_init_1d(size_t N, int sign)
{
const size_t leaf_N = 8;
ffts_plan_t *p;
if (N < 2 || (N & (N - 1)) != 0) {
LOG("FFT size must be a power of two\n");
if (N < 2) {
LOG("FFT size must be greater than 1");
return NULL;
}
p = calloc(1, sizeof(*p));
/* check if size is not a power of two */
if (N & (N - 1)) {
return ffts_chirp_z_init(N, sign);
}
p = (ffts_plan_t*) calloc(1, sizeof(*p));
if (!p) {
return NULL;
}
@ -537,3 +692,98 @@ cleanup:
ffts_free_1d(p);
return NULL;
}
#ifdef FFTS_DOUBLE
/*
 * Create a plan for a 1-D double-precision complex FFT.
 *
 * N     transform size; must be a power of two and at least 2.
 * sign  transform direction. For N >= 32 and N == 16 any negative value
 *       selects the forward kernel and anything else the backward one.
 *       For N == 4 and N == 8 only -1 and 1 are recognised; any other
 *       value leaves p->transform as zeroed by calloc.
 *
 * Returns the new plan, or NULL if N is invalid or an allocation fails.
 */
FFTS_API ffts_plan_t*
ffts_init_1d_64f(size_t N, int sign)
{
    const size_t leaf_N = 8;
    ffts_plan_t *p;

    if (N < 2) {
        LOG("FFT size must be greater than 1");
        return NULL;
    }

    /*
     * Reject sizes that are not a power of two. Without this check they
     * would either reach the lookup-table generator with a table count
     * below 2 (its shift amounts are then invalid), or fall into the
     * 16-point kernel in the small-size switch below.
     */
    if (N & (N - 1)) {
        LOG("FFT size must be a power of two");
        return NULL;
    }

    p = (ffts_plan_t*) calloc(1, sizeof(*p));
    if (!p) {
        return NULL;
    }

    p->destroy = ffts_free_1d;
    p->N = N;

    if (N >= 32) {
        /* generate lookup tables */
        if (ffts_generate_luts_64f(p, N, leaf_N, sign)) {
            goto cleanup;
        }

        p->offsets = ffts_init_offsets(N, leaf_N);
        if (!p->offsets) {
            goto cleanup;
        }

        p->is = ffts_init_is(N, leaf_N, 1);
        if (!p->is) {
            goto cleanup;
        }

        /* counters derived from the number of leaves, N / leaf_N */
        p->i0 = N/leaf_N/3 + 1;
        p->i1 = p->i2 = N/leaf_N/3;
        if ((N/leaf_N) % 3 > 1) {
            p->i1++;
        }

        /* NOTE(review): presumably halved because the kernels handle two
           leaves per iteration -- confirm against the static transform */
        p->i0 /= 2;
        p->i1 /= 2;

        if (sign < 0) {
            p->transform = ffts_static_transform_f_64f;
        } else {
            p->transform = ffts_static_transform_i_64f;
        }
    } else {
        /* small sizes use dedicated kernels; N is one of 2, 4, 8, 16 here */
        switch (N) {
        case 2:
            p->transform = &ffts_small_2_64f;
            break;
        case 4:
            if (sign == -1) {
                p->transform = &ffts_small_forward4_64f;
            } else if (sign == 1) {
                p->transform = &ffts_small_backward4_64f;
            }
            break;
        case 8:
            if (sign == -1) {
                p->transform = &ffts_small_forward8_64f;
            } else if (sign == 1) {
                p->transform = &ffts_small_backward8_64f;
            }
            break;
        case 16:
        default:
            if (sign == -1) {
                p->transform = &ffts_small_forward16_64f;
            } else {
                p->transform = &ffts_small_backward16_64f;
            }
            break;
        }
    }

    return p;

cleanup:
    /* releases whatever was attached to the plan so far */
    ffts_free_1d(p);
    return NULL;
}
#else
/*
 * Placeholder used when FFTS_DOUBLE is not defined: double precision plans
 * are compiled out, so no plan is ever created and NULL is returned.
 */
FFTS_API ffts_plan_t*
ffts_init_1d_64f(size_t N, int sign)
{
    /* both arguments are intentionally ignored in this configuration */
    (void) N;
    (void) sign;

    return NULL;
}
#endif

@ -0,0 +1,225 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_chirp_z.h"
#include "ffts_internal.h"
#include "ffts_trig.h"
/*
* For more information on algorithms:
*
* L. I. Bluestein, A linear filtering approach to the computation of
* the discrete Fourier transform, 1968 NEREM Rec., pp. 218-219
*
* Lawrence R. Rabiner, Ronald W. Schafer, Charles M. Rader,
* The Chirp z-Transform Algorithm and Its Application
* Bell Sys. Tech. J., vol. 48, pp. 1249-1292, May 1969.
*
* Rick Lyons, Four Ways to Compute an Inverse FFT Using the Forward FFT Algorithm
* https://www.dsprelated.com/showarticle/800.php, July 7, 2015
*/
/* forward declarations */
static void
ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out);
static void
ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out);
/*
 * Destroy a chirp-z plan: release the B, A and scratch buffers, the inner
 * FFT plan, and finally the plan itself. The plans array is not freed
 * separately; ffts_chirp_z_init() places it in the same allocation as the
 * plan structure.
 */
static void
ffts_chirp_z_free(ffts_plan_t *p)
{
    ffts_plan_t *const inner = p->plans[0];

    if (p->B) {
        ffts_aligned_free(p->B);
    }

    if (p->A) {
        ffts_aligned_free(p->A);
    }

    if (p->buf) {
        ffts_aligned_free(p->buf);
    }

    if (inner) {
        ffts_free(inner);
    }

    free(p);
}
/*
 * Create a single precision chirp-z plan for a transform of size N.
 *
 * A negative sign selects the forward transform, any other value the
 * inverse one. Internally a forward power of two FFT of size M is created,
 * where M comes from ffts_next_power_of_2(2*N-1), and the pre-transformed
 * filter sequence is stored in p->B.
 *
 * Returns the new plan, or NULL if any allocation or the inner plan fails.
 */
ffts_plan_t*
ffts_chirp_z_init(size_t N, int sign)
{
    float *A, *B, reciprocal_M, *tmp;
    ffts_plan_t *p;
    size_t i, M;

    FFTS_ASSUME(N > 2);

    /* the plan and its one-element sub-plan array share one allocation */
    p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
    if (!p)
        return NULL;

    p->destroy = ffts_chirp_z_free;
    p->N = N;
    p->rank = 1;
    p->plans = (ffts_plan_t**) &p[1];

    if (sign < 0)
        p->transform = ffts_chirp_z_transform_f_32f;
    else
        p->transform = ffts_chirp_z_transform_i_32f;

    /* determine next power of two such that M >= 2*N-1 */
    M = ffts_next_power_of_2(2*N-1);

    p->plans[0] = ffts_init_1d(M, FFTS_FORWARD);
    if (!p->plans[0])
        goto cleanup;

    p->A = A = (float*) ffts_aligned_malloc(2 * N * sizeof(float));
    if (!p->A)
        goto cleanup;

    p->B = B = (float*) ffts_aligned_malloc(2 * M * sizeof(float));
    if (!p->B)
        goto cleanup;

    /* scratch space for two complex sequences of length M */
    p->buf = tmp = (float*) ffts_aligned_malloc(2 * 2 * M * sizeof(float));
    if (!p->buf)
        goto cleanup; /* tmp is written right below, so it must be valid */

    ffts_generate_chirp_32f((ffts_cpx_32f*) A, N);

    /* scale with reciprocal of length */
    reciprocal_M = 1.0f / M;
    tmp[0] = A[0] * reciprocal_M;
    tmp[1] = A[1] * reciprocal_M;

    /* entries 1..N-1 are mirrored to M-1..M-N+1 */
    for (i = 1; i < N; ++i) {
        tmp[2 * i + 0] = tmp[2 * (M - i) + 0] = A[2 * i + 0] * reciprocal_M;
        tmp[2 * i + 1] = tmp[2 * (M - i) + 1] = A[2 * i + 1] * reciprocal_M;
    }

    /* zero pad the gap between the two mirrored halves */
    for (; i <= M - N; ++i)
        tmp[2 * i] = tmp[2 * i + 1] = 0.0f;

    /* FFT */
    p->plans[0]->transform(p->plans[0], tmp, B);
    return p;

cleanup:
    ffts_chirp_z_free(p);
    return NULL;
}
/*
 * Forward chirp-z transform of p->N interleaved complex floats from `in`
 * to `out`, using the chirp sequence p->A, the transformed filter p->B and
 * the scratch buffer p->buf (two complex sequences of length M, where M is
 * the size of the inner power of two plan).
 */
static void
ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out)
{
    const float *chirp = FFTS_ASSUME_ALIGNED_32(p->A);
    const float *filter = FFTS_ASSUME_ALIGNED_32(p->B);
    size_t k, M = p->plans[0]->N, N = p->N;
    float *work = (float*) FFTS_ASSUME_ALIGNED_32(p->buf);
    float *spec = FFTS_ASSUME_ALIGNED_32(&work[2 * M]);
    const float *src = (const float*) in;
    float *dst = (float*) out;

    /* we know this */
    FFTS_ASSUME(M >= 8);

    /* work = src * conj(chirp) */
    for (k = 0; k < N; ++k) {
        const float xr = src[2 * k + 0];
        const float xi = src[2 * k + 1];
        const float cr = chirp[2 * k + 0];
        const float ci = chirp[2 * k + 1];

        work[2 * k + 0] = xr * cr + xi * ci;
        work[2 * k + 1] = xi * cr - xr * ci;
    }

    /* zero pad up to the inner transform length */
    for (; k < M; ++k) {
        work[2 * k + 1] = 0.0f;
        work[2 * k] = 0.0f;
    }

    /* convolution using FFT */
    p->plans[0]->transform(p->plans[0], work, spec);

    /* work = spec * filter, stored with real and imaginary parts swapped */
    for (k = 0; k < M; ++k) {
        const float sr = spec[2 * k + 0];
        const float si = spec[2 * k + 1];
        const float br = filter[2 * k + 0];
        const float bi = filter[2 * k + 1];

        work[2 * k + 0] = si * br + sr * bi;
        work[2 * k + 1] = sr * br - si * bi;
    }

    /* IFFT using FFT with real and imaginary parts swapped */
    p->plans[0]->transform(p->plans[0], work, spec);

    /* undo the swap while multiplying the first N results with conj(chirp) */
    for (k = 0; k < N; ++k) {
        const float sr = spec[2 * k + 0];
        const float si = spec[2 * k + 1];
        const float cr = chirp[2 * k + 0];
        const float ci = chirp[2 * k + 1];

        dst[2 * k + 0] = si * cr + sr * ci;
        dst[2 * k + 1] = sr * cr - si * ci;
    }
}
/* IFFT using FFT with real and imaginary parts swapped */
/*
 * Inverse chirp-z transform of p->N interleaved complex floats from `in`
 * to `out`. It reuses the forward inner FFT plan together with p->A, p->B
 * and the scratch buffer p->buf; real and imaginary parts are swapped on
 * the way in instead of on the way out.
 */
static void
ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out)
{
    const float *chirp = FFTS_ASSUME_ALIGNED_32(p->A);
    const float *filter = FFTS_ASSUME_ALIGNED_32(p->B);
    size_t k, M = p->plans[0]->N, N = p->N;
    float *work = (float*) FFTS_ASSUME_ALIGNED_32(p->buf);
    float *spec = FFTS_ASSUME_ALIGNED_32(&work[2 * M]);
    const float *src = (const float*) in;
    float *dst = (float*) out;

    /* we know this */
    FFTS_ASSUME(M >= 8);

    /* work = src * chirp, stored with real and imaginary parts swapped */
    for (k = 0; k < N; ++k) {
        const float xr = src[2 * k + 0];
        const float xi = src[2 * k + 1];
        const float cr = chirp[2 * k + 0];
        const float ci = chirp[2 * k + 1];

        work[2 * k + 0] = xi * cr + xr * ci;
        work[2 * k + 1] = xr * cr - xi * ci;
    }

    /* zero pad up to the inner transform length */
    for (; k < M; ++k) {
        work[2 * k + 1] = 0.0f;
        work[2 * k] = 0.0f;
    }

    /* convolution using FFT */
    p->plans[0]->transform(p->plans[0], work, spec);

    /* work = spec * filter, stored with real and imaginary parts swapped */
    for (k = 0; k < M; ++k) {
        const float sr = spec[2 * k + 0];
        const float si = spec[2 * k + 1];
        const float br = filter[2 * k + 0];
        const float bi = filter[2 * k + 1];

        work[2 * k + 0] = si * br + sr * bi;
        work[2 * k + 1] = sr * br - si * bi;
    }

    /* IFFT using FFT with real and imaginary parts swapped */
    p->plans[0]->transform(p->plans[0], work, spec);

    /* dst = spec * chirp for the first N results */
    for (k = 0; k < N; ++k) {
        const float sr = spec[2 * k + 0];
        const float si = spec[2 * k + 1];
        const float cr = chirp[2 * k + 0];
        const float ci = chirp[2 * k + 1];

        dst[2 * k + 0] = sr * cr - si * ci;
        dst[2 * k + 1] = si * cr + sr * ci;
    }
}

@ -0,0 +1,45 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef FFTS_CHIRP_Z_H
#define FFTS_CHIRP_Z_H
#if defined (_MSC_VER) && (_MSC_VER >= 1020)
#pragma once
#endif
#include "ffts.h"
/*
 * Create a single precision chirp-z plan for a transform of size N.
 * A negative sign selects the forward transform, any other value the
 * inverse one. Returns NULL when the plan cannot be created.
 */
ffts_plan_t*
ffts_chirp_z_init(size_t N, int sign);
#endif /* FFTS_CHIRP_Z_H */

@ -0,0 +1,371 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_cpu.h"
#if defined(FFTS_BUILDING_CPU_TEST)
#include <stdio.h>
#endif
#if defined(_WIN32)
#include <intrin.h>
#include <windows.h>
#endif
/* TODO: add detection/declaration of these to CMake phase */
#if !defined(FFTS_CPU_X64)
#if defined(_M_AMD64) || defined(__amd64) || defined(__amd64__) || defined(_M_X64) || defined(__x86_64) || defined(__x86_64__)
/* 64 bit x86 detected */
#define FFTS_CPU_X64
#endif
#endif
#if !defined(FFTS_CPU_X64) && !defined(FFTS_CPU_X86)
#if defined(i386) || defined(__i386) || defined(__i386__) || defined(_M_IX86) || defined(__X86__) || defined(_X86_)
/* 32 bit x86 detected */
#define FFTS_CPU_X86
#endif
#endif
/* check if build is 32 bit or 64 bit x86 */
#if defined(FFTS_CPU_X64) || defined(FFTS_CPU_X86)
/* Built and tested on
CentOS 6.8 2.6.32-642.11.1.el6.x86_64 - gcc version 4.4.7 20120313
Mac OSX 10.9 - Apple Clang 6.0
Ubuntu 14.04 LTS 4.2.0-42 x86_64 - gcc version 4.8.4
Windows XP SP3 - Visual Studio 2005 SP1 x86/x64
Windows Vista SP2 - Visual Studio 2010 SP1 x86/x64
Windows 7 Ultimate SP1 - Visual Studio 2015 x86/x64
Windows 7 Ultimate SP1 - gcc version 4.9.2 (i686-posix-dwarf-rev1)
Windows 7 Ultimate SP1 - gcc version 4.9.2 (x86_64-posix-seh-rev3)
Windows 10 Pro - Visual Studio 2017 x86/x64
*/
/* Visual Studio 2010 SP1 or newer have _xgetbv intrinsic */
#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219)
#define FFTS_HAVE_XGETBV
#endif
#ifndef BIT
/* argument is parenthesised so that expressions such as BIT(a | b) work */
#define BIT(n) (1u << (n))
#endif
/* bit masks */
/* cpuid leaf 1, tested against regs[3] */
#define FFTS_CPU_X86_SSE_BITS (BIT(0) | BIT(15) | BIT(23) | BIT(24) | BIT(25))
#define FFTS_CPU_X86_SSE2_BITS (BIT(26))
/* cpuid leaf 1, tested against regs[2] */
#define FFTS_CPU_X86_SSE3_BITS (BIT(0))
#define FFTS_CPU_X86_SSSE3_BITS (BIT(9))
#define FFTS_CPU_X86_SSE4_1_BITS (BIT(19))
#define FFTS_CPU_X86_SSE4_2_BITS (BIT(20) | BIT(23))
#define FFTS_CPU_X86_AVX_BITS (BIT(26) | BIT(27) | BIT(28))
/* XCR0 bits 1 and 2 (SSE and AVX state), i.e. the value 0x6 */
#define FFTS_CPU_X86_XCR0_BITS (BIT(1) | BIT(2))
/* cpuid leaf 7 subleaf 0, tested against regs[1] */
#define FFTS_CPU_X86_AVX2_BITS (BIT(5))
#define FFTS_CPU_X86_AVX512_BITS (BIT(16))
/* Visual Studio 2008 or older */
#if defined(FFTS_CPU_X64) && defined(_MSC_VER) && _MSC_VER <= 1500
#pragma optimize("", off)
/*
 * Substitute for __cpuidex, compiled only for x64 builds with Visual Studio
 * 2008 or older. Note the argument order: subleaf comes first on purpose.
 * The surrounding "#pragma optimize" lines keep optimization disabled for
 * this function, which the trick described below depends on.
 */
static void __fastcall ffts_cpuidex(int subleaf, int regs[4], int leaf)
{
/* x64 uses a four register fast-call calling convention by default and
arguments are passed in registers RCX, RDX, R8, and R9. By disabling
optimization and passing subleaf as first argument we get __cpuidex
*/
(void) subleaf;
__cpuid(regs, leaf);
}
#pragma optimize("", on)
#endif
/*
 * Execute the cpuid instruction for the given leaf and subleaf, and store
 * the resulting EAX, EBX, ECX and EDX values in regs[0..3] in that order.
 * On compilers that are neither MSVC nor GNU compatible all four entries
 * are set to zero.
 */
static FFTS_INLINE void ffts_cpuid(int regs[4], int leaf, int subleaf)
{
#if defined(_MSC_VER)
#if defined(FFTS_CPU_X64)
/* Visual Studio 2010 or newer */
#if _MSC_VER > 1500
__cpuidex(regs, leaf, subleaf);
#else
/* older compilers: local helper, note that subleaf is passed first */
ffts_cpuidex(subleaf, regs, leaf);
#endif
#else
/* 32 bit MSVC: inline assembly, results are stored through ESI */
__asm {
mov eax, leaf
mov ecx, subleaf
mov esi, regs
cpuid
mov [esi + 0x0], eax
mov [esi + 0x4], ebx
mov [esi + 0x8], ecx
mov [esi + 0xc], edx
}
#endif
#elif defined(__GNUC__) && __GNUC__
#if defined(FFTS_CPU_X64)
__asm__ __volatile__(
"cpuid\n\t"
: "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
: "a"(leaf), "c"(subleaf));
#elif defined(__PIC__)
/* EBX is exchanged with a scratch register around cpuid instead of being
   listed as an output; presumably because EBX is reserved for position
   independent code on 32 bit x86 - TODO confirm */
__asm__ __volatile__(
"xchgl %%ebx, %1\n\t"
"cpuid \n\t"
"xchgl %%ebx, %1\n\t"
: "=a"(regs[0]), "=r"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
: "a"(leaf), "c"(subleaf));
#else
__asm__ __volatile__(
"cpuid\n\t"
: "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
: "a"(leaf), "c"(subleaf));
#endif
#else
/* unknown compiler for x86 */
regs[0] = regs[1] = regs[2] = regs[3] = 0;
#endif
}
/* at least Visual Studio 2010 generates invalid optimized code for _xgetbv */
#if defined(FFTS_HAVE_XGETBV)
#pragma optimize("", off)
#endif
/*
 * Return the low 32 bits of extended control register XCR0, or 0 when the
 * value cannot be obtained. Depending on the compiler this uses the _xgetbv
 * intrinsic, the GetEnabledXStateFeatures function looked up in kernel32.dll
 * at run time, or the raw xgetbv opcode bytes (0x0f 0x01 0xd0).
 */
static FFTS_INLINE unsigned int ffts_get_xcr0(void)
{
#if defined(FFTS_HAVE_XGETBV)
return (unsigned int) _xgetbv(0);
#elif defined(_MSC_VER)
#if defined(FFTS_CPU_X64)
/* emulate xgetbv(0) on Windows 7 SP1 or newer */
typedef DWORD64 (WINAPI *PGETENABLEDXSTATEFEATURES)(VOID);
PGETENABLEDXSTATEFEATURES pfnGetEnabledXStateFeatures =
(PGETENABLEDXSTATEFEATURES) GetProcAddress(
GetModuleHandle(TEXT("kernel32.dll")), "GetEnabledXStateFeatures");
/* the function is resolved dynamically; report 0 when it is missing */
return pfnGetEnabledXStateFeatures ? (unsigned int) pfnGetEnabledXStateFeatures() : 0;
#else
/* note that we have to touch the edx register to tell the compiler it is used by the emitted xgetbv */
unsigned __int32 hi, lo;
__asm {
xor ecx, ecx
_emit 0x0f
_emit 0x01
_emit 0xd0
mov lo, eax
mov hi, edx
}
return (unsigned int) lo;
#endif
#elif defined(__GNUC__) && __GNUC__
/* same opcode bytes as above; ECX is set to 0 and EDX is declared clobbered */
unsigned int lo;
__asm__ __volatile__(".byte 0x0f, 0x01, 0xd0\n"
: "=a"(lo)
: "c"(0)
: "edx");
return lo;
#else
/* unknown x86 compiler */
return 0;
#endif
}
#if defined(FFTS_HAVE_XGETBV)
#pragma optimize("", on)
#endif
/*
 * Detect which x86 SIMD extensions can be used.
 *
 * Returns a combination of the FFTS_CPU_X86_* flags. The checks are
 * cumulative: detection stops at the first missing extension, so a flag is
 * only set when all previous ones are set as well. If extra_flags is not
 * NULL it receives the extra flags word (always 0 in this implementation).
 *
 * The result is cached in function-local static variables without any
 * synchronization; later calls return the cached value.
 */
int
ffts_cpu_detect(int *extra_flags)
{
    static int cpu_flags = -1;
    static int cpu_extra_flags = -1;
    int max_basic_func;
    int regs[4];
    unsigned int xcr0;

    /* detection already ran, reuse the cached result */
    if (cpu_flags >= 0) {
        goto exit;
    }

    /* initialize */
    cpu_flags = cpu_extra_flags = 0;

#if defined(FFTS_BUILDING_CPU_TEST)
    printf("cpuid check: ");
#endif

#if defined(FFTS_CPU_X64)
    /* cpuid is always supported on x64 */
#if defined(FFTS_BUILDING_CPU_TEST)
    printf("skipped\n");
#endif
#else
    /* try to toggle bit 21 of EFLAGS; regs[0] receives the value read back
       after the toggle, regs[1] the original value */
#if defined(_MSC_VER)
    _asm {
        pushfd
        pop eax
        mov ebx,eax
        xor eax,200000h
        push eax
        popfd
        pushfd
        pop eax
        push ebx
        popfd
        mov regs[0 * TYPE regs],eax
        mov regs[1 * TYPE regs],ebx
    }
#else
    __asm__ (
        "pushfl\n\t"
        "pop %0\n\t"
        "movl %0,%1\n\t"
        "xorl $0x200000,%0\n\t"
        "pushl %0\n\t"
        "popfl\n\t"
        "pushfl\n\t"
        "popl %0\n\t"
        "pushl %1\n\t"
        "popfl\n\t"
        : "=r" (regs[0]), "=r" (regs[1])
    );
#endif

    /* check CPUID bit (bit 21) in EFLAGS register can be toggled */
    if (((regs[0] ^ regs[1]) & 0x200000) == 0) {
#if defined(FFTS_BUILDING_CPU_TEST)
        printf("not supported\n");
#endif
        goto exit;
    }

#if defined(FFTS_BUILDING_CPU_TEST)
    printf("supported\n");
#endif
#endif

    /* get the number of basic functions */
    ffts_cpuid(regs, 0, 0);
    max_basic_func = regs[0];

#if defined(FFTS_BUILDING_CPU_TEST)
    printf("cpuid eax=0, ecx=0: %d\n", max_basic_func);
#endif

    if (max_basic_func == 0)
        goto exit;

    /* get feature flags */
    ffts_cpuid(regs, 1, 0);

#if defined(FFTS_BUILDING_CPU_TEST)
    printf("cpuid eax=1, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
#endif

#if defined(FFTS_CPU_X64)
    /* minimum for any x64 */
    cpu_flags = FFTS_CPU_X86_SSE | FFTS_CPU_X86_SSE2;
#else
    /* test if SSE is supported */
    if ((regs[3] & FFTS_CPU_X86_SSE_BITS) != FFTS_CPU_X86_SSE_BITS)
        goto exit;
    cpu_flags = FFTS_CPU_X86_SSE;

    /* test if SSE2 is supported */
    if (!(regs[3] & FFTS_CPU_X86_SSE2_BITS))
        goto exit;
    cpu_flags |= FFTS_CPU_X86_SSE2;
#endif

    /* test if SSE3 is supported */
    if (!(regs[2] & FFTS_CPU_X86_SSE3_BITS))
        goto exit;
    cpu_flags |= FFTS_CPU_X86_SSE3;

    /* test if SSSE3 is supported */
    if (!(regs[2] & FFTS_CPU_X86_SSSE3_BITS))
        goto exit;
    cpu_flags |= FFTS_CPU_X86_SSSE3;

    /* test if SSE4.1 is supported */
    if (!(regs[2] & FFTS_CPU_X86_SSE4_1_BITS))
        goto exit;
    cpu_flags |= FFTS_CPU_X86_SSE4_1;

    /* test if SSE4.2 is supported */
    if ((regs[2] & FFTS_CPU_X86_SSE4_2_BITS) != FFTS_CPU_X86_SSE4_2_BITS)
        goto exit;
    cpu_flags |= FFTS_CPU_X86_SSE4_2;

    /* test if AVX is supported */
    if ((regs[2] & FFTS_CPU_X86_AVX_BITS) != FFTS_CPU_X86_AVX_BITS)
        goto exit;

    /* test if 128-bit SSE and 256-bit AVX states are enabled in XCR0 */
    xcr0 = ffts_get_xcr0();

#if defined(FFTS_BUILDING_CPU_TEST)
    printf("xcr0: %u\n", xcr0);
#endif

    if ((xcr0 & 0x6) != 0x6)
        goto exit;
    cpu_flags |= FFTS_CPU_X86_AVX;

    /* check that cpuid extended features exist */
    if (max_basic_func < 7)
        goto exit;

    /* get extended features */
    ffts_cpuid(regs, 7, 0);

#if defined(FFTS_BUILDING_CPU_TEST)
    printf("cpuid eax=7, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
#endif

    /* test if AVX2 is supported */
    if ((regs[1] & FFTS_CPU_X86_AVX2_BITS) != FFTS_CPU_X86_AVX2_BITS)
        goto exit;
    cpu_flags |= FFTS_CPU_X86_AVX2;

    /* test if AVX512 is supported */
    if ((regs[1] & FFTS_CPU_X86_AVX512_BITS) != FFTS_CPU_X86_AVX512_BITS)
        goto exit;

    /* the CPU bit alone is not enough: the operating system must also have
       enabled the opmask, ZMM_Hi256 and Hi16_ZMM states (XCR0 bits 5-7) */
    if ((xcr0 & 0xe0) != 0xe0)
        goto exit;
    cpu_flags |= FFTS_CPU_X86_AVX512;

exit:
    if (extra_flags) {
        *extra_flags = cpu_extra_flags;
    }
    return cpu_flags;
}
#else
/*
 * Fallback used on non-x86 builds: CPU feature detection is not
 * implemented, so no feature flags are reported. If extra_flags is not
 * NULL it is set to 0, matching what the x86 implementation stores, so
 * callers never read an uninitialized value.
 */
int
ffts_cpu_detect(int *extra_flags)
{
    /* not implemented */
#if defined(FFTS_BUILDING_CPU_TEST)
    printf("CPU detection not implemented!!\n");
#endif

    if (extra_flags) {
        *extra_flags = 0;
    }

    return 0;
}
#endif

@ -0,0 +1,54 @@
/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef FFTS_CPU_H
#define FFTS_CPU_H /* without this the #ifndef above never guards anything */
#if defined (_MSC_VER) && (_MSC_VER >= 1020)
#pragma once
#endif
#include "ffts_internal.h"
/* bit flags returned by ffts_cpu_detect() */
#define FFTS_CPU_X86_SSE 0x001
#define FFTS_CPU_X86_SSE2 0x002
#define FFTS_CPU_X86_SSE3 0x004
#define FFTS_CPU_X86_SSSE3 0x008
#define FFTS_CPU_X86_SSE4_1 0x010
#define FFTS_CPU_X86_SSE4_2 0x020
#define FFTS_CPU_X86_AVX 0x040
#define FFTS_CPU_X86_AVX2 0x080
#define FFTS_CPU_X86_AVX512 0x100
/*
 * Detect the supported CPU features and return them as a combination of
 * the flags above. If extra_flags is not NULL it receives additional flags.
 */
int
ffts_cpu_detect(int *extra_flags);
#endif /* FFTS_CPU_H */

@ -2,6 +2,7 @@
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2015-2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
@ -34,7 +35,10 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef FFTS_INTERNAL_H
#define FFTS_INTERNAL_H
#ifdef AUTOTOOLS_BUILD
#include "config.h"
#endif
#include "ffts_attributes.h"
#include "types.h"
@ -42,18 +46,59 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <malloc.h>
#endif
/* The build system's header check for mm_malloc.h defines HAVE_MM_MALLOC_H.
   The previously tested spelling HAVE_MM_ALLOC_H is still accepted so that
   builds which define it by hand keep working. */
#if defined(HAVE_MM_MALLOC_H) || defined(HAVE_MM_ALLOC_H)
#include <mm_malloc.h>
#ifndef HAVE__MM_MALLOC
#define HAVE__MM_MALLOC
#endif
#endif
#include <stddef.h>
#ifdef HAVE_STDINT_H
#ifdef HAVE_INTTYPES_H
#include <inttypes.h>
#elif HAVE_STDINT_H
#include <stdint.h>
#elif _MSC_VER
typedef __int32 int32_t;
typedef __int64 int64_t;
typedef unsigned __int32 uint32_t;
typedef unsigned __int64 uint64_t;
#else
typedef signed long int int32_t;
typedef unsigned long int uint32_t;
typedef signed long long int int64_t;
typedef unsigned long long int uint64_t;
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef HAVE_STRING_H
#include <string.h>
#endif
#include <stdio.h>
#if defined(HAVE_DECL_MEMALIGN) && !HAVE_DECL_MEMALIGN
extern void *memalign(size_t, size_t);
#endif
#if defined(HAVE_DECL_POSIX_MEMALIGN) && !HAVE_DECL_POSIX_MEMALIGN
extern int posix_memalign(void **, size_t, size_t);
#endif
#if defined(HAVE_DECL_VALLOC) && !HAVE_DECL_VALLOC
extern void *valloc(size_t);
#endif
#ifdef _mm_malloc
#ifndef HAVE__MM_MALLOC
#define HAVE__MM_MALLOC
#endif
#endif
#ifdef ENABLE_LOG
#ifdef __ANDROID__
#include <android/log.h>
@ -142,11 +187,9 @@ struct _ffts_plan_t {
*/
size_t transform_size;
/**
* Points to the cosnant variables used by
* the Assembly Code
*/
void *constants;
/* pointer to the constant variable used by SSE for sign change */
/* TODO: #ifdef HAVE_SSE */
const void *constants;
// multi-dimensional stuff:
struct _ffts_plan_t **plans;
@ -171,44 +214,96 @@ struct _ffts_plan_t {
size_t i2;
};
static FFTS_INLINE void *ffts_aligned_malloc(size_t size)
static FFTS_INLINE void*
ffts_aligned_malloc(size_t size)
{
#if defined(_WIN32)
return _aligned_malloc(size, 32);
void *p = NULL;
/* various ways to allocate aligned memory in order of preference */
#if defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
p = (void*) _mm_malloc(size, 32);
#elif defined(HAVE_POSIX_MEMALIGN)
if (posix_memalign(&p, 32, size))
p = NULL;
#elif defined(HAVE_MEMALIGN)
p = memalign(32, size);
#elif defined(__ALTIVEC__)
p = vec_malloc(size);
#elif defined(_MSC_VER) || defined(WIN32)
p = _aligned_malloc(size, 32);
#elif defined(HAVE_VALLOC)
p = valloc(size);
#else
return valloc(size);
p = malloc(size);
#endif
return p;
}
static FFTS_INLINE void ffts_aligned_free(void *p)
static FFTS_INLINE
void ffts_aligned_free(void *p)
{
#if defined(_WIN32)
/* order must match with ffts_aligned_malloc */
#if defined(__ICC) || defined(__INTEL_COMPILER) || defined(HAVE__MM_MALLOC)
_mm_free(p);
#elif defined(HAVE_POSIX_MEMALIGN) || defined(HAVE_MEMALIGN)
free(p);
#elif defined(__ALTIVEC__)
vec_free(p);
#elif defined(_MSC_VER) || defined(WIN32)
_aligned_free(p);
#else
/* valloc or malloc */
free(p);
#endif
}
#if GCC_VERSION_AT_LEAST(3,3)
#define ffts_ctzl __builtin_ctzl
/*
 * Return the smallest power of two that is strictly greater than N
 * (so 5 -> 8 and 8 -> 16).
 *
 * N must not be zero, because __builtin_clzl(0) is undefined, and its
 * highest set bit must be below the top bit of unsigned long so that the
 * result is representable.
 */
static FFTS_INLINE size_t
ffts_next_power_of_2(size_t N)
{
    /* __builtin_clzl counts leading zeros of an unsigned long, which is
       64 bits wide on LP64 targets; a hard-coded 32 gives a negative
       shift count there */
    const unsigned int width = (unsigned int) (8 * sizeof(unsigned long));

    return (size_t) 1 << (width - (unsigned int) __builtin_clzl(N));
}
#elif defined(_MSC_VER)
#include <intrin.h>
#ifdef _M_X64
#pragma intrinsic(_BitScanForward64)
static __inline unsigned long ffts_ctzl(size_t N)
static FFTS_INLINE unsigned long
ffts_ctzl(size_t N)
{
unsigned long count;
_BitScanForward64((unsigned long*) &count, N);
return count;
}
#pragma intrinsic(_BitScanReverse64)
/*
 * 64 bit MSVC variant: returns two to the power of (index of the highest
 * set bit of N, plus one). The return value of _BitScanReverse64 is not
 * checked, so N is expected to be non-zero.
 */
static FFTS_INLINE size_t
ffts_next_power_of_2(size_t N)
{
    unsigned long msb_index;

    _BitScanReverse64(&msb_index, N);

    return 1ULL << (msb_index + 1);
}
#else
#pragma intrinsic(_BitScanForward)
static __inline unsigned long ffts_ctzl(size_t N)
static FFTS_INLINE unsigned long
ffts_ctzl(size_t N)
{
unsigned long count;
_BitScanForward((unsigned long*) &count, N);
return count;
}
#pragma intrinsic(_BitScanReverse)
/*
 * 32 bit MSVC variant: returns two to the power of (index of the highest
 * set bit of N, plus one). The return value of _BitScanReverse is not
 * checked, so N is expected to be non-zero.
 */
static FFTS_INLINE size_t
ffts_next_power_of_2(size_t N)
{
    unsigned long msb_index;

    _BitScanReverse(&msb_index, N);

    return 1 << (msb_index + 1);
}
#endif /* _WIN64 */
#endif /* _MSC_VER */

@ -4,7 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
Copyright (c) 2015 - 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@ -33,6 +33,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include "ffts_real.h"
#include "ffts_cpu.h"
#include "ffts_internal.h"
#include "ffts_trig.h"
@ -46,7 +47,8 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <pmmintrin.h>
#elif HAVE_INTRIN_H
#include <intrin.h>
#else
#endif
/* avoid using negative zero as some configurations have problems with those */
static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = {
0x80000000, 0, 0x80000000, 0
@ -55,7 +57,6 @@ static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = {
0, 0x80000000, 0, 0x80000000
};
#endif
#endif
static void
ffts_free_1d_real(ffts_plan_t *p)
@ -79,8 +80,9 @@ ffts_free_1d_real(ffts_plan_t *p)
free(p);
}
#ifdef __ARM_NEON__
static void
ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
ffts_execute_1d_real_neon(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT out =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
@ -91,25 +93,19 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
int i;
#ifdef __ARM_NEON__
float *p_buf0 = buf;
float *p_buf1 = buf + N - 2;
float *p_out = out;
#endif
int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
p->plans[0]->transform(p->plans[0], input, buf);
#ifndef HAVE_SSE
buf[N + 0] = buf[0];
buf[N + 1] = buf[1];
#endif
#ifdef __ARM_NEON__
for (i = 0; i < N; i += 4) {
__asm__ __volatile__ (
"vld1.32 {q8}, [%[pa]]!\n\t"
@ -151,7 +147,35 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#elif HAVE_SSE3
out[N + 0] = buf[0] - buf[1];
out[N + 1] = 0.0f;
}
#endif
#if HAVE_SSE3
static void
ffts_execute_1d_real_sse3(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT out =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
float *const FFTS_RESTRICT buf =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
const float *const FFTS_RESTRICT A =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
p->plans[0]->transform(p->plans[0], input, buf);
buf[N + 0] = buf[0];
buf[N + 1] = buf[1];
if (FFTS_UNLIKELY(N <= 8)) {
__m128 t0 = _mm_load_ps(buf);
__m128 t1 = _mm_load_ps(buf + N - 4);
@ -235,7 +259,32 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
}
}
#elif HAVE_SSE
out[N + 0] = buf[0] - buf[1];
out[N + 1] = 0.0f;
}
#endif
#ifdef HAVE_SSE
static void
ffts_execute_1d_real_sse(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT out =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
float *const FFTS_RESTRICT buf =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
const float *const FFTS_RESTRICT A =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
p->plans[0]->transform(p->plans[0], input, buf);
if (FFTS_UNLIKELY(N <= 8)) {
__m128 c0 = _mm_load_ps((const float*) sign_mask_even);
__m128 t0 = _mm_load_ps(buf);
@ -327,7 +376,34 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
_MM_SHUFFLE(2,3,0,1)))));
}
}
#else
out[N + 0] = buf[0] - buf[1];
out[N + 1] = 0.0f;
}
#endif
static void
ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT out =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
float *const FFTS_RESTRICT buf =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
const float *const FFTS_RESTRICT A =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
p->plans[0]->transform(p->plans[0], input, buf);
buf[N + 0] = buf[0];
buf[N + 1] = buf[1];
for (i = 0; i < N/2; i++) {
out[2*i + 0] =
buf[ 2*i + 0] * A[2*i + 0] - buf[ 2*i + 1] * A[2*i + 1] +
@ -336,14 +412,14 @@ ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
buf[ 2*i + 1] * A[2*i + 0] + buf[ 2*i + 0] * A[2*i + 1] +
buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
}
#endif
out[N + 0] = buf[0] - buf[1];
out[N + 1] = 0.0f;
}
#ifdef __ARM_NEON__
static void
ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
ffts_execute_1d_real_inv_neon(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT in =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
@ -354,18 +430,14 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
int i;
#ifdef __ARM_NEON__
float *p_buf0 = in;
float *p_buf1 = in + N - 2;
float *p_out = buf;
#endif
int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
#ifdef __ARM_NEON__
for (i = 0; i < N/2; i += 2) {
__asm__ __volatile__ (
"vld1.32 {q8}, [%[pa]]!\n\t"
@ -407,7 +479,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
#elif HAVE_SSE3
p->plans[0]->transform(p->plans[0], buf, output);
}
#endif
#if HAVE_SSE3
static void
ffts_execute_1d_real_inv_sse3(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT in =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
float *const FFTS_RESTRICT buf =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
const float *const FFTS_RESTRICT A =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
if (FFTS_UNLIKELY(N <= 8)) {
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
__m128 t1 = _mm_load_ps(in);
@ -492,7 +586,29 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
_mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
}
}
#elif HAVE_SSE
p->plans[0]->transform(p->plans[0], buf, output);
}
#endif
#if HAVE_SSE
static void
ffts_execute_1d_real_inv_sse(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT in =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
float *const FFTS_RESTRICT buf =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
const float *const FFTS_RESTRICT A =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
if (FFTS_UNLIKELY(N <= 8)) {
__m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
__m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
@ -585,7 +701,28 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
_mm_xor_ps(t4, c0))));
}
}
#else
p->plans[0]->transform(p->plans[0], buf, output);
}
#endif
static void
ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
{
float *const FFTS_RESTRICT in =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
float *const FFTS_RESTRICT buf =
(float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
const float *const FFTS_RESTRICT A =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
const float *const FFTS_RESTRICT B =
(const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
const int N = (const int) p->N;
int i;
/* we know this */
FFTS_ASSUME(N/2 > 0);
for (i = 0; i < N/2; i++) {
buf[2*i + 0] =
in[ 2*i + 0] * A[2*i + 0] + in[ 2*i + 1] * A[2*i + 1] +
@ -594,7 +731,6 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
in[ 2*i + 1] * A[2*i + 0] - in[ 2*i + 0] * A[2*i + 1] -
in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
}
#endif
p->plans[0]->transform(p->plans[0], buf, output);
}
@ -602,18 +738,35 @@ ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
FFTS_API ffts_plan_t*
ffts_init_1d_real(size_t N, int sign)
{
#ifndef __ARM_NEON__
int cpu_flags = ffts_cpu_detect(NULL);
#endif
ffts_plan_t *p;
int invert = 0;
p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
if (!p) {
return NULL;
}
if (sign < 0) {
p->transform = &ffts_execute_1d_real;
} else {
p->transform = &ffts_execute_1d_real_inv;
#ifdef __ARM_NEON__
p->transform = (sign < 0) ? &ffts_execute_1d_real_neon : &ffts_execute_1d_real_inv;
#else
#ifdef HAVE_SSE3
if (cpu_flags & FFTS_CPU_X86_SSE3) {
p->transform = (sign < 0) ? &ffts_execute_1d_real_sse3 : &ffts_execute_1d_real_inv_sse3;
invert = 1;
} else
#endif
#ifdef HAVE_SSE
if (cpu_flags & FFTS_CPU_X86_SSE) {
p->transform = (sign < 0) ? &ffts_execute_1d_real_sse : &ffts_execute_1d_real_inv_sse;
} else
#endif
{
p->transform = (sign < 0) ? &ffts_execute_1d_real : &ffts_execute_1d_real_inv;
}
#endif
p->destroy = &ffts_free_1d_real;
p->N = N;
@ -640,12 +793,7 @@ ffts_init_1d_real(size_t N, int sign)
goto cleanup;
}
#ifdef HAVE_SSE3
ffts_generate_table_1d_real_32f(p, sign, 1);
#else
ffts_generate_table_1d_real_32f(p, sign, 0);
#endif
ffts_generate_table_1d_real_32f(p, sign, invert);
return p;
cleanup:

@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@ -258,6 +259,29 @@ static const FFTS_ALIGN(16) double ffts_constants_inv_64f[16] = {
-0.7071067811865475244008443621048490392848359376884740
};
#ifdef FFTS_DOUBLE
/*
 * Combine four V4DF registers in place.
 *
 * With s = *r2 + *r3 and d = V4DF_IMULI(inv, *r2 - *r3) the results are
 *   *r0 = *r0 + s,   *r2 = *r0 - s,
 *   *r1 = *r1 - d,   *r3 = *r1 + d,
 * where every right-hand side uses the values held on entry.
 * inv is passed through unchanged to V4DF_IMULI.
 */
static FFTS_INLINE void
V4DF_K_0(int inv,
         V4DF *r0,
         V4DF *r1,
         V4DF *r2,
         V4DF *r3)
{
    /* snapshot the inputs before any output is written */
    const V4DF head0 = *r0;
    const V4DF head1 = *r1;
    const V4DF tail_sum = V4DF_ADD(*r2, *r3);
    const V4DF tail_rot = V4DF_IMULI(inv, V4DF_SUB(*r2, *r3));

    *r0 = V4DF_ADD(head0, tail_sum);
    *r2 = V4DF_SUB(head0, tail_sum);
    *r1 = V4DF_SUB(head1, tail_rot);
    *r3 = V4DF_ADD(head1, tail_rot);
}
#endif
static FFTS_INLINE void
V4SF_K_0(int inv,
V4SF *r0,
@ -279,6 +303,31 @@ V4SF_K_0(int inv,
*r3 = V4SF_ADD(t1, t3);
}
#ifdef FFTS_DOUBLE
/*
 * Load four V4DF values from i0..i3 and form pairwise sums/differences:
 *   *r0 = i0 + i1,  *r1 = i0 - i1,
 *   *r2 = i2 + i3,  *r3 = i2 - i3.
 */
static FFTS_INLINE void
V4DF_L_2(const double *FFTS_RESTRICT i0,
         const double *FFTS_RESTRICT i1,
         const double *FFTS_RESTRICT i2,
         const double *FFTS_RESTRICT i3,
         V4DF *r0,
         V4DF *r1,
         V4DF *r2,
         V4DF *r3)
{
    /* all four loads happen before the first store, as in the original */
    const V4DF a = V4DF_LD(i0);
    const V4DF b = V4DF_LD(i1);
    const V4DF c = V4DF_LD(i2);
    const V4DF d = V4DF_LD(i3);

    *r0 = V4DF_ADD(a, b);
    *r1 = V4DF_SUB(a, b);
    *r2 = V4DF_ADD(c, d);
    *r3 = V4DF_SUB(c, d);
}
#endif
static FFTS_INLINE void
V4SF_L_2(const float *FFTS_RESTRICT i0,
const float *FFTS_RESTRICT i1,
@ -302,6 +351,37 @@ V4SF_L_2(const float *FFTS_RESTRICT i0,
*r3 = V4SF_SUB(t2, t3);
}
#ifdef FFTS_DOUBLE
/*
 * Load four V4DF values from i0..i3 and combine them in two stages.
 *
 * Stage 1: sums/differences of (i0, i1) and (i2, i3); the (i2 - i3)
 *          difference is additionally passed through V4DF_IMULI(inv, .).
 * Stage 2: *r0 / *r2 receive the sum / difference of the two stage-1 sums,
 *          *r1 / *r3 receive the difference / sum of the (i0 - i1) term and
 *          the rotated (i2 - i3) term.
 */
static FFTS_INLINE void
V4DF_L_4(int inv,
         const double *FFTS_RESTRICT i0,
         const double *FFTS_RESTRICT i1,
         const double *FFTS_RESTRICT i2,
         const double *FFTS_RESTRICT i3,
         V4DF *r0,
         V4DF *r1,
         V4DF *r2,
         V4DF *r3)
{
    const V4DF a = V4DF_LD(i0);
    const V4DF b = V4DF_LD(i1);
    const V4DF c = V4DF_LD(i2);
    const V4DF d = V4DF_LD(i3);

    /* stage 1 */
    const V4DF sum_ab = V4DF_ADD(a, b);
    const V4DF dif_ab = V4DF_SUB(a, b);
    const V4DF sum_cd = V4DF_ADD(c, d);
    const V4DF rot_cd = V4DF_IMULI(inv, V4DF_SUB(c, d));

    /* stage 2 (store order kept: r0, r2, r1, r3) */
    *r0 = V4DF_ADD(sum_ab, sum_cd);
    *r2 = V4DF_SUB(sum_ab, sum_cd);
    *r1 = V4DF_SUB(dif_ab, rot_cd);
    *r3 = V4DF_ADD(dif_ab, rot_cd);
}
#endif
static FFTS_INLINE void
V4SF_L_4(int inv,
const float *FFTS_RESTRICT i0,
@ -331,6 +411,36 @@ V4SF_L_4(int inv,
*r3 = V4SF_ADD(t5, t7);
}
#ifdef FFTS_DOUBLE
/*
 * Leaf step: reads eight inputs at in + is[0..7] and writes results to the
 * two output areas out + os[0] and out + os[1] (offsets 0, 4, 8, 12 in each).
 *
 * inv selects the inverse constant table (ffts_constants_inv_64f) instead of
 * the forward one (ffts_constants_64f) and is forwarded to the kernels.
 */
static FFTS_INLINE void
V4DF_LEAF_EE(double *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
const double *FFTS_RESTRICT in,
const ptrdiff_t *FFTS_RESTRICT is,
int inv)
{
const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
V4DF r0, r1, r2, r3, r4, r5, r6, r7;
double *out0 = out + os[0];
double *out1 = out + os[1];
/* inputs is[0..3] go through V4DF_L_4, inputs is[4..7] through V4DF_L_2 */
V4DF_L_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
V4DF_L_2(in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7);
/* even-numbered registers use the constant-free kernel, odd-numbered ones
   use V4DF_K_N with table entries LUT + 0 and LUT + 4 */
V4DF_K_0(inv, &r0, &r2, &r4, &r6);
V4DF_K_N(inv, V4DF_LD(LUT + 0), V4DF_LD(LUT + 4), &r1, &r3, &r5, &r7);
/* V4DF_TX2 is applied to each (even, odd) register pair before storing;
   NOTE(review): presumably a 2x2 transpose -- confirm against its definition */
V4DF_TX2(&r0, &r1);
V4DF_TX2(&r2, &r3);
V4DF_TX2(&r4, &r5);
V4DF_TX2(&r6, &r7);
/* even registers land in the first output area, odd ones in the second */
V4DF_S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
V4DF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
#endif
static FFTS_INLINE void
V4SF_LEAF_EE(float *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
@ -359,6 +469,36 @@ V4SF_LEAF_EE(float *const FFTS_RESTRICT out,
V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
#ifdef FFTS_DOUBLE
/*
 * Leaf step with the same kernel sequence as V4DF_LEAF_EE but a different
 * input selection: V4DF_L_4 reads is[6], is[7], is[4], is[5] and V4DF_L_2
 * reads is[0], is[1], is[3], is[2] (note the swapped last pair).
 *
 * Results are written to out + os[0] and out + os[1] at offsets 0, 4, 8, 12.
 * inv selects the inverse constant table and is forwarded to the kernels.
 */
static FFTS_INLINE void
V4DF_LEAF_EE2(double *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
const double *FFTS_RESTRICT in,
const ptrdiff_t *FFTS_RESTRICT is,
int inv)
{
const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
V4DF r0, r1, r2, r3, r4, r5, r6, r7;
double *out0 = out + os[0];
double *out1 = out + os[1];
/* permuted input order -- this is what distinguishes EE2 from EE */
V4DF_L_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r0, &r1, &r2, &r3);
V4DF_L_2(in + is[0], in + is[1], in + is[3], in + is[2], &r4, &r5, &r6, &r7);
/* even registers: constant-free kernel; odd registers: kernel with LUT + 0 / LUT + 4 */
V4DF_K_0(inv, &r0, &r2, &r4, &r6);
V4DF_K_N(inv, V4DF_LD(LUT + 0), V4DF_LD(LUT + 4), &r1, &r3, &r5, &r7);
V4DF_TX2(&r0, &r1);
V4DF_TX2(&r2, &r3);
V4DF_TX2(&r4, &r5);
V4DF_TX2(&r6, &r7);
/* even registers go to the first output area, odd ones to the second */
V4DF_S_4(r0, r2, r4, r6, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
V4DF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
#endif
static FFTS_INLINE void
V4SF_LEAF_EE2(float *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
@ -387,6 +527,30 @@ V4SF_LEAF_EE2(float *const FFTS_RESTRICT out,
V4SF_S_4(r1, r3, r5, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
#ifdef FFTS_DOUBLE
/*
 * Leaf step: loads is[0..3] via V4DF_L_4_4 and is[4..7] via V4DF_L_2_4,
 * stores r2, r3, r7, r6 directly to the second output area (out + os[1]),
 * then runs V4DF_K_N (table entries LUT + 8 and LUT + 12) on r0, r1, r4, r5
 * and stores those to the first output area (out + os[0]).
 *
 * inv selects the inverse constant table and is forwarded to the kernels.
 */
static FFTS_INLINE void
V4DF_LEAF_EO(double *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
const double *FFTS_RESTRICT in,
const ptrdiff_t *FFTS_RESTRICT is,
int inv)
{
const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
V4DF r0, r1, r2, r3, r4, r5, r6, r7;
double *out0 = out + os[0];
double *out1 = out + os[1];
V4DF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
V4DF_L_2_4(inv, in + is[4], in + is[5], in + is[6], in + is[7], &r4, &r5, &r6, &r7);
/* second area is written first; note r7 precedes r6 in the store order */
V4DF_S_4(r2, r3, r7, r6, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
/* only r0, r1, r4, r5 receive the extra table-driven kernel */
V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r0, &r1, &r4, &r5);
V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
}
#endif
static FFTS_INLINE void
V4SF_LEAF_EO(float *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
@ -409,6 +573,30 @@ V4SF_LEAF_EO(float *const FFTS_RESTRICT out,
V4SF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
}
#ifdef FFTS_DOUBLE
/*
 * Leaf step: loads is[0..3] via V4DF_L_4_2 and is[6], is[7], is[4], is[5]
 * via V4DF_L_4_4, stores r0, r1, r4, r5 directly to the first output area
 * (out + os[0]), then runs V4DF_K_N (table entries LUT + 8 and LUT + 12) on
 * r6, r7, r2, r3 and stores those to the second output area (out + os[1]).
 *
 * inv selects the inverse constant table and is forwarded to the kernels.
 */
static FFTS_INLINE void
V4DF_LEAF_OE(double *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
const double *FFTS_RESTRICT in,
const ptrdiff_t *FFTS_RESTRICT is,
int inv)
{
const double *FFTS_RESTRICT LUT = inv ? ffts_constants_inv_64f : ffts_constants_64f;
V4DF r0, r1, r2, r3, r4, r5, r6, r7;
double *out0 = out + os[0];
double *out1 = out + os[1];
V4DF_L_4_2(inv, in + is[0], in + is[1], in + is[2], in + is[3], &r0, &r1, &r2, &r3);
/* second load uses the permuted input order 6, 7, 4, 5 */
V4DF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &r4, &r5, &r6, &r7);
V4DF_S_4(r0, r1, r4, r5, out0 + 0, out0 + 4, out0 + 8, out0 + 12);
/* register order r6, r7, r2, r3 is used for both the kernel and the store */
V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r6, &r7, &r2, &r3);
V4DF_S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
#endif
static FFTS_INLINE void
V4SF_LEAF_OE(float *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
@ -431,6 +619,27 @@ V4SF_LEAF_OE(float *const FFTS_RESTRICT out,
V4SF_S_4(r6, r7, r2, r3, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
#ifdef FFTS_DOUBLE
/*
 * Leaf step built from two V4DF_L_4_4 loads and two V4DF_S_4 stores; no
 * constant table is involved.
 *
 * The first load reads in + is[0..3], the second reads in + is[6], is[7],
 * is[4], is[5]. The first two results of each load are stored to
 * out + os[0], the last two to out + os[1] (offsets 0, 4, 8, 12 in each).
 */
static FFTS_INLINE void
V4DF_LEAF_OO(double *const FFTS_RESTRICT out,
             const ptrdiff_t *FFTS_RESTRICT os,
             const double *FFTS_RESTRICT in,
             const ptrdiff_t *FFTS_RESTRICT is,
             int inv)
{
    V4DF a0, a1, a2, a3;    /* results of the first load  */
    V4DF b0, b1, b2, b3;    /* results of the second load */
    double *dst_lo = out + os[0];
    double *dst_hi = out + os[1];

    V4DF_L_4_4(inv, in + is[0], in + is[1], in + is[2], in + is[3], &a0, &a1, &a2, &a3);
    V4DF_L_4_4(inv, in + is[6], in + is[7], in + is[4], in + is[5], &b0, &b1, &b2, &b3);

    V4DF_S_4(a0, a1, b0, b1, dst_lo + 0, dst_lo + 4, dst_lo + 8, dst_lo + 12);
    V4DF_S_4(a2, a3, b2, b3, dst_hi + 0, dst_hi + 4, dst_hi + 8, dst_hi + 12);
}
#endif
static FFTS_INLINE void
V4SF_LEAF_OO(float *const FFTS_RESTRICT out,
const ptrdiff_t *FFTS_RESTRICT os,
@ -450,6 +659,34 @@ V4SF_LEAF_OO(float *const FFTS_RESTRICT out,
V4SF_S_4(r2, r3, r6, r7, out1 + 0, out1 + 4, out1 + 8, out1 + 12);
}
#ifdef FFTS_DOUBLE
/*
 * Apply V4DF_K_N across four equally spaced rows of data, in place.
 *
 * Runs N/8 iterations. Iteration k processes the V4DF values at
 * data + 4*k plus the row offsets 0, 2*N/4, 4*N/4 and 6*N/4, using the two
 * table entries at LUT + 8*k and LUT + 8*k + 4.
 */
static FFTS_INLINE void
V4DF_X_4(int inv,
         double *FFTS_RESTRICT data,
         size_t N,
         const double *FFTS_RESTRICT LUT)
{
    const size_t rounds = N/8;
    size_t k;

    for (k = 0; k < rounds; k++) {
        /* both cursors advance by a fixed step per iteration */
        double *const row = data + 4*k;
        const double *const tw = LUT + 8*k;

        V4DF r0 = V4DF_LD(row);
        V4DF r1 = V4DF_LD(row + 2*N/4);
        V4DF r2 = V4DF_LD(row + 4*N/4);
        V4DF r3 = V4DF_LD(row + 6*N/4);

        V4DF_K_N(inv, V4DF_LD(tw), V4DF_LD(tw + 4), &r0, &r1, &r2, &r3);

        V4DF_ST(row          , r0);
        V4DF_ST(row + 2*N/4, r1);
        V4DF_ST(row + 4*N/4, r2);
        V4DF_ST(row + 6*N/4, r3);
    }
}
#endif
static FFTS_INLINE void
V4SF_X_4(int inv,
float *FFTS_RESTRICT data,
@ -536,6 +773,68 @@ V4SF_X_8(int inv,
}
}
#ifdef FFTS_DOUBLE
/*
 * Apply three V4DF_K_N kernels across eight rows of data, in place.
 *
 * The eight row pointers start at data0 + k*N/4 for k = 0..7. The loop runs
 * N/16 times; each iteration consumes 24 doubles of LUT (six V4DF_LD reads at
 * offsets 0, 4, 8, 12, 16, 20) and advances every row pointer by 4 doubles.
 * inv is forwarded unchanged to V4DF_K_N.
 */
static FFTS_INLINE void
V4DF_X_8(int inv,
double *FFTS_RESTRICT data0,
size_t N,
const double *FFTS_RESTRICT LUT)
{
double *data1 = data0 + 1*N/4;
double *data2 = data0 + 2*N/4;
double *data3 = data0 + 3*N/4;
double *data4 = data0 + 4*N/4;
double *data5 = data0 + 5*N/4;
double *data6 = data0 + 6*N/4;
double *data7 = data0 + 7*N/4;
size_t i;
for (i = 0; i < N/16; i++) {
V4DF r0, r1, r2, r3, r4, r5, r6, r7;
/* first kernel: rows 0..3 */
r0 = V4DF_LD(data0);
r1 = V4DF_LD(data1);
r2 = V4DF_LD(data2);
r3 = V4DF_LD(data3);
V4DF_K_N(inv, V4DF_LD(LUT), V4DF_LD(LUT + 4), &r0, &r1, &r2, &r3);
/* second kernel: updated rows 0 and 2 combined with freshly loaded rows 4 and 6 */
r4 = V4DF_LD(data4);
r6 = V4DF_LD(data6);
V4DF_K_N(inv, V4DF_LD(LUT + 8), V4DF_LD(LUT + 12), &r0, &r2, &r4, &r6);
/* third kernel: updated rows 1 and 3 combined with freshly loaded rows 5 and 7 */
r5 = V4DF_LD(data5);
r7 = V4DF_LD(data7);
V4DF_K_N(inv, V4DF_LD(LUT + 16), V4DF_LD(LUT + 20), &r1, &r3, &r5, &r7);
LUT += 24;
/* write every row back and step each row pointer to the next V4DF */
V4DF_ST(data0, r0);
data0 += 4;
V4DF_ST(data1, r1);
data1 += 4;
V4DF_ST(data2, r2);
data2 += 4;
V4DF_ST(data3, r3);
data3 += 4;
V4DF_ST(data4, r4);
data4 += 4;
V4DF_ST(data5, r5);
data5 += 4;
V4DF_ST(data6, r6);
data6 += 4;
V4DF_ST(data7, r7);
data7 += 4;
}
}
#endif
static FFTS_INLINE void
ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out,
const float *FFTS_RESTRICT in,
@ -569,6 +868,41 @@ ffts_static_firstpass_odd_32f(float *const FFTS_RESTRICT out,
}
}
#ifdef FFTS_DOUBLE
/*
 * First pass over the input, used by the transform entry points when
 * ffts_ctzl(N) is odd.
 *
 * Leaf sequence: p->i0 x EE, p->i1 x OO, one OE, p->i1 x EE2.
 * After every leaf call the input cursor advances by 4 doubles and the
 * output-offset cursor (p->offsets) by 2 entries; p->is is used unchanged.
 */
static FFTS_INLINE void
ffts_static_firstpass_odd_64f(double *const FFTS_RESTRICT out,
                              const double *FFTS_RESTRICT in,
                              const ffts_plan_t *FFTS_RESTRICT p,
                              int inv)
{
    const ptrdiff_t *is = (const ptrdiff_t*) p->is;
    const ptrdiff_t *os = (const ptrdiff_t*) p->offsets;
    const size_t ee_count = p->i0;
    const size_t rep_count = p->i1;
    size_t k;

    for (k = 0; k < ee_count; k++, in += 4, os += 2) {
        V4DF_LEAF_EE(out, os, in, is, inv);
    }

    for (k = 0; k < rep_count; k++, in += 4, os += 2) {
        V4DF_LEAF_OO(out, os, in, is, inv);
    }

    /* single odd/even transition leaf */
    V4DF_LEAF_OE(out, os, in, is, inv);
    in += 4;
    os += 2;

    for (k = 0; k < rep_count; k++, in += 4, os += 2) {
        V4DF_LEAF_EE2(out, os, in, is, inv);
    }
}
#endif
void
ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out)
{
@ -789,23 +1123,23 @@ ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out)
V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
}
#ifdef FFTS_DOUBLE
void
ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out)
{
const double *FFTS_RESTRICT lut = ffts_constants_small_64f;
const double *din = (const double*) in;
double *dout = (double*) out;
// V4SF r0_1, r2_3, r4_5, r6_7;
// double *LUT8 = (double*) p->ws + p->ws_is[0];
V4DF r0_1, r2_3, r4_5, r6_7;
/* unreferenced parameter */
(void) p;
(void) din;
(void) dout;
#if MACROS_READY
L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
#endif
V4DF_L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
V4DF_K_N(0, V4DF_LD(lut), V4DF_LD(lut + 4), &r0_1, &r2_3, &r4_5, &r6_7);
V4DF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
}
#endif
void
ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out)
@ -823,24 +1157,23 @@ ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out)
V4SF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
}
#ifdef FFTS_DOUBLE
void
ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out)
{
const double *FFTS_RESTRICT lut = ffts_constants_small_inv_64f;
const double *din = (const double*) in;
double *dout = (double*) out;
// V4SF r0_1, r2_3, r4_5, r6_7;
// double *LUT8 = (double*) p->ws + p->ws_is[0];
(void) p;
(void) din;
(void) dout;
V4DF r0_1, r2_3, r4_5, r6_7;
/* unreferenced parameter */
(void) p;
#if MACROS_READY
L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
#endif
V4DF_L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
V4DF_K_N(1, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7);
V4DF_S_4(r0_1, r2_3, r4_5, r6_7, dout+0, dout+4, dout+8, dout+12);
}
#endif
void
ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out)
@ -862,27 +1195,27 @@ ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out)
V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
}
#ifdef FFTS_DOUBLE
void
ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out)
{
const double *FFTS_RESTRICT lut = ffts_constants_small_64f;
const double *din = (const double*) in;
double *dout = (double*) out;
// double *LUT8 = (double*) p->ws;
// V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
V4DF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
/* unreferenced parameter */
(void) p;
(void) din;
(void) dout;
#ifdef MACROS_READY
L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
K_N(0, VLD(LUT8+8), VLD(LUT8+12), &r0_1, &r4_5, &r8_9, &r12_13);
S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
K_N(0, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15);
S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
#endif
V4DF_L_4_4(0, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
V4DF_L_2_4(0, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
V4DF_K_N(0, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7);
V4DF_K_N(0, V4DF_LD(lut+8), V4DF_LD(lut+12), &r0_1, &r4_5, &r8_9, &r12_13);
V4DF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
V4DF_K_N(0, V4DF_LD(lut+16), V4DF_LD(lut+20), &r2_3, &r6_7, &r10_11, &r14_15);
V4DF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
}
#endif
void
ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out)
@ -904,27 +1237,27 @@ ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out)
V4SF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
}
#ifdef FFTS_DOUBLE
void
ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out)
{
const double *FFTS_RESTRICT lut = ffts_constants_small_inv_64f;
const double *din = (const double*) in;
double *dout = (double*) out;
// double *LUT8 = (double*) p->ws;
// V4SF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
V4DF r0_1, r2_3, r4_5, r6_7, r8_9, r10_11, r12_13, r14_15;
/* unreferenced parameter */
(void) p;
(void) din;
(void) dout;
#ifdef MACROS_READY
L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
K_N(1, VLD(LUT8+8), VLD(LUT8+12),&r0_1, &r4_5, &r8_9, &r12_13);
S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
K_N(1, VLD(LUT8+16), VLD(LUT8+20), &r2_3, &r6_7, &r10_11, &r14_15);
S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
#endif
V4DF_L_4_4(1, din+0, din+16, din+8, din+24, &r0_1, &r2_3, &r8_9, &r10_11);
V4DF_L_2_4(1, din+4, din+20, din+28, din+12, &r4_5, &r6_7, &r14_15, &r12_13);
V4DF_K_N(1, V4DF_LD(lut), V4DF_LD(lut+4), &r0_1, &r2_3, &r4_5, &r6_7);
V4DF_K_N(1, V4DF_LD(lut+8), V4DF_LD(lut+12), &r0_1, &r4_5, &r8_9, &r12_13);
V4DF_S_4(r0_1, r4_5, r8_9, r12_13, dout+0, dout+8, dout+16, dout+24);
V4DF_K_N(1, V4DF_LD(lut+16), V4DF_LD(lut+20), &r2_3, &r6_7, &r10_11, &r14_15);
V4DF_S_4(r2_3, r6_7, r10_11, r14_15, dout+4, dout+12, dout+20, dout+28);
}
#endif
static FFTS_INLINE void
ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out,
@ -959,6 +1292,41 @@ ffts_static_firstpass_even_32f(float *FFTS_RESTRICT out,
}
}
#ifdef FFTS_DOUBLE
/*
 * First pass over the input, used by the transform entry points when
 * ffts_ctzl(N) is even.
 *
 * Leaf sequence: p->i0 x EE, one EO, p->i1 x OO, p->i1 x EE2.
 * After every leaf call the input cursor advances by 4 doubles and the
 * output-offset cursor (p->offsets) by 2 entries; p->is is used unchanged.
 */
static FFTS_INLINE void
ffts_static_firstpass_even_64f(double *FFTS_RESTRICT out,
                               const double *FFTS_RESTRICT in,
                               const ffts_plan_t *FFTS_RESTRICT p,
                               int inv)
{
    const ptrdiff_t *is = (const ptrdiff_t*) p->is;
    const ptrdiff_t *os = (const ptrdiff_t*) p->offsets;
    const size_t ee_count = p->i0;
    const size_t rep_count = p->i1;
    size_t k;

    for (k = 0; k < ee_count; k++, in += 4, os += 2) {
        V4DF_LEAF_EE(out, os, in, is, inv);
    }

    /* single even/odd transition leaf */
    V4DF_LEAF_EO(out, os, in, is, inv);
    in += 4;
    os += 2;

    for (k = 0; k < rep_count; k++, in += 4, os += 2) {
        V4DF_LEAF_OO(out, os, in, is, inv);
    }

    for (k = 0; k < rep_count; k++, in += 4, os += 2) {
        V4DF_LEAF_EE2(out, os, in, is, inv);
    }
}
#endif
static void
ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
{
@ -1035,6 +1403,47 @@ ffts_static_rec_f_32f(const ffts_plan_t *p, float *data, size_t N)
#endif
}
#ifdef FFTS_DOUBLE
/*
 * Recursive combine stage, forward direction (every kernel is called with
 * inv = 0).
 *
 * p    - plan; only p->ws (table base) and p->ws_is (table offsets) are read.
 * data - buffer processed in place.
 * N    - size of the current sub-problem. Values above 128 recurse; 128, 64
 *        and 32 are handled by fixed sequences; any other value trips the
 *        assert in the final branch.
 */
static void
ffts_static_rec_f_64f(const ffts_plan_t *p, double *data, size_t N)
{
const double *ws = (const double*) p->ws;
if (N > 128) {
const size_t N1 = N >> 1;
const size_t N2 = N >> 2;
const size_t N3 = N >> 3;
/* five sub-problems: sizes N/4, N/8, N/8, N/4, N/4 at the offsets below */
ffts_static_rec_f_64f(p, data , N2);
ffts_static_rec_f_64f(p, data + N1 , N3);
ffts_static_rec_f_64f(p, data + N1 + N2, N3);
ffts_static_rec_f_64f(p, data + N , N2);
ffts_static_rec_f_64f(p, data + N + N1 , N2);
/* combine with the table selected by log2(N) - 4; the index is doubled
   (<< 1) to turn it into an offset in doubles */
V4DF_X_8(0, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1));
} else if (N == 128) {
const double *ws1 = ws + (p->ws_is[1] << 1);
/* same five-way split as above, written out with fixed sizes */
V4DF_X_8(0, data + 0, 32, ws1);
V4DF_X_4(0, data + 64, 16, ws);
V4DF_X_4(0, data + 96, 16, ws);
V4DF_X_8(0, data + 128, 32, ws1);
V4DF_X_8(0, data + 192, 32, ws1);
V4DF_X_8(0, data, 128, ws + (p->ws_is[3] << 1));
} else if (N == 64) {
V4DF_X_4(0, data + 0, 16, ws);
V4DF_X_4(0, data + 64, 16, ws);
V4DF_X_4(0, data + 96, 16, ws);
V4DF_X_8(0, data, 64, ws + (p->ws_is[2] << 1));
} else {
/* smallest supported size */
assert(N == 32);
V4DF_X_8(0, data, 32, ws + (p->ws_is[1] << 1));
}
}
#endif
static void
ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
{
@ -1111,6 +1520,47 @@ ffts_static_rec_i_32f(const ffts_plan_t *p, float *data, size_t N)
#endif
}
#ifdef FFTS_DOUBLE
/*
 * Recursive combine stage, inverse direction (every kernel is called with
 * inv = 1). Structure mirrors ffts_static_rec_f_64f.
 *
 * p    - plan; only p->ws (table base) and p->ws_is (table offsets) are read.
 * data - buffer processed in place.
 * N    - size of the current sub-problem. Values above 128 recurse; 128, 64
 *        and 32 are handled by fixed sequences; any other value trips the
 *        assert in the final branch.
 */
static void
ffts_static_rec_i_64f(const ffts_plan_t *p, double *data, size_t N)
{
const double *ws = (const double*) p->ws;
if (N > 128) {
const size_t N1 = N >> 1;
const size_t N2 = N >> 2;
const size_t N3 = N >> 3;
/* five sub-problems: sizes N/4, N/8, N/8, N/4, N/4 at the offsets below */
ffts_static_rec_i_64f(p, data , N2);
ffts_static_rec_i_64f(p, data + N1 , N3);
ffts_static_rec_i_64f(p, data + N1 + N2, N3);
ffts_static_rec_i_64f(p, data + N , N2);
ffts_static_rec_i_64f(p, data + N + N1 , N2);
/* combine with the table selected by log2(N) - 4; the index is doubled
   (<< 1) to turn it into an offset in doubles */
V4DF_X_8(1, data, N, ws + (p->ws_is[ffts_ctzl(N) - 4] << 1));
} else if (N == 128) {
const double *ws1 = ws + (p->ws_is[1] << 1);
/* same five-way split as above, written out with fixed sizes */
V4DF_X_8(1, data + 0, 32, ws1);
V4DF_X_4(1, data + 64, 16, ws);
V4DF_X_4(1, data + 96, 16, ws);
V4DF_X_8(1, data + 128, 32, ws1);
V4DF_X_8(1, data + 192, 32, ws1);
V4DF_X_8(1, data, 128, ws + (p->ws_is[3] << 1));
} else if (N == 64) {
V4DF_X_4(1, data + 0, 16, ws);
V4DF_X_4(1, data + 64, 16, ws);
V4DF_X_4(1, data + 96, 16, ws);
V4DF_X_8(1, data, 64, ws + (p->ws_is[2] << 1));
} else {
/* smallest supported size */
assert(N == 32);
V4DF_X_8(1, data, 32, ws + (p->ws_is[1] << 1));
}
}
#endif
void
ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
{
@ -1172,6 +1622,26 @@ ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out)
#endif
}
#ifdef FFTS_DOUBLE
/*
 * Forward double-precision transform driver.
 *
 * Picks the first-pass variant from the parity of ffts_ctzl(p->N) (odd or
 * even), then runs the recursive forward stage in place on the output.
 *
 * p   - plan supplying N and the tables read by the passes.
 * in  - input buffer, read as doubles.
 * out - output buffer, written as doubles.
 */
void
ffts_static_transform_f_64f(ffts_plan_t *p, const void *in, void *out)
{
    double *dst = (double*) out;
    const double *src = (const double*) in;
    const size_t N = p->N;
    const int log2_n = ffts_ctzl(N);

    if ((log2_n & 1) == 0) {
        ffts_static_firstpass_even_64f(dst, src, p, 0);
    } else {
        ffts_static_firstpass_odd_64f(dst, src, p, 0);
    }

    ffts_static_rec_f_64f(p, dst, N);
}
#endif
void
ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
{
@ -1231,4 +1701,24 @@ ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out)
ffts_static_rec_i_32f(p, dout, N);
#endif
}
}
#ifdef FFTS_DOUBLE
/*
 * Inverse double-precision transform driver.
 *
 * Picks the first-pass variant from the parity of ffts_ctzl(p->N) (odd or
 * even), calling it with inv = 1, then runs the recursive inverse stage in
 * place on the output.
 *
 * p   - plan supplying N and the tables read by the passes.
 * in  - input buffer, read as doubles.
 * out - output buffer, written as doubles.
 */
void
ffts_static_transform_i_64f(ffts_plan_t *p, const void *in, void *out)
{
    double *dst = (double*) out;
    const double *src = (const double*) in;
    const size_t N = p->N;
    const int log2_n = ffts_ctzl(N);

    if ((log2_n & 1) == 0) {
        ffts_static_firstpass_even_64f(dst, src, p, 1);
    } else {
        ffts_static_firstpass_odd_64f(dst, src, p, 1);
    }

    ffts_static_rec_i_64f(p, dst, N);
}
#endif

@ -43,49 +43,73 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/*
 * Entry points of the static (non-generated) transform code.
 *
 * Every function takes (plan, input buffer, output buffer). Each _32f
 * function has a _64f counterpart that is only declared when FFTS_DOUBLE is
 * defined.
 * NOTE(review): the numeric part of each name presumably is the transform
 * size and forward/backward (f/i) the direction -- confirm against the
 * definitions.
 */
void
ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out);
#ifdef FFTS_DOUBLE
void
ffts_small_2_64f(ffts_plan_t *p, const void *in, void *out);
#endif
/* size-4 variants */
void
ffts_small_forward4_32f(ffts_plan_t *p, const void *in, void *out);
#ifdef FFTS_DOUBLE
void
ffts_small_forward4_64f(ffts_plan_t *p, const void *in, void *out);
#endif
void
ffts_small_backward4_32f(ffts_plan_t *p, const void *in, void *out);
#ifdef FFTS_DOUBLE
void
ffts_small_backward4_64f(ffts_plan_t *p, const void *in, void *out);
#endif
/* size-8 variants */
void
ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out);
#ifdef FFTS_DOUBLE
void
ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out);
#endif
void
ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out);
#ifdef FFTS_DOUBLE
void
ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out);
#endif
/* size-16 variants */
void
ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out);
#ifdef FFTS_DOUBLE
void
ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out);
#endif
void
ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out);
#ifdef FFTS_DOUBLE
void
ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out);
#endif
/* general drivers: _f_ variants and _i_ variants */
void
ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out);
#ifdef FFTS_DOUBLE
void
ffts_static_transform_f_64f(ffts_plan_t *p, const void *in, void *out);
#endif
void
ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out);
#ifdef FFTS_DOUBLE
void
ffts_static_transform_i_64f(ffts_plan_t *p, const void *in, void *out);
#endif
#endif /* FFTS_STATIC_H */

File diff suppressed because it is too large Load Diff

@ -2,7 +2,7 @@
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
Copyright (c) 2015-2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@ -39,8 +39,16 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "ffts_internal.h"
/* calculate cos(pi * n / d) and sin(pi * n / d) with maximum error less than 1 ULP, average ~0.5 ULP */
int
ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, int table_size);
ffts_cexp_32f(size_t n, size_t d, float *output);
int
ffts_generate_chirp_32f(ffts_cpx_32f *const table, size_t table_size);
/* generate cosine and sine tables with maximum error less than 1 ULP, average ~0.5 ULP */
int
ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, size_t table_size);
int
ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size);

@ -58,9 +58,6 @@ typedef union {
uint32_t u[4];
} V4SF;
#define FFTS_MALLOC(d,a) (malloc(d))
#define FFTS_FREE(d) (free(d))
static FFTS_ALWAYS_INLINE V4SF
V4SF_LIT4(float f3, float f2, float f1, float f0)
{

@ -4,6 +4,7 @@
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2019, Timothy Pearson <tpearson@raptorengineering.com>
All rights reserved.
@ -39,99 +40,89 @@
#define restrict
typedef vector float V;
typedef vector float V4SF;
typedef vector unsigned char VUC;
#ifdef __apple__
#define FFTS_MALLOC(d,a) vec_malloc(d)
#define FFTS_FREE(d) vec_free(d)
#else
/* It appears vec_malloc() and friends are not implemented on Linux */
#include <malloc.h>
#define FFTS_MALLOC(d,a) memalign(16,d)
#define FFTS_FREE(d) free(d)
#endif
#define VLIT4(f0,f1,f2,f3) ((V){f0, f1, f2, f3})
#define V4SF_LIT4(f0,f1,f2,f3) ((V4SF){f0, f1, f2, f3})
#define VADD(x,y) vec_add(x,y)
#define VSUB(x,y) vec_sub(x,y)
#define VMUL(x,y) vec_madd(x,y,(V){0})
#define VMULADD(x,y,z) vec_madd(x,y,z)
#define VNMULSUB(x,y,z) vec_nmsub(x,y,z)
#define VXOR(x,y) vec_xor((x),(y))
#define VSWAPPAIRS(x) \
#define V4SF_ADD(x,y) vec_add(x,y)
#define V4SF_SUB(x,y) vec_sub(x,y)
#define V4SF_MUL(x,y) vec_madd(x,y,(V4SF){0})
#define V4SF_MULADD(x,y,z) vec_madd(x,y,z)
#define V4SF_NMULSUB(x,y,z) vec_nmsub(x,y,z)
#define V4SF_XOR(x,y) vec_xor((x),(y))
#define V4SF_SWAPPAIRS(x) \
vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x00,0x01,0x02,0x03, \
0x0c,0x0d,0x0e,0x0f,0x08,0x09,0x0a,0x0b})
#define VBLEND(x,y) \
#define V4SF_BLEND(x,y) \
vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
#define VUNPACKHI(x,y) \
#define V4SF_UNPACK_HI(x,y) \
vec_perm(x,y,(VUC){0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, \
0x18,0x19,0x1a,0x1b,0x1c,0x1d,0x1e,0x1f})
#define VUNPACKLO(x,y) \
#define V4SF_UNPACK_LO(x,y) \
vec_perm(x,y,(VUC){0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, \
0x10,0x11,0x12,0x13,0x14,0x15,0x16,0x17})
#define VDUPRE(x) \
#define V4SF_DUPLICATE_RE(x) \
vec_perm(x,x,(VUC){0x00,0x01,0x02,0x03,0x00,0x01,0x02,0x03, \
0x18,0x19,0x1a,0x1b,0x18,0x19,0x1a,0x1b})
#define VDUPIM(x) \
#define V4SF_DUPLICATE_IM(x) \
vec_perm(x,x,(VUC){0x04,0x05,0x06,0x07,0x04,0x05,0x06,0x07, \
0x1c,0x1d,0x1e,0x1f,0x1c,0x1d,0x1e,0x1f})
static inline V IMUL(V d, V re, V im)
static inline V4SF V4SF_IMUL(V4SF d, V4SF re, V4SF im)
{
im = VMUL(im, VSWAPPAIRS(d));
re = VMUL(re, d);
return VSUB(re, im);
im = V4SF_MUL(im, V4SF_SWAPPAIRS(d));
re = V4SF_MUL(re, d);
return V4SF_SUB(re, im);
}
static inline V IMULJ(V d, V re, V im)
static inline V4SF V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
{
im = VMUL(im, VSWAPPAIRS(d));
return VMULADD(re, d, im);
im = V4SF_MUL(im, V4SF_SWAPPAIRS(d));
return V4SF_MULADD(re, d, im);
}
#ifndef __GNUC__
/* gcc (4.6 and 4.7) ICEs on this code! */
static inline V MULI(int inv, V x)
static inline V4SF MULI(int inv, V4SF x)
{
return VXOR(x, inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f));
return V4SF_XOR(x, inv ? V4SF_LIT4(-0.0f,0.0f,-0.0f,0.0f) : V4SF_LIT4(0.0f,-0.0f,0.0f,-0.0f));
}
#else
/* but compiles this fine... */
static inline V MULI(int inv, V x)
static inline V4SF MULI(int inv, V4SF x)
{
V t;
t = inv ? VLIT4(-0.0f,0.0f,-0.0f,0.0f) : VLIT4(0.0f,-0.0f,0.0f,-0.0f);
return VXOR(x, t);
V4SF t;
t = inv ? V4SF_LIT4(-0.0f,0.0f,-0.0f,0.0f) : V4SF_LIT4(0.0f,-0.0f,0.0f,-0.0f);
return V4SF_XOR(x, t);
}
#endif
static inline V IMULI(int inv, V x)
static inline V4SF V4SF_IMULI(int inv, V4SF x)
{
return VSWAPPAIRS(MULI(inv, x));
return V4SF_SWAPPAIRS(MULI(inv, x));
}
static inline V VLD(const void *s)
static inline V4SF V4SF_LD(const void *s)
{
V *d = (V *)s;
V4SF *d = (V4SF *)s;
return *d;
}
static inline void VST(void *d, V s)
static inline void V4SF_ST(void *d, V4SF s)
{
V *r = (V *)d;
V4SF *r = (V4SF *)d;
*r = s;
}
#endif

@ -39,9 +39,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdlib.h>
#endif
#define FFTS_MALLOC(d,a) (valloc(d))
#define FFTS_FREE(d) (free(d))
typedef float32x4_t V4SF;
typedef float32x4x2_t V4SF2;

@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@ -40,9 +41,6 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <xmmintrin.h>
#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
#define FFTS_FREE(d) (_mm_free(d))
typedef __m128 V4SF;
#define V4SF_ADD _mm_add_ps
@ -56,8 +54,9 @@ typedef __m128 V4SF;
#define V4SF_SWAP_PAIRS(x) \
(_mm_shuffle_ps(x, x, _MM_SHUFFLE(2,3,0,1)))
/* note: order is swapped */
#define V4SF_UNPACK_HI(x,y) \
(_mm_shuffle_ps(x, y, _MM_SHUFFLE(3,2,3,2)))
(_mm_movehl_ps(y, x))
#define V4SF_UNPACK_LO(x,y) \
(_mm_movelh_ps(x, y))
@ -97,4 +96,220 @@ V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
return V4SF_ADD(re, im);
}
#ifdef FFTS_DOUBLE
/*
 * Plain-C stand-in for a vector of four doubles.
 *
 * r - the four lanes as named doubles, in memory order r1, i1, r2, i2.
 *     NOTE(review): the names suggest two complex values (re, im) -- this
 *     matches how V4DF_DUPLICATE_RE / V4DF_DUPLICATE_IM use them.
 * u - the same 32 bytes viewed as eight 32-bit words, used for bitwise
 *     operations such as V4DF_XOR.
 */
typedef union {
struct {
double r1;
double i1;
double r2;
double i2;
} r;
uint32_t u[8];
} V4DF;
/*
 * Build a V4DF from four literals.
 * Arguments are given highest lane first: f0 -> r1, f1 -> i1, f2 -> r2,
 * f3 -> i2.
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_LIT4(double f3, double f2, double f1, double f0)
{
    V4DF lit;

    /* filled from the last lane down to the first */
    lit.r.i2 = f3;
    lit.r.r2 = f2;
    lit.r.i1 = f1;
    lit.r.r1 = f0;

    return lit;
}
/* Lane-wise sum x + y of the four doubles. */
static FFTS_ALWAYS_INLINE V4DF
V4DF_ADD(V4DF x, V4DF y)
{
    /* x is a by-value copy, so it doubles as the accumulator */
    x.r.r1 += y.r.r1;
    x.r.i1 += y.r.i1;
    x.r.r2 += y.r.r2;
    x.r.i2 += y.r.i2;

    return x;
}
/* Lane-wise difference x - y of the four doubles. */
static FFTS_ALWAYS_INLINE V4DF
V4DF_SUB(V4DF x, V4DF y)
{
    /* x is a by-value copy, so it doubles as the accumulator */
    x.r.r1 -= y.r.r1;
    x.r.i1 -= y.r.i1;
    x.r.r2 -= y.r.r2;
    x.r.i2 -= y.r.i2;

    return x;
}
/*
 * Field-wise product: each of r1, i1, r2, i2 is x's field times y's field.
 * This is a plain per-field multiply, not a complex multiplication.
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_MUL(V4DF x, V4DF y)
{
    V4DF prod;

    /* first pair */
    prod.r.r1 = x.r.r1 * y.r.r1;
    prod.r.i1 = x.r.i1 * y.r.i1;

    /* second pair */
    prod.r.r2 = x.r.r2 * y.r.r2;
    prod.r.i2 = x.r.i2 * y.r.i2;

    return prod;
}
/*
 * Bitwise XOR of two V4DF values.
 * Works on the raw 32-bit word view (u[0..7]) so every bit of the four
 * doubles is covered.
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_XOR(V4DF x, V4DF y)
{
    V4DF bits;
    unsigned int w;

    for (w = 0; w < 8; ++w) {
        bits.u[w] = x.u[w] ^ y.u[w];
    }

    return bits;
}
/*
 * Exchange the two fields inside each pair:
 * (r1, i1, r2, i2) -> (i1, r1, i2, r2).
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_SWAP_PAIRS(V4DF x)
{
    V4DF swapped;

    /* first pair */
    swapped.r.i1 = x.r.r1;
    swapped.r.r1 = x.r.i1;

    /* second pair */
    swapped.r.i2 = x.r.r2;
    swapped.r.r2 = x.r.i2;

    return swapped;
}
/*
 * Combine two values: the first pair (r1, i1) comes from x,
 * the second pair (r2, i2) comes from y.
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_BLEND(V4DF x, V4DF y)
{
    /* start from y so its second pair is already in place ... */
    V4DF mixed = y;

    /* ... then overwrite the first pair with x's */
    mixed.r.r1 = x.r.r1;
    mixed.r.i1 = x.r.i1;

    return mixed;
}
/*
 * Gather the second pairs of both inputs:
 * result = (x.r2, x.i2, y.r2, y.i2).
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_UNPACK_HI(V4DF x, V4DF y)
{
    /* y's second pair already sits in the result's second pair */
    V4DF hi = y;

    /* x's second pair moves down into the result's first pair */
    hi.r.r1 = x.r.r2;
    hi.r.i1 = x.r.i2;

    return hi;
}
/*
 * Gather the first pairs of both inputs:
 * result = (x.r1, x.i1, y.r1, y.i1).
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_UNPACK_LO(V4DF x, V4DF y)
{
    /* x's first pair already sits in the result's first pair */
    V4DF lo = x;

    /* y's first pair moves up into the result's second pair */
    lo.r.r2 = y.r.r1;
    lo.r.i2 = y.r.i1;

    return lo;
}
/*
 * Broadcast the first field of each pair over that pair:
 * (r1, i1, r2, i2) -> (r1, r1, r2, r2).
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_DUPLICATE_RE(V4DF x)
{
    /* r1 and r2 stay where they are; only the i-fields are replaced */
    V4DF dup = x;

    dup.r.i1 = x.r.r1;
    dup.r.i2 = x.r.r2;

    return dup;
}
/*
 * Broadcast the second field of each pair over that pair:
 * (r1, i1, r2, i2) -> (i1, i1, i2, i2).
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_DUPLICATE_IM(V4DF x)
{
    /* i1 and i2 stay where they are; only the r-fields are replaced */
    V4DF dup = x;

    dup.r.r1 = x.r.i1;
    dup.r.r2 = x.r.i2;

    return dup;
}
/*
 * Returns re * d - im * swap_pairs(d), where every operation is field-wise
 * and swap_pairs exchanges the two fields of each pair of d.
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_IMUL(V4DF d, V4DF re, V4DF im)
{
    const V4DF direct  = V4DF_MUL(re, d);
    const V4DF crossed = V4DF_MUL(im, V4DF_SWAP_PAIRS(d));

    return V4DF_SUB(direct, crossed);
}
/*
 * Returns re * d + im * swap_pairs(d), where every operation is field-wise
 * and swap_pairs exchanges the two fields of each pair of d.
 * Identical to V4DF_IMUL except that the two products are added.
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_IMULJ(V4DF d, V4DF re, V4DF im)
{
    const V4DF direct  = V4DF_MUL(re, d);
    const V4DF crossed = V4DF_MUL(im, V4DF_SWAP_PAIRS(d));

    return V4DF_ADD(direct, crossed);
}
/*
 * Negate one field of each pair, selected by inv:
 *   inv != 0: (r1, i1, r2, i2) -> (-r1,  i1, -r2,  i2)
 *   inv == 0: (r1, i1, r2, i2) -> ( r1, -i1,  r2, -i2)
 * The fields are not reordered here; V4DF_IMULI adds the pair swap.
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_MULI(int inv, V4DF x)
{
    /* begin with an unchanged copy, then flip the selected signs */
    V4DF z = x;

    if (inv) {
        z.r.r1 = -x.r.r1;
        z.r.r2 = -x.r.r2;
    } else {
        z.r.i1 = -x.r.i1;
        z.r.i2 = -x.r.i2;
    }

    return z;
}
/*
 * Apply V4DF_MULI(inv, x) and then exchange the two fields of each pair.
 * NOTE(review): presumably multiplication of each (re, im) pair by +/-i,
 * with the sign chosen by inv -- confirm against the transform code.
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_IMULI(int inv, V4DF x)
{
    const V4DF negated = V4DF_MULI(inv, x);

    return V4DF_SWAP_PAIRS(negated);
}
/*
 * Load a V4DF from the memory at s.
 * Copies sizeof(V4DF) bytes with memcpy and returns them by value.
 */
static FFTS_ALWAYS_INLINE V4DF
V4DF_LD(const void *s)
{
    V4DF loaded;

    memcpy(&loaded, s, sizeof(V4DF));

    return loaded;
}
/*
 * Store the V4DF value s to the memory at d.
 *
 * Copies sizeof(V4DF) bytes with memcpy, mirroring V4DF_LD, so the store
 * does not depend on d being aligned for, or typed as, a V4DF object
 * (the previous version wrote through a cast V4DF pointer).
 */
static FFTS_ALWAYS_INLINE void
V4DF_ST(void *d, V4DF s)
{
    memcpy(d, &s, sizeof(s));
}
#endif
#endif /* FFTS_MACROS_SSE_H */

@ -4,6 +4,7 @@ This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
All rights reserved.
@ -41,14 +42,29 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef HAVE_NEON
#include "macros-neon.h"
#elif HAVE_SSE
#ifdef HAVE_AVX
#include "macros-avx.h"
#else
#include "macros-sse.h"
#endif
// NOTE: AltiVec support disabled until updated to provide new V4SF variable type
//#elif __powerpc__
//#include "macros-altivec.h"
#elif __powerpc__
#include "macros-altivec.h"
#else
#include "macros-alpha.h"
#endif
#ifdef FFTS_DOUBLE
/*
 * In-place exchange between *a and *b:
 *   *a becomes V4DF_UNPACK_LO(old *a, old *b)
 *   *b becomes V4DF_UNPACK_HI(old *a, old *b)
 * Both inputs are read before either is overwritten.
 */
static FFTS_INLINE void
V4DF_TX2(V4DF *a, V4DF *b)
{
    const V4DF first  = *a;
    const V4DF second = *b;

    *a = V4DF_UNPACK_LO(first, second);
    *b = V4DF_UNPACK_HI(first, second);
}
#endif
static FFTS_INLINE void
V4SF_TX2(V4SF *a, V4SF *b)
{
@ -58,6 +74,34 @@ V4SF_TX2(V4SF *a, V4SF *b)
*b = t1;
}
#ifdef FFTS_DOUBLE
/*
 * In-place update of the four values *r0..*r3 using the factors re and im.
 *
 * With  p = V4DF_IMUL (*r2, re, im)  and  q = V4DF_IMULJ(*r3, re, im):
 *   *r0 = old *r0 + (p + q)
 *   *r2 = old *r0 - (p + q)
 *   *r1 = old *r1 - V4DF_IMULI(inv, p - q)
 *   *r3 = old *r1 + V4DF_IMULI(inv, p - q)
 * All four inputs are read before any output is written.
 * NOTE(review): looks like a butterfly step of the transform with re/im as
 * twiddle factors -- confirm against the callers.
 */
static FFTS_INLINE void
V4DF_K_N(int inv,
         V4DF re,
         V4DF im,
         V4DF *r0,
         V4DF *r1,
         V4DF *r2,
         V4DF *r3)
{
    const V4DF top    = *r0;
    const V4DF bottom = *r1;
    const V4DF p      = V4DF_IMUL(*r2, re, im);
    const V4DF q      = V4DF_IMULJ(*r3, re, im);
    const V4DF pq_sum = V4DF_ADD(p, q);
    const V4DF pq_rot = V4DF_IMULI(inv, V4DF_SUB(p, q));

    /* stores are issued in the order r2, r0, r3, r1 */
    *r2 = V4DF_SUB(top, pq_sum);
    *r0 = V4DF_ADD(top, pq_sum);
    *r3 = V4DF_ADD(bottom, pq_rot);
    *r1 = V4DF_SUB(bottom, pq_rot);
}
#endif
static FFTS_INLINE void
V4SF_K_N(int inv,
V4SF re,
@ -84,6 +128,45 @@ V4SF_K_N(int inv,
*r1 = V4SF_SUB(uk2, zk_d);
}
#ifdef FFTS_DOUBLE
/*
 * Load one V4DF from each of i0..i3, combine them, and write four results
 * to *r0..*r3.
 *
 * With a..d the loaded values:
 *   *r0 = UNPACK_LO(a + b, a - b)
 *   *r1 = UNPACK_LO(c + d, c - d)
 *   *r3 = UNPACK_HI((c + d) + (a + b), (c - d) - IMULI(inv, a - b))
 *   *r2 = UNPACK_HI((c + d) - (a + b), (c - d) + IMULI(inv, a - b))
 * All loads happen before the first store.
 */
static FFTS_INLINE void
V4DF_L_2_4(int inv,
           const double *FFTS_RESTRICT i0,
           const double *FFTS_RESTRICT i1,
           const double *FFTS_RESTRICT i2,
           const double *FFTS_RESTRICT i3,
           V4DF *r0,
           V4DF *r1,
           V4DF *r2,
           V4DF *r3)
{
    V4DF a, b, c, d;
    V4DF sum_ab, dif_ab, sum_cd, dif_cd, rot_ab;

    a = V4DF_LD(i0);
    b = V4DF_LD(i1);
    c = V4DF_LD(i2);
    d = V4DF_LD(i3);

    sum_ab = V4DF_ADD(a, b);
    dif_ab = V4DF_SUB(a, b);
    sum_cd = V4DF_ADD(c, d);
    dif_cd = V4DF_SUB(c, d);

    /* the first pairs of the sums/differences go out unchanged */
    *r0 = V4DF_UNPACK_LO(sum_ab, dif_ab);
    *r1 = V4DF_UNPACK_LO(sum_cd, dif_cd);

    /* the second pairs are combined across the two input groups */
    rot_ab = V4DF_IMULI(inv, dif_ab);

    *r3 = V4DF_UNPACK_HI(V4DF_ADD(sum_cd, sum_ab), V4DF_SUB(dif_cd, rot_ab));
    *r2 = V4DF_UNPACK_HI(V4DF_SUB(sum_cd, sum_ab), V4DF_ADD(dif_cd, rot_ab));
}
#endif
static FFTS_INLINE void
V4SF_L_2_4(int inv,
const float *FFTS_RESTRICT i0,
@ -121,6 +204,46 @@ V4SF_L_2_4(int inv,
*r2 = V4SF_UNPACK_HI(t2, t3);
}
#ifdef FFTS_DOUBLE
/*
 * Load one V4DF from each of i0..i3, combine them, and write four results
 * to *r0..*r3.
 *
 * With a..d the loaded values and w = IMULI(inv, c - d):
 *   e0 = (a + b) + (c + d)      e1 = (a - b) - w
 *   e2 = (a + b) - (c + d)      e3 = (a - b) + w
 * (e0, e1) and (e2, e3) are each passed through V4DF_TX2, and the results
 * are stored as *r0 = e0, *r2 = e1, *r1 = e2, *r3 = e3.
 */
static FFTS_INLINE void
V4DF_L_4_4(int inv,
           const double *FFTS_RESTRICT i0,
           const double *FFTS_RESTRICT i1,
           const double *FFTS_RESTRICT i2,
           const double *FFTS_RESTRICT i3,
           V4DF *r0,
           V4DF *r1,
           V4DF *r2,
           V4DF *r3)
{
    V4DF a, b, c, d;
    V4DF sum_ab, dif_ab, sum_cd, rot_cd;
    V4DF e0, e1, e2, e3;

    a = V4DF_LD(i0);
    b = V4DF_LD(i1);
    c = V4DF_LD(i2);
    d = V4DF_LD(i3);

    sum_ab = V4DF_ADD(a, b);
    dif_ab = V4DF_SUB(a, b);
    sum_cd = V4DF_ADD(c, d);
    rot_cd = V4DF_IMULI(inv, V4DF_SUB(c, d));

    e0 = V4DF_ADD(sum_ab, sum_cd);
    e1 = V4DF_SUB(dif_ab, rot_cd);
    e2 = V4DF_SUB(sum_ab, sum_cd);
    e3 = V4DF_ADD(dif_ab, rot_cd);

    V4DF_TX2(&e0, &e1);
    V4DF_TX2(&e2, &e3);

    /* note the crossed destinations: e1 goes to r2 and e2 goes to r1 */
    *r0 = e0;
    *r2 = e1;
    *r1 = e2;
    *r3 = e3;
}
#endif
static FFTS_INLINE void
V4SF_L_4_4(int inv,
const float *FFTS_RESTRICT i0,
@ -159,6 +282,48 @@ V4SF_L_4_4(int inv,
*r3 = t3;
}
#ifdef FFTS_DOUBLE
/*
 * Load one V4DF from each of i0..i3, combine them, and write four results
 * to *r0..*r3.
 *
 * The values loaded from i2 and i3 are first re-mixed with V4DF_BLEND:
 *   c = BLEND(ld(i2), ld(i3))    d = BLEND(ld(i3), ld(i2))
 * Then, with a = ld(i0) and b = ld(i1):
 *   *r2 = UNPACK_HI(a + b, a - b)
 *   *r3 = UNPACK_HI(c + d, c - d)
 *   *r0 = UNPACK_LO((a + b) + (c + d), (a - b) - IMULI(inv, c - d))
 *   *r1 = UNPACK_LO((a + b) - (c + d), (a - b) + IMULI(inv, c - d))
 * All loads happen before the first store.
 */
static FFTS_INLINE void
V4DF_L_4_2(int inv,
           const double *FFTS_RESTRICT i0,
           const double *FFTS_RESTRICT i1,
           const double *FFTS_RESTRICT i2,
           const double *FFTS_RESTRICT i3,
           V4DF *r0,
           V4DF *r1,
           V4DF *r2,
           V4DF *r3)
{
    V4DF a, b, raw2, raw3, c, d;
    V4DF sum_ab, dif_ab, sum_cd, dif_cd, rot_cd;

    a    = V4DF_LD(i0);
    b    = V4DF_LD(i1);
    raw2 = V4DF_LD(i2);
    raw3 = V4DF_LD(i3);

    c = V4DF_BLEND(raw2, raw3);
    d = V4DF_BLEND(raw3, raw2);

    sum_ab = V4DF_ADD(a, b);
    dif_ab = V4DF_SUB(a, b);
    sum_cd = V4DF_ADD(c, d);
    dif_cd = V4DF_SUB(c, d);

    /* the second pairs of the sums/differences go out unchanged */
    *r2 = V4DF_UNPACK_HI(sum_ab, dif_ab);
    *r3 = V4DF_UNPACK_HI(sum_cd, dif_cd);

    /* the first pairs are combined across the two input groups */
    rot_cd = V4DF_IMULI(inv, dif_cd);

    *r0 = V4DF_UNPACK_LO(V4DF_ADD(sum_ab, sum_cd), V4DF_SUB(dif_ab, rot_cd));
    *r1 = V4DF_UNPACK_LO(V4DF_SUB(sum_ab, sum_cd), V4DF_ADD(dif_ab, rot_cd));
}
#endif
static FFTS_INLINE void
V4SF_L_4_2(int inv,
const float *FFTS_RESTRICT i0,
@ -199,6 +364,9 @@ V4SF_L_4_2(int inv,
*r1 = V4SF_UNPACK_LO(t2, t3);
}
/*
 * Store four V4DF values: r0 -> o0, r1 -> o1, r2 -> o2, r3 -> o3
 * (each via V4DF_ST).
 * NOTE(review): expands to four separate statements and already ends with
 * a semicolon, so it is only safe as a statement inside a braced block --
 * an unbraced "if (c) V4DF_S_4(...);" would guard just the first store.
 */
#define V4DF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
V4DF_ST(o0, r0); V4DF_ST(o1, r1); V4DF_ST(o2, r2); V4DF_ST(o3, r3);
/*
 * Store four V4SF values: r0 -> o0, r1 -> o1, r2 -> o2, r3 -> o3
 * (each via V4SF_ST).
 * NOTE(review): expands to four separate statements and already ends with
 * a semicolon, so it is only safe as a statement inside a braced block --
 * an unbraced "if (c) V4SF_S_4(...);" would guard just the first store.
 */
#define V4SF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
V4SF_ST(o0, r0); V4SF_ST(o1, r1); V4SF_ST(o2, r2); V4SF_ST(o3, r3);

Loading…
Cancel
Save