Switch FFTS to linkotec branch for cross-arch support

8 years ago · e4e92bf2b0
parent d8856bdf08
commit e4e92bf2b0
129 changed files with 36385 additions and 8358 deletions
--- a/lib/ffts/CMakeLists.txt
+++ b/lib/ffts/CMakeLists.txt
@ -0,0 +1,462 @@
+# Build script for the FFTS library (C sources plus assembly kernels).
+cmake_minimum_required(VERSION 2.8.12 FATAL_ERROR)
+
+project(ffts C ASM)
+
+# TODO: to support AutoConfigure building, this should come from a "template" file
+set(FFTS_MAJOR 0)
+set(FFTS_MINOR 9)
+set(FFTS_MICRO 0)
+
+# Version string in the form "ffts-<major>.<minor>.<micro>"
+set(FFTS_VERSION "ffts-${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}")
+
+# Look for additional CMake modules in the cmake/ directory next to this file
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+set_property(GLOBAL PROPERTY USE_FOLDERS ON)
+
+# No build type selected means no optimization flags, so fall back to Release
+if(NOT CMAKE_BUILD_TYPE)
+  set(CMAKE_BUILD_TYPE "Release")
+endif(NOT CMAKE_BUILD_TYPE)
+
+# User-configurable build switches (boolean cache options).
+option(ENABLE_NEON "Enables the use of NEON instructions." OFF)
+option(ENABLE_VFP "Enables the use of VFP instructions." OFF)
+option(DISABLE_DYNAMIC_CODE "Disables the use of dynamic machine code generation." OFF)
+option(GENERATE_POSITION_INDEPENDENT_CODE "Generate position independent code" OFF)
+
+# Library flavours to build; only the static library is enabled by default.
+option(ENABLE_SHARED "Enable building a shared library." OFF)
+option(ENABLE_STATIC "Enable building a static library." ON)
+
+include(CheckCSourceCompiles)
+include(CheckCSourceRuns)
+include(CheckIncludeFile)
+
+# FFTS_BUILD is defined while FFTS itself is being compiled (as opposed to
+# being used from another project). Used to export functions from Windows DLL.
+add_definitions(-DFFTS_BUILD)
+
+# Probe for the system headers the sources may want to include.
+check_include_file(malloc.h   HAVE_MALLOC_H)
+check_include_file(stdint.h   HAVE_STDINT_H)
+check_include_file(stdlib.h   HAVE_STDLIB_H)
+check_include_file(string.h   HAVE_STRING_H)
+check_include_file(sys/mman.h HAVE_SYS_MMAN_H)
+check_include_file(unistd.h   HAVE_UNISTD_H)
+
+# Publish every header that was found as a -DHAVE_<NAME>_H definition.
+foreach(ffts_header_flag
+    HAVE_MALLOC_H
+    HAVE_STDINT_H
+    HAVE_STDLIB_H
+    HAVE_STRING_H
+    HAVE_SYS_MMAN_H
+    HAVE_UNISTD_H)
+  if(${ffts_header_flag})
+    add_definitions(-D${ffts_header_flag})
+  endif()
+endforeach()
+
+# backup flags
+set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+
+# Determine whether we are cross-compiling; the run-time probes below need a native build
+if(NOT CMAKE_CROSSCOMPILING)
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
+    # Determine the ARM architecture: probe -march=native, then -march=armv7-a,
+    # then -march=armv6, and append the first flag whose test program compiles
+    # and runs to both the C and the ASM flags.
+
+    # Try to execute quietly without messages
+    # NOTE(review): CMAKE_REQUIRED_QUIET is not reset anywhere in the visible
+    # part of this file, so later checks stay quiet too - confirm this is intended.
+    set(CMAKE_REQUIRED_QUIET 1)
+
+    # The test for ARM architecture
+    set(TEST_SOURCE_CODE "int main() { return 0; }")
+
+    # GCC documentation says "native" is only supported on Linux, but let's try
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -march=native")
+    check_c_source_runs("${TEST_SOURCE_CODE}" GCC_MARCH_NATIVE_FLAG_SUPPORTED)
+
+    if(NOT GCC_MARCH_NATIVE_FLAG_SUPPORTED)
+      # Fallback trying generic ARMv7
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -march=armv7-a")
+      check_c_source_runs("${TEST_SOURCE_CODE}" GCC_MARCH_ARMV7A_FLAG_SUPPORTED)
+
+      if(NOT GCC_MARCH_ARMV7A_FLAG_SUPPORTED)
+        # Fallback trying generic ARMv6
+        set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -march=armv6")
+        check_c_source_runs("${TEST_SOURCE_CODE}" GCC_MARCH_ARMV6_FLAG_SUPPORTED)
+
+        if(NOT GCC_MARCH_ARMV6_FLAG_SUPPORTED)
+          # No candidate worked: warn and drop the probe flag again
+          message(WARNING "FFTS failed to determinate ARM architecture")
+          set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+        else()
+          message("FFTS is build using 'march=armv6'")
+          set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -march=armv6")
+          set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv6")
+        endif(NOT GCC_MARCH_ARMV6_FLAG_SUPPORTED)
+      else()
+        message("FFTS is build using 'march=armv7-a'")
+        set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -march=armv7-a")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv7-a")
+      endif(NOT GCC_MARCH_ARMV7A_FLAG_SUPPORTED)
+    else()
+       message("FFTS is build using 'march=native'")
+       set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -march=native")
+       set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
+    endif(NOT GCC_MARCH_NATIVE_FLAG_SUPPORTED)
+
+    # Determine what floating-point hardware (or hardware emulation) is available.
+    # Remember the flags in effect at this point (including any -march flag that
+    # is still set from the probing above) as the base for the FPU probes.
+    set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+
+    # The test for ARM NEON support
+    set(TEST_SOURCE_CODE "
+      #include <arm_neon.h>
+      int main()
+      {
+       float32x4_t v;
+       float zeros[4] = {0.0f, 0.0f, 0.0f, 0.0f};
+       v = vld1q_f32(zeros);
+       return 0;
+      }"
+    )
+
+    # Test running with -mfpu=neon and -mfloat-abi=hard
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -mfpu=neon -mfloat-abi=hard")
+    check_c_source_runs("${TEST_SOURCE_CODE}" NEON_HARDFP_SUPPORTED)
+
+    if(NOT NEON_HARDFP_SUPPORTED)
+      # Test running with -mfpu=neon and -mfloat-abi=softfp
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -mfpu=neon -mfloat-abi=softfp")
+      check_c_source_runs("${TEST_SOURCE_CODE}" NEON_SOFTFP_SUPPORTED)
+      
+      if(NOT NEON_SOFTFP_SUPPORTED)
+        # NEON is unavailable: fatal only if the user explicitly asked for it
+        if(ENABLE_NEON)
+          message(FATAL_ERROR "FFTS cannot enable NEON on this platform")
+        endif(ENABLE_NEON)
+      else()
+        # NEON works with the softfp ABI: use it even if ENABLE_NEON was OFF
+        message("FFTS is using 'neon' FPU and 'softfp' float ABI")
+        set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -mfpu=neon -mfloat-abi=softfp")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -mfloat-abi=softfp")
+        set(ENABLE_NEON ON)
+      endif(NOT NEON_SOFTFP_SUPPORTED)
+    else()
+      # NEON works with the hard-float ABI: use it even if ENABLE_NEON was OFF
+      message("FFTS is using 'neon' FPU and 'hard' float ABI")
+      set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -mfpu=neon -mfloat-abi=hard")
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -mfloat-abi=hard")
+      set(ENABLE_NEON ON)
+    endif(NOT NEON_HARDFP_SUPPORTED)
+
+    # Fallback using VFP if NEON is not supported. First probe -mfpu=vfp, then
+    # independently probe the float ABI (hard, then softfp) with the same program.
+    if(NOT NEON_HARDFP_SUPPORTED AND NOT NEON_SOFTFP_SUPPORTED)
+      # Test for ARM VFP support
+      set(TEST_SOURCE_CODE "
+        double sum(double a, double b)
+        {
+         return a + b;
+        }
+        int main()
+        {
+         double s1, s2, v1 = 1.0, v2 = 2.0, v3 = 1.0e-322;
+         s1 = sum(v1, v2);
+         s2 = sum(v3, v3);
+         return 0;
+        }"
+      )
+
+      # Test running with -mfpu=vfp
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -mfpu=vfp")
+      check_c_source_runs("${TEST_SOURCE_CODE}" VFP_SUPPORTED)
+
+      if(NOT VFP_SUPPORTED)
+        # Fallback using emulation if VFP is not supported
+        # (fatal only if the user explicitly asked for VFP)
+        if(ENABLE_VFP)
+          message(FATAL_ERROR "FFTS cannot enable VFP on this platform")
+        endif(ENABLE_VFP)
+
+        message(WARNING "FFTS is using 'soft' FPU")
+      else()
+        # VFP works: use it even if ENABLE_VFP was OFF, and keep -mfpu=vfp in
+        # the saved flags so the float ABI probes below are run with it
+        message("FFTS is using 'vfp' FPU")
+        set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -mfpu=vfp")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=vfp")
+        set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+        set(ENABLE_VFP ON)
+      endif(NOT VFP_SUPPORTED)
+
+      # Test running with -mfloat-abi=hard
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -mfloat-abi=hard")
+
+      # Use the same test as before
+      check_c_source_runs("${TEST_SOURCE_CODE}" HARDFP_SUPPORTED)
+
+      if(NOT HARDFP_SUPPORTED)
+        # Test running with -mfloat-abi=softfp
+        set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -mfloat-abi=softfp")
+        check_c_source_runs("${TEST_SOURCE_CODE}" SOFTFP_SUPPORTED)
+
+        if(NOT SOFTFP_SUPPORTED)
+          # Most likely development libraries are missing
+          message(WARNING "FFTS is using 'soft' float ABI")
+        else()
+          message("FFTS is using 'softfp' float ABI")
+          set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -mfloat-abi=softfp")
+          set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=softfp")
+        endif(NOT SOFTFP_SUPPORTED)
+      else()
+        message("FFTS is using 'hard' float ABI")
+        set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -mfloat-abi=hard")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=hard")
+      endif(NOT HARDFP_SUPPORTED)
+    endif(NOT NEON_HARDFP_SUPPORTED AND NOT NEON_SOFTFP_SUPPORTED)
+  else()
+    # Non-ARM processors: probe for the SSE, SSE2 and SSE3 intrinsics headers in
+    # turn. After each header that is found, CMAKE_REQUIRED_FLAGS_SAVE is advanced
+    # to the flags used for that check, so the -msse* flags accumulate.
+
+    # enable SSE code generation
+    if(CMAKE_COMPILER_IS_GNUCC)
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -msse")
+    endif(CMAKE_COMPILER_IS_GNUCC)
+
+    # check if the platform has support for SSE intrinsics
+    check_include_file(xmmintrin.h HAVE_XMMINTRIN_H)
+    if(HAVE_XMMINTRIN_H)
+      add_definitions(-DHAVE_SSE)
+      set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+    endif(HAVE_XMMINTRIN_H)
+
+    # enable SSE2 code generation
+    if(CMAKE_COMPILER_IS_GNUCC)
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -msse2")
+    endif(CMAKE_COMPILER_IS_GNUCC)
+
+    # check if the platform has support for SSE2 intrinsics
+    check_include_file(emmintrin.h HAVE_EMMINTRIN_H)
+    if(HAVE_EMMINTRIN_H)
+      add_definitions(-DHAVE_SSE2)
+      set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+    endif(HAVE_EMMINTRIN_H)
+
+    # enable SSE3 code generation
+    if(CMAKE_COMPILER_IS_GNUCC)
+      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS_SAVE} -msse3")
+    endif(CMAKE_COMPILER_IS_GNUCC)
+
+    # check if the platform has support for SSE3 intrinsics
+    check_include_file(pmmintrin.h HAVE_PMMINTRIN_H)
+    if(HAVE_PMMINTRIN_H)
+      add_definitions(-DHAVE_PMMINTRIN_H)
+      add_definitions(-DHAVE_SSE3)
+      set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
+    else()
+      # check if the platform has specific intrinsics
+      # (no pmmintrin.h: try <intrin.h> and test one SSE3 intrinsic directly)
+      check_include_file(intrin.h HAVE_INTRIN_H)
+      if(HAVE_INTRIN_H)
+        add_definitions(-DHAVE_INTRIN_H)
+
+        check_c_source_compiles("
+          #include<intrin.h>
+          int main(int argc, char** argv)
+          {
+           (void) argv;
+           (void) argc;
+           return _mm_movemask_ps(_mm_moveldup_ps(_mm_set_ss(1.0f)));
+          }" HAVE__MM_MOVELDUP_PS
+        )
+
+        if(HAVE__MM_MOVELDUP_PS)
+          # assume that we have all SSE3 intrinsics
+          add_definitions(-DHAVE_SSE3)
+        endif(HAVE__MM_MOVELDUP_PS)
+      endif(HAVE_INTRIN_H)
+    endif(HAVE_PMMINTRIN_H)
+  endif(CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
+else()
+  # TODO: Add detections for compiler support and headers
+endif(NOT CMAKE_CROSSCOMPILING)
+
+# restore flags
+set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
+
+# Compiler-specific settings: warning level, symbol visibility, the math
+# library, and the SSE code-generation level matching the detected headers.
+if(MSVC)
+  # enable all warnings but disable C4127 (conditional expression is constant)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4 /wd4127")
+
+  # mark debug versions
+  set(CMAKE_DEBUG_POSTFIX "d")
+
+  # make <math.h> provide the M_PI family of constants
+  add_definitions(-D_USE_MATH_DEFINES)
+elseif(CMAKE_COMPILER_IS_GNUCC)
+  include(CheckCCompilerFlag)
+  include(CheckLibraryExists)
+
+  # enable all warnings
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wextra")
+
+  # check if we can control visibility of symbols
+  check_c_compiler_flag(-fvisibility=hidden HAVE_GCC_VISIBILITY)
+  if(HAVE_GCC_VISIBILITY)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fvisibility=hidden")
+    add_definitions(-DHAVE_GCC_VISIBILITY)
+  endif(HAVE_GCC_VISIBILITY)
+
+  # some systems need libm for the math functions to work
+  check_library_exists(m pow "" HAVE_LIBM)
+  if(HAVE_LIBM)
+    list(APPEND CMAKE_REQUIRED_LIBRARIES m)
+    list(APPEND FFTS_EXTRA_LIBRARIES m)
+  endif(HAVE_LIBM)
+
+  # Enable the highest SSE level whose intrinsics header was found. These are
+  # code-generation options rather than preprocessor definitions, so they are
+  # added with add_compile_options() (available since CMake 2.8.12) instead
+  # of add_definitions().
+  if(HAVE_PMMINTRIN_H)
+    add_compile_options(-msse3)
+  elseif(HAVE_EMMINTRIN_H)
+    add_compile_options(-msse2)
+  elseif(HAVE_XMMINTRIN_H)
+    add_compile_options(-msse)
+  endif(HAVE_PMMINTRIN_H)
+endif(MSVC)
+
+# Header search paths for every target in this directory: the public headers,
+# the private sources, and the current binary directory.
+include_directories(include)
+include_directories(src)
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
+
+# Public API header
+set(FFTS_HEADERS
+  include/ffts.h
+)
+
+# C sources and private headers that are part of every build; the SIMD and
+# code-generation specific files are appended to this list further down.
+set(FFTS_SOURCES
+  src/ffts_attributes.h
+  src/ffts.c
+  src/ffts_internal.h
+  src/ffts_nd.c
+  src/ffts_nd.h
+  src/ffts_real.h
+  src/ffts_real.c
+  src/ffts_real_nd.c
+  src/ffts_real_nd.h
+  src/ffts_transpose.c
+  src/ffts_transpose.h
+  src/ffts_trig.c
+  src/ffts_trig.h
+  src/ffts_static.c
+  src/ffts_static.h
+  src/macros.h
+  src/patterns.h
+  src/types.h
+)
+
+# Append the architecture-specific sources and definitions. At most one SIMD
+# back end is selected, in this order of preference: NEON, VFP, SSE.
+if(ENABLE_NEON)
+  add_definitions(-DHAVE_NEON)
+  list(APPEND FFTS_SOURCES src/neon.s)
+
+  # the static NEON kernels are added only when dynamic code is disabled
+  if(DISABLE_DYNAMIC_CODE)
+    list(APPEND FFTS_SOURCES src/neon_static.s)
+  endif()
+elseif(ENABLE_VFP)
+  add_definitions(-DHAVE_VFP)
+
+  if(NOT DISABLE_DYNAMIC_CODE)
+    list(APPEND FFTS_SOURCES src/vfp.s)
+  endif()
+elseif(HAVE_XMMINTRIN_H)
+  add_definitions(-DHAVE_SSE)
+  list(APPEND FFTS_SOURCES src/macros-sse.h)
+
+  # the SSE code generator needs 8-byte pointers; otherwise turn dynamic code off
+  if(NOT DISABLE_DYNAMIC_CODE)
+    if(NOT CMAKE_SIZEOF_VOID_P EQUAL 8)
+      message(WARNING "Dynamic code is only supported with x64, disabling dynamic code.")
+      set(DISABLE_DYNAMIC_CODE ON)
+    else()
+      list(APPEND FFTS_SOURCES src/codegen_sse.h)
+    endif()
+  endif()
+endif()
+
+# Either compile the run-time code generator or tell the sources it is absent.
+if(NOT DISABLE_DYNAMIC_CODE)
+  list(APPEND FFTS_SOURCES src/codegen.c src/codegen.h)
+else()
+  add_definitions(-DDYNAMIC_DISABLED)
+endif()
+
+# Optionally request position independent code for the targets defined below.
+if(GENERATE_POSITION_INDEPENDENT_CODE)
+  set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+endif()
+
+# Shared library target (built only when ENABLE_SHARED is ON).
+if(ENABLE_SHARED)
+  add_library(ffts_shared SHARED
+    ${FFTS_HEADERS}
+    ${FFTS_SOURCES}
+  )
+
+  # On unix-like platforms the library is called "libffts.so" and on Windows "ffts.dll"
+  # FFTS_SHARED is defined only while compiling the sources of this target.
+  set_target_properties(ffts_shared PROPERTIES
+    DEFINE_SYMBOL FFTS_SHARED
+    OUTPUT_NAME ffts
+    VERSION ${FFTS_MAJOR}.${FFTS_MINOR}.${FFTS_MICRO}
+  )
+
+  # Link the shared object itself against the extra libraries (libm when it
+  # was detected), so it can be loaded without relying on the consumer to
+  # pull in the math library.
+  if(FFTS_EXTRA_LIBRARIES)
+    target_link_libraries(ffts_shared ${FFTS_EXTRA_LIBRARIES})
+  endif(FFTS_EXTRA_LIBRARIES)
+endif(ENABLE_SHARED)
+
+# Static library target (built only when ENABLE_STATIC is ON).
+if(ENABLE_STATIC)
+  add_library(ffts_static STATIC ${FFTS_HEADERS} ${FFTS_SOURCES})
+
+  # unix-like platforms name the archive "libffts.a"
+  if(UNIX)
+    set_property(TARGET ffts_static PROPERTY OUTPUT_NAME ffts)
+  endif()
+endif()
+
+# Build the test program whenever at least one library flavour is enabled.
+if(ENABLE_STATIC OR ENABLE_SHARED)
+  # "ffts" aliases the static library when it is built, otherwise the shared one
+  if(NOT ENABLE_STATIC)
+    add_library(ffts ALIAS ffts_shared)
+  else()
+    add_library(ffts ALIAS ffts_static)
+  endif()
+
+  add_executable(ffts_test tests/test.c)
+  target_link_libraries(ffts_test ffts ${FFTS_EXTRA_LIBRARIES})
+endif()
--- a/lib/ffts/Makefile.in
+++ b/lib/ffts/Makefile.in
@ -1,7 +1,7 @@
-# Makefile.in generated by automake 1.12.4 from Makefile.am.
+# Makefile.in generated by automake 1.14 from Makefile.am.
 # @configure_input@

-# Copyright (C) 1994-2012 Free Software Foundation, Inc.
+# Copyright (C) 1994-2013 Free Software Foundation, Inc.

 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -15,23 +15,51 @@
@SET_MAKE@

 VPATH = @srcdir@
-am__make_dryrun = \
-  { \
-    am__dry=no; \
+am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
    case $$MAKEFLAGS in \
      *\\[\ \	]*) \
-        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
-          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
-      *) \
-        for am__flg in $$MAKEFLAGS; do \
-          case $$am__flg in \
-            *=*|--*) ;; \
-            *n*) am__dry=yes; break;; \
-          esac; \
-        done;; \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
    esac; \
-    test $$am__dry = yes; \
-  }
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
 pkgdatadir = $(datadir)/@PACKAGE@
 pkgincludedir = $(includedir)/@PACKAGE@
 pkglibdir = $(libdir)/@PACKAGE@
@ -52,10 +80,11 @@ build_triplet = @build@
 host_triplet = @host@
@ENABLE_JNI_TRUE@am__append_1 = java
 subdir = .
-DIST_COMMON = README $(am__configure_deps) $(srcdir)/Makefile.am \
-	$(srcdir)/Makefile.in $(srcdir)/config.h.in \
-	$(srcdir)/ffts.pc.in $(top_srcdir)/configure AUTHORS \
-	config.guess config.sub depcomp install-sh ltmain.sh missing
+DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
+	$(top_srcdir)/configure $(am__configure_deps) \
+	$(srcdir)/config.h.in $(srcdir)/ffts.pc.in AUTHORS README \
+	compile config.guess config.sub depcomp install-sh missing \
+	ltmain.sh
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_classpath.m4 \
 	$(top_srcdir)/m4/ax_check_java_home.m4 \
@ -73,15 +102,28 @@ mkinstalldirs = $(install_sh) -d
 CONFIG_HEADER = config.h
 CONFIG_CLEAN_FILES = ffts.pc
 CONFIG_CLEAN_VPATH_FILES =
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
 SOURCES =
 DIST_SOURCES =
-RECURSIVE_TARGETS = all-recursive check-recursive dvi-recursive \
-	html-recursive info-recursive install-data-recursive \
-	install-dvi-recursive install-exec-recursive \
-	install-html-recursive install-info-recursive \
-	install-pdf-recursive install-ps-recursive install-recursive \
-	installcheck-recursive installdirs-recursive pdf-recursive \
-	ps-recursive uninstall-recursive
+RECURSIVE_TARGETS = all-recursive check-recursive cscopelist-recursive \
+	ctags-recursive dvi-recursive html-recursive info-recursive \
+	install-data-recursive install-dvi-recursive \
+	install-exec-recursive install-html-recursive \
+	install-info-recursive install-pdf-recursive \
+	install-ps-recursive install-recursive installcheck-recursive \
+	installdirs-recursive pdf-recursive ps-recursive \
+	tags-recursive uninstall-recursive
 am__can_run_installinfo = \
  case $$AM_UPDATE_INFO_DIR in \
    n|no|NO) false;; \
@ -118,9 +160,30 @@ am__installdirs = "$(DESTDIR)$(pkgconfigdir)"
 DATA = $(pkgconfig_DATA)
 RECURSIVE_CLEAN_TARGETS = mostlyclean-recursive clean-recursive	\
  distclean-recursive maintainer-clean-recursive
-AM_RECURSIVE_TARGETS = $(RECURSIVE_TARGETS:-recursive=) \
-	$(RECURSIVE_CLEAN_TARGETS:-recursive=) tags TAGS ctags CTAGS \
+am__recursive_targets = \
+  $(RECURSIVE_TARGETS) \
+  $(RECURSIVE_CLEAN_TARGETS) \
+  $(am__extra_recursive_targets)
+AM_RECURSIVE_TARGETS = $(am__recursive_targets:-recursive=) TAGS CTAGS \
 	cscope distdir dist dist-all distcheck
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) \
+	$(LISP)config.h.in
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
 ETAGS = etags
 CTAGS = ctags
 CSCOPE = cscope
@ -169,6 +232,7 @@ am__distuninstallcheck_listfiles = $(distuninstallcheck_listfiles) \
 distcleancheck_listfiles = find . -type f -print
 ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
@ -343,8 +407,8 @@ $(ACLOCAL_M4):  $(am__aclocal_m4_deps)
 $(am__aclocal_m4_deps):

 config.h: stamp-h1
-	@if test ! -f $@; then rm -f stamp-h1; else :; fi
-	@if test ! -f $@; then $(MAKE) $(AM_MAKEFLAGS) stamp-h1; else :; fi
+	@test -f $@ || rm -f stamp-h1
+	@test -f $@ || $(MAKE) $(AM_MAKEFLAGS) stamp-h1

 stamp-h1: $(srcdir)/config.h.in $(top_builddir)/config.status
 	@rm -f stamp-h1
@ -395,14 +459,13 @@ uninstall-pkgconfigDATA:
 # (1) if the variable is set in 'config.status', edit 'config.status'
 #     (which will cause the Makefiles to be regenerated when you run 'make');
 # (2) otherwise, pass the desired values on the 'make' command line.
-$(RECURSIVE_TARGETS) $(RECURSIVE_CLEAN_TARGETS):
-	@fail= failcom='exit 1'; \
-	for f in x $$MAKEFLAGS; do \
-	  case $$f in \
-	    *=* | --[!k]*);; \
-	    *k*) failcom='fail=yes';; \
-	  esac; \
-	done; \
+$(am__recursive_targets):
+	@fail=; \
+	if $(am__make_keepgoing); then \
+	  failcom='fail=yes'; \
+	else \
+	  failcom='exit 1'; \
+	fi; \
 	dot_seen=no; \
 	target=`echo $@ | sed s/-recursive//`; \
 	case "$@" in \
@ -423,31 +486,13 @@ $(RECURSIVE_TARGETS) $(RECURSIVE_CLEAN_TARGETS):
 	if test "$$dot_seen" = "no"; then \
 	  $(MAKE) $(AM_MAKEFLAGS) "$$target-am" || exit 1; \
 	fi; test -z "$$fail"
-tags-recursive:
-	list='$(SUBDIRS)'; for subdir in $$list; do \
-	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) tags); \
-	done
-ctags-recursive:
-	list='$(SUBDIRS)'; for subdir in $$list; do \
-	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) ctags); \
-	done
-cscopelist-recursive:
-	list='$(SUBDIRS)'; for subdir in $$list; do \
-	  test "$$subdir" = . || ($(am__cd) $$subdir && $(MAKE) $(AM_MAKEFLAGS) cscopelist); \
-	done

-ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
-	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
-	mkid -fID $$unique
-tags: TAGS
-
-TAGS: tags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
-		$(TAGS_FILES) $(LISP)
+ID: $(am__tagged_files)
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-recursive
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
 	set x; \
 	here=`pwd`; \
 	if ($(ETAGS) --etags-include --version) >/dev/null 2>&1; then \
@ -463,12 +508,7 @@ TAGS: tags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
 	      set "$$@" "$$include_option=$$here/$$subdir/TAGS"; \
 	  fi; \
 	done; \
-	list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	$(am__define_uniq_tagged_files); \
 	shift; \
 	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
 	  test -n "$$unique" || unique=$$empty_fix; \
@ -480,15 +520,11 @@ TAGS: tags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
 	      $$unique; \
 	  fi; \
 	fi
-ctags: CTAGS
-CTAGS: ctags-recursive $(HEADERS) $(SOURCES) config.h.in $(TAGS_DEPENDENCIES) \
-		$(TAGS_FILES) $(LISP)
-	list='$(SOURCES) $(HEADERS) config.h.in $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+ctags: ctags-recursive
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	$(am__define_uniq_tagged_files); \
 	test -z "$(CTAGS_ARGS)$$unique" \
 	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
 	     $$unique
@ -497,18 +533,16 @@ GTAGS:
 	here=`$(am__cd) $(top_builddir) && pwd` \
 	  && $(am__cd) $(top_srcdir) \
 	  && gtags -i $(GTAGS_ARGS) "$$here"
-
 cscope: cscope.files
 	test ! -s cscope.files \
 	  || $(CSCOPE) -b -q $(AM_CSCOPEFLAGS) $(CSCOPEFLAGS) -i cscope.files $(CSCOPE_ARGS)
-
 clean-cscope:
 	-rm -f cscope.files
+cscope.files: clean-cscope cscopelist
+cscopelist: cscopelist-recursive

-cscope.files: clean-cscope cscopelist-recursive cscopelist
-
-cscopelist: cscopelist-recursive $(HEADERS) $(SOURCES) $(LISP)
-	list='$(SOURCES) $(HEADERS) $(LISP)'; \
+cscopelist-am: $(am__tagged_files)
+	list='$(am__tagged_files)'; \
 	case "$(srcdir)" in \
 	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
 	  *) sdir=$(subdir)/$(srcdir) ;; \
@ -606,10 +640,16 @@ dist-xz: distdir
 	$(am__post_remove_distdir)

 dist-tarZ: distdir
+# Builds a 'compress'-ed tarball, so warn about 'compress' (not about shar).
+	@echo WARNING: "Support for distribution archives compressed with" \
+		       "legacy program 'compress' is deprecated." >&2
+	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	tardir=$(distdir) && $(am__tar) | compress -c >$(distdir).tar.Z
 	$(am__post_remove_distdir)

 dist-shar: distdir
+# Builds a shar archive, so warn about shar (not about 'compress').
+	@echo WARNING: "Support for shar distribution archives is" \
+	               "deprecated." >&2
+	@echo WARNING: "It will be removed altogether in Automake 2.0" >&2
 	shar $(distdir) | GZIP=$(GZIP_ENV) gzip -c >$(distdir).shar.gz
 	$(am__post_remove_distdir)

@ -814,27 +854,24 @@ ps-am:

 uninstall-am: uninstall-pkgconfigDATA

-.MAKE: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) all \
-	cscopelist-recursive ctags-recursive install-am install-strip \
-	tags-recursive
-
-.PHONY: $(RECURSIVE_CLEAN_TARGETS) $(RECURSIVE_TARGETS) CTAGS GTAGS \
-	all all-am am--refresh check check-am clean clean-cscope \
-	clean-generic clean-libtool cscope cscopelist \
-	cscopelist-recursive ctags ctags-recursive dist dist-all \
-	dist-bzip2 dist-gzip dist-lzip dist-shar dist-tarZ dist-xz \
-	dist-zip distcheck distclean distclean-generic distclean-hdr \
-	distclean-libtool distclean-tags distcleancheck distdir \
-	distuninstallcheck dvi dvi-am html html-am info info-am \
-	install install-am install-data install-data-am install-dvi \
-	install-dvi-am install-exec install-exec-am install-html \
-	install-html-am install-info install-info-am install-man \
-	install-pdf install-pdf-am install-pkgconfigDATA install-ps \
-	install-ps-am install-strip installcheck installcheck-am \
-	installdirs installdirs-am maintainer-clean \
+.MAKE: $(am__recursive_targets) all install-am install-strip
+
+.PHONY: $(am__recursive_targets) CTAGS GTAGS TAGS all all-am \
+	am--refresh check check-am clean clean-cscope clean-generic \
+	clean-libtool cscope cscopelist-am ctags ctags-am dist \
+	dist-all dist-bzip2 dist-gzip dist-lzip dist-shar dist-tarZ \
+	dist-xz dist-zip distcheck distclean distclean-generic \
+	distclean-hdr distclean-libtool distclean-tags distcleancheck \
+	distdir distuninstallcheck dvi dvi-am html html-am info \
+	info-am install install-am install-data install-data-am \
+	install-dvi install-dvi-am install-exec install-exec-am \
+	install-html install-html-am install-info install-info-am \
+	install-man install-pdf install-pdf-am install-pkgconfigDATA \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs installdirs-am maintainer-clean \
 	maintainer-clean-generic mostlyclean mostlyclean-generic \
-	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-recursive \
-	uninstall uninstall-am uninstall-pkgconfigDATA
+	mostlyclean-libtool pdf pdf-am ps ps-am tags tags-am uninstall \
+	uninstall-am uninstall-pkgconfigDATA


 # Tell versions [3.59,3.63) of GNU make to not export all variables.
--- a/lib/ffts/README
+++ b/lib/ffts/README
@ -1,27 +0,0 @@
-FFTS -- The Fastest Fourier Transform in the South
-by Anthony Blake <anthonix@me.com>
-
-To build for Android, edit and run build_android.sh
-
-To build for iOS, edit and run build_iphone.sh 
-
-To build for Linux or OS X on x86, run 
-./configure --enable-sse --enable-single --prefix=/usr/local
-make
-make install
-
-FFTS dynamically generates code at runtime. This can be disabled with 
--disable-dynamic-code
-
-For JNI targets: --enable-jni will build the jni stuff automatically for
-the host target, and --enable-shared must also be added manually for it to
-work.
-
-If you like FFTS, please show your support by sending a postcard to:
-
-Anthony Blake
-Department of Computer Science
-The University of Waikato
-Private Bag 3105
-Hamilton 3240
-NEW ZEALAND
--- a/lib/ffts/README.md
+++ b/lib/ffts/README.md
@ -0,0 +1,35 @@
+# FFTS -- The Fastest Fourier Transform in the South
+
+[![Build Status](https://travis-ci.org/linkotec/ffts.svg?branch=master)](https://travis-ci.org/linkotec/ffts)
+
+To build for Android, edit and run build_android.sh
+
+To build for iOS, edit and run build_iphone.sh 
+
+To build for Linux or OS X on x86, run:
+
+    ./configure --enable-sse --enable-single --prefix=/usr/local
+    make
+    make install
+
+Optionally, to build for Windows and Linux with CMake, run:
+
+    mkdir build
+    cd build
+    cmake ..
+    cmake --build .
+  
+FFTS dynamically generates code at runtime. This can be disabled by passing
+`--disable-dynamic-code` to `configure`, or `-DDISABLE_DYNAMIC_CODE=ON` to CMake.
+
+Note that 32-bit x86 dynamic machine code generation is not supported at the moment.
+
+For JNI targets: --enable-jni will build the jni stuff automatically for
+the host target, and --enable-shared must also be added manually for it to
+work.
+
+If you like FFTS, please show your support by sending a postcard to:
+
+Anthony Blake<br>
+Department of Computer Science<br>
+The University of Waikato<br>
+Private Bag 3105<br>
+Hamilton 3240<br>
+NEW ZEALAND
--- a/lib/ffts/aclocal.m4
+++ b/lib/ffts/aclocal.m4
@ -1,6 +1,6 @@
-# generated automatically by aclocal 1.12.4 -*- Autoconf -*-
+# generated automatically by aclocal 1.14 -*- Autoconf -*-

-# Copyright (C) 1996-2012 Free Software Foundation, Inc.
+# Copyright (C) 1996-2013 Free Software Foundation, Inc.

 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -11,6 +11,7 @@
 # even the implied warranty of MERCHANTABILITY or FITNESS FOR A
 # PARTICULAR PURPOSE.

+m4_ifndef([AC_CONFIG_MACRO_DIRS], [m4_defun([_AM_CONFIG_MACRO_DIRS], [])m4_defun([AC_CONFIG_MACRO_DIRS], [_AM_CONFIG_MACRO_DIRS($@)])])
 m4_ifndef([AC_AUTOCONF_VERSION],
  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 m4_if(m4_defn([AC_AUTOCONF_VERSION]), [2.69],,
@ -8606,7 +8607,7 @@ m4_ifndef([_LT_PROG_F77],		[AC_DEFUN([_LT_PROG_F77])])
 m4_ifndef([_LT_PROG_FC],		[AC_DEFUN([_LT_PROG_FC])])
 m4_ifndef([_LT_PROG_CXX],		[AC_DEFUN([_LT_PROG_CXX])])

-# Copyright (C) 2002-2012 Free Software Foundation, Inc.
+# Copyright (C) 2002-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -8618,10 +8619,10 @@ m4_ifndef([_LT_PROG_CXX],		[AC_DEFUN([_LT_PROG_CXX])])
 # generated from the m4 files accompanying Automake X.Y.
 # (This private macro should not be called outside this file.)
 AC_DEFUN([AM_AUTOMAKE_VERSION],
-[am__api_version='1.12'
+[am__api_version='1.14'
 dnl Some users find AM_AUTOMAKE_VERSION and mistake it for a way to
 dnl require some minimum version.  Point them to the right macro.
-m4_if([$1], [1.12.4], [],
+m4_if([$1], [1.14], [],
      [AC_FATAL([Do not call $0, use AM_INIT_AUTOMAKE([$1]).])])dnl
 ])

@ -8637,14 +8638,14 @@ m4_define([_AM_AUTOCONF_VERSION], [])
 # Call AM_AUTOMAKE_VERSION and AM_AUTOMAKE_VERSION so they can be traced.
 # This function is AC_REQUIREd by AM_INIT_AUTOMAKE.
 AC_DEFUN([AM_SET_CURRENT_AUTOMAKE_VERSION],
-[AM_AUTOMAKE_VERSION([1.12.4])dnl
+[AM_AUTOMAKE_VERSION([1.14])dnl
 m4_ifndef([AC_AUTOCONF_VERSION],
  [m4_copy([m4_PACKAGE_VERSION], [AC_AUTOCONF_VERSION])])dnl
 _AM_AUTOCONF_VERSION(m4_defn([AC_AUTOCONF_VERSION]))])

 # Figure out how to run the assembler.                      -*- Autoconf -*-

-# Copyright (C) 2001-2012 Free Software Foundation, Inc.
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -8664,7 +8665,7 @@ _AM_IF_OPTION([no-dependencies],, [_AM_DEPENDENCIES([CCAS])])dnl

 # AM_AUX_DIR_EXPAND                                         -*- Autoconf -*-

-# Copyright (C) 2001-2012 Free Software Foundation, Inc.
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -8717,7 +8718,7 @@ am_aux_dir=`cd $ac_aux_dir && pwd`

 # AM_CONDITIONAL                                            -*- Autoconf -*-

-# Copyright (C) 1997-2012 Free Software Foundation, Inc.
+# Copyright (C) 1997-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -8748,7 +8749,7 @@ AC_CONFIG_COMMANDS_PRE(
 Usually this means the macro was only invoked conditionally.]])
 fi])])

-# Copyright (C) 1999-2012 Free Software Foundation, Inc.
+# Copyright (C) 1999-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -8939,7 +8940,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl

 # Generate code to set up dependency tracking.              -*- Autoconf -*-

-# Copyright (C) 1999-2012 Free Software Foundation, Inc.
+# Copyright (C) 1999-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -8950,7 +8951,7 @@ _AM_SUBST_NOTMAKE([am__nodep])dnl
 # ------------------------------
 AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
 [{
-  # Autoconf 2.62 quotes --file arguments for eval, but not when files
+  # Older Autoconf quotes --file arguments for eval, but not when files
  # are listed without --file.  Let's play safe and only enable the eval
  # if we detect the quoting.
  case $CONFIG_FILES in
@ -8979,7 +8980,7 @@ AC_DEFUN([_AM_OUTPUT_DEPENDENCY_COMMANDS],
    DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
    test -z "$DEPDIR" && continue
    am__include=`sed -n 's/^am__include = //p' < "$mf"`
-    test -z "am__include" && continue
+    test -z "$am__include" && continue
    am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
    # Find all dependency output files, they are included files with
    # $(DEPDIR) in their names.  We invoke sed twice because it is the
@ -9015,7 +9016,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],

 # Do all the work for Automake.                             -*- Autoconf -*-

-# Copyright (C) 1996-2012 Free Software Foundation, Inc.
+# Copyright (C) 1996-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -9024,6 +9025,12 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
 # This macro actually does too much.  Some checks are only needed if
 # your package does certain things.  But this isn't really a big deal.

+dnl Redefine AC_PROG_CC to automatically invoke _AM_PROG_CC_C_O.
+m4_define([AC_PROG_CC],
+m4_defn([AC_PROG_CC])
+[_AM_PROG_CC_C_O
+])
+
 # AM_INIT_AUTOMAKE(PACKAGE, VERSION, [NO-DEFINE])
 # AM_INIT_AUTOMAKE([OPTIONS])
 # -----------------------------------------------
@ -9036,7 +9043,7 @@ AC_DEFUN([AM_OUTPUT_DEPENDENCY_COMMANDS],
 # arguments mandatory, and then we can depend on a new Autoconf
 # release and drop the old call support.
 AC_DEFUN([AM_INIT_AUTOMAKE],
-[AC_PREREQ([2.62])dnl
+[AC_PREREQ([2.65])dnl
 dnl Autoconf wants to disallow AM_ names.  We explicitly allow
 dnl the ones we care about.
 m4_pattern_allow([^AM_[A-Z]+FLAGS$])dnl
@ -9066,8 +9073,7 @@ AC_SUBST([CYGPATH_W])
 dnl Distinguish between old-style and new-style calls.
 m4_ifval([$2],
 [AC_DIAGNOSE([obsolete],
-[$0: two- and three-arguments forms are deprecated.  For more info, see:
-http://www.gnu.org/software/automake/manual/automake.html#Modernize-AM_INIT_AUTOMAKE-invocation])
+             [$0: two- and three-arguments forms are deprecated.])
 m4_ifval([$3], [_AM_SET_OPTION([no-define])])dnl
 AC_SUBST([PACKAGE], [$1])dnl
 AC_SUBST([VERSION], [$2])],
@ -9121,22 +9127,60 @@ AC_PROVIDE_IFELSE([AC_PROG_OBJC],
 		  [_AM_DEPENDENCIES([OBJC])],
 		  [m4_define([AC_PROG_OBJC],
 			     m4_defn([AC_PROG_OBJC])[_AM_DEPENDENCIES([OBJC])])])dnl
-dnl Support for Objective C++ was only introduced in Autoconf 2.65,
-dnl but we still cater to Autoconf 2.62.
-m4_ifdef([AC_PROG_OBJCXX],
-[AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
+AC_PROVIDE_IFELSE([AC_PROG_OBJCXX],
 		  [_AM_DEPENDENCIES([OBJCXX])],
 		  [m4_define([AC_PROG_OBJCXX],
-			     m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])])dnl
+			     m4_defn([AC_PROG_OBJCXX])[_AM_DEPENDENCIES([OBJCXX])])])dnl
 ])
-_AM_IF_OPTION([silent-rules], [AC_REQUIRE([AM_SILENT_RULES])])dnl
-dnl The 'parallel-tests' driver may need to know about EXEEXT, so add the
-dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen.  This macro
-dnl is hooked onto _AC_COMPILER_EXEEXT early, see below.
+AC_REQUIRE([AM_SILENT_RULES])dnl
+dnl The testsuite driver may need to know about EXEEXT, so add the
+dnl 'am__EXEEXT' conditional if _AM_COMPILER_EXEEXT was seen.  This
+dnl macro is hooked onto _AC_COMPILER_EXEEXT early, see below.
 AC_CONFIG_COMMANDS_PRE(dnl
 [m4_provide_if([_AM_COMPILER_EXEEXT],
  [AM_CONDITIONAL([am__EXEEXT], [test -n "$EXEEXT"])])])dnl
-])
+
+# POSIX will say in a future version that running "rm -f" with no argument
+# is OK; and we want to be able to make that assumption in our Makefile
+# recipes.  So use an aggressive probe to check that the usage we want is
+# actually supported "in the wild" to an acceptable degree.
+# See automake bug#10828.
+# To make any issue more visible, cause the running configure to be aborted
+# by default if the 'rm' program in use doesn't match our expectations; the
+# user can still override this though.
+if rm -f && rm -fr && rm -rf; then : OK; else
+  cat >&2 <<'END'
+Oops!
+
+Your 'rm' program seems unable to run without file operands specified
+on the command line, even when the '-f' option is present.  This is contrary
+to the behaviour of most rm programs out there, and not conforming with
+the upcoming POSIX standard: <http://austingroupbugs.net/view.php?id=542>
+
+Please tell bug-automake@gnu.org about your system, including the value
+of your $PATH and any error possibly output before this message.  This
+can help us improve future automake versions.
+
+END
+  if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then
+    echo 'Configuration will proceed anyway, since you have set the' >&2
+    echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2
+    echo >&2
+  else
+    cat >&2 <<'END'
+Aborting the configuration process, to ensure you take notice of the issue.
+
+You can download and install GNU coreutils to get an 'rm' implementation
+that behaves properly: <http://www.gnu.org/software/coreutils/>.
+
+If you want to complete the configuration process using your problematic
+'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM
+to "yes", and re-run configure.
+
+END
+    AC_MSG_ERROR([Your 'rm' program is bad, sorry.])
+  fi
+fi])

 dnl Hook into '_AC_COMPILER_EXEEXT' early to learn its expansion.  Do not
 dnl add the conditional right here, as _AC_COMPILER_EXEEXT may be further
@ -9144,7 +9188,6 @@ dnl mangled by Autoconf and run in a shell conditional statement.
 m4_define([_AC_COMPILER_EXEEXT],
 m4_defn([_AC_COMPILER_EXEEXT])[m4_provide([_AM_COMPILER_EXEEXT])])

-
 # When config.status generates a header, we must update the stamp-h file.
 # This file resides in the same directory as the config header
 # that is generated.  The stamp files are numbered to have different names.
@ -9166,7 +9209,7 @@ for _am_header in $config_headers :; do
 done
 echo "timestamp for $_am_arg" >`AS_DIRNAME(["$_am_arg"])`/stamp-h[]$_am_stamp_count])

-# Copyright (C) 2001-2012 Free Software Foundation, Inc.
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -9187,7 +9230,7 @@ if test x"${install_sh}" != xset; then
 fi
 AC_SUBST([install_sh])])

-# Copyright (C) 2003-2012 Free Software Foundation, Inc.
+# Copyright (C) 2003-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -9208,7 +9251,7 @@ AC_SUBST([am__leading_dot])])

 # Check to see how 'make' treats includes.	            -*- Autoconf -*-

-# Copyright (C) 2001-2012 Free Software Foundation, Inc.
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -9258,7 +9301,7 @@ rm -f confinc confmf

 # Fake the existence of programs that GNU maintainers use.  -*- Autoconf -*-

-# Copyright (C) 1997-2012 Free Software Foundation, Inc.
+# Copyright (C) 1997-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -9273,8 +9316,8 @@ AC_SUBST($1)])

 # AM_MISSING_HAS_RUN
 # ------------------
-# Define MISSING if not defined so far and test if it supports --run.
-# If it does, set am_missing_run to use it, otherwise, to nothing.
+# Define MISSING if not defined so far and test if it is modern enough.
+# If it is, set am_missing_run to use it, otherwise, to nothing.
 AC_DEFUN([AM_MISSING_HAS_RUN],
 [AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
 AC_REQUIRE_AUX_FILE([missing])dnl
@ -9287,8 +9330,8 @@ if test x"${MISSING+set}" != xset; then
  esac
 fi
 # Use eval to expand $SHELL
-if eval "$MISSING --run true"; then
-  am_missing_run="$MISSING --run "
+if eval "$MISSING --is-lightweight"; then
+  am_missing_run="$MISSING "
 else
  am_missing_run=
  AC_MSG_WARN(['missing' script is too old or missing])
@ -9297,7 +9340,7 @@ fi

 # Helper functions for option handling.                     -*- Autoconf -*-

-# Copyright (C) 2001-2012 Free Software Foundation, Inc.
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -9326,9 +9369,73 @@ AC_DEFUN([_AM_SET_OPTIONS],
 AC_DEFUN([_AM_IF_OPTION],
 [m4_ifset(_AM_MANGLE_OPTION([$1]), [$2], [$3])])

+# Copyright (C) 1999-2013 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# _AM_PROG_CC_C_O
+# ---------------
+# Like AC_PROG_CC_C_O, but changed for automake.  We rewrite AC_PROG_CC
+# to automatically call this.
+AC_DEFUN([_AM_PROG_CC_C_O],
+[AC_REQUIRE([AM_AUX_DIR_EXPAND])dnl
+AC_REQUIRE_AUX_FILE([compile])dnl
+AC_LANG_PUSH([C])dnl
+AC_CACHE_CHECK(
+  [whether $CC understands -c and -o together],
+  [am_cv_prog_cc_c_o],
+  [AC_LANG_CONFTEST([AC_LANG_PROGRAM([])])
+  # Make sure it works both with $CC and with simple cc.
+  # Following AC_PROG_CC_C_O, we do the test twice because some
+  # compilers refuse to overwrite an existing .o file with -o,
+  # though they will create one.
+  am_cv_prog_cc_c_o=yes
+  for am_i in 1 2; do
+    if AM_RUN_LOG([$CC -c conftest.$ac_ext -o conftest2.$ac_objext]) \
+         && test -f conftest2.$ac_objext; then
+      : OK
+    else
+      am_cv_prog_cc_c_o=no
+      break
+    fi
+  done
+  rm -f core conftest*
+  unset am_i])
+if test "$am_cv_prog_cc_c_o" != yes; then
+   # Losing compiler, so override with the script.
+   # FIXME: It is wrong to rewrite CC.
+   # But if we don't then we get into trouble of one sort or another.
+   # A longer-term fix would be to have automake use am__CC in this case,
+   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
+   CC="$am_aux_dir/compile $CC"
+fi
+AC_LANG_POP([C])])
+
+# For backward compatibility.
+AC_DEFUN_ONCE([AM_PROG_CC_C_O], [AC_REQUIRE([AC_PROG_CC])])
+
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_RUN_LOG(COMMAND)
+# -------------------
+# Run COMMAND, save the exit status in ac_status, and log it.
+# (This has been adapted from Autoconf's _AC_RUN_LOG macro.)
+AC_DEFUN([AM_RUN_LOG],
+[{ echo "$as_me:$LINENO: $1" >&AS_MESSAGE_LOG_FD
+   ($1) >&AS_MESSAGE_LOG_FD 2>&AS_MESSAGE_LOG_FD
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&AS_MESSAGE_LOG_FD
+   (exit $ac_status); }])
+
 # Check to make sure that the build environment is sane.    -*- Autoconf -*-

-# Copyright (C) 1996-2012 Free Software Foundation, Inc.
+# Copyright (C) 1996-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -9409,7 +9516,67 @@ AC_CONFIG_COMMANDS_PRE(
 rm -f conftest.file
 ])

-# Copyright (C) 2001-2012 Free Software Foundation, Inc.
+# Copyright (C) 2009-2013 Free Software Foundation, Inc.
+#
+# This file is free software; the Free Software Foundation
+# gives unlimited permission to copy and/or distribute it,
+# with or without modifications, as long as this notice is preserved.
+
+# AM_SILENT_RULES([DEFAULT])
+# --------------------------
+# Enable less verbose build rules; with the default set to DEFAULT
+# ("yes" being less verbose, "no" or empty being verbose).
+AC_DEFUN([AM_SILENT_RULES],
+[AC_ARG_ENABLE([silent-rules], [dnl
+AS_HELP_STRING(
+  [--enable-silent-rules],
+  [less verbose build output (undo: "make V=1")])
+AS_HELP_STRING(
+  [--disable-silent-rules],
+  [verbose build output (undo: "make V=0")])dnl
+])
+case $enable_silent_rules in @%:@ (((
+  yes) AM_DEFAULT_VERBOSITY=0;;
+   no) AM_DEFAULT_VERBOSITY=1;;
+    *) AM_DEFAULT_VERBOSITY=m4_if([$1], [yes], [0], [1]);;
+esac
+dnl
+dnl A few 'make' implementations (e.g., NonStop OS and NextStep)
+dnl do not support nested variable expansions.
+dnl See automake bug#9928 and bug#10237.
+am_make=${MAKE-make}
+AC_CACHE_CHECK([whether $am_make supports nested variables],
+   [am_cv_make_support_nested_variables],
+   [if AS_ECHO([['TRUE=$(BAR$(V))
+BAR0=false
+BAR1=true
+V=1
+am__doit:
+	@$(TRUE)
+.PHONY: am__doit']]) | $am_make -f - >/dev/null 2>&1; then
+  am_cv_make_support_nested_variables=yes
+else
+  am_cv_make_support_nested_variables=no
+fi])
+if test $am_cv_make_support_nested_variables = yes; then
+  dnl Using '$V' instead of '$(V)' breaks IRIX make.
+  AM_V='$(V)'
+  AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)'
+else
+  AM_V=$AM_DEFAULT_VERBOSITY
+  AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY
+fi
+AC_SUBST([AM_V])dnl
+AM_SUBST_NOTMAKE([AM_V])dnl
+AC_SUBST([AM_DEFAULT_V])dnl
+AM_SUBST_NOTMAKE([AM_DEFAULT_V])dnl
+AC_SUBST([AM_DEFAULT_VERBOSITY])dnl
+AM_BACKSLASH='\'
+AC_SUBST([AM_BACKSLASH])dnl
+_AM_SUBST_NOTMAKE([AM_BACKSLASH])dnl
+])
+
+# Copyright (C) 2001-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -9437,7 +9604,7 @@ fi
 INSTALL_STRIP_PROGRAM="\$(install_sh) -c -s"
 AC_SUBST([INSTALL_STRIP_PROGRAM])])

-# Copyright (C) 2006-2012 Free Software Foundation, Inc.
+# Copyright (C) 2006-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -9456,7 +9623,7 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])

 # Check how to create a tarball.                            -*- Autoconf -*-

-# Copyright (C) 2004-2012 Free Software Foundation, Inc.
+# Copyright (C) 2004-2013 Free Software Foundation, Inc.
 #
 # This file is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -9475,76 +9642,114 @@ AC_DEFUN([AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE($@)])
 # Substitute a variable $(am__untar) that extract such
 # a tarball read from stdin.
 #     $(am__untar) < result.tar
+#
 AC_DEFUN([_AM_PROG_TAR],
 [# Always define AMTAR for backward compatibility.  Yes, it's still used
 # in the wild :-(  We should find a proper way to deprecate it ...
 AC_SUBST([AMTAR], ['$${TAR-tar}'])
-m4_if([$1], [v7],
-     [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'],
-     [m4_case([$1], [ustar],, [pax],,
-              [m4_fatal([Unknown tar format])])
-AC_MSG_CHECKING([how to create a $1 tar archive])
-# Loop over all known methods to create a tar archive until one works.
+
+# We'll loop over all known methods to create a tar archive until one works.
 _am_tools='gnutar m4_if([$1], [ustar], [plaintar]) pax cpio none'
-_am_tools=${am_cv_prog_tar_$1-$_am_tools}
-# Do not fold the above two line into one, because Tru64 sh and
-# Solaris sh will not grok spaces in the rhs of '-'.
-for _am_tool in $_am_tools
-do
-  case $_am_tool in
-  gnutar)
-    for _am_tar in tar gnutar gtar;
-    do
-      AM_RUN_LOG([$_am_tar --version]) && break
-    done
-    am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"'
-    am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"'
-    am__untar="$_am_tar -xf -"
-    ;;
-  plaintar)
-    # Must skip GNU tar: if it does not support --format= it doesn't create
-    # ustar tarball either.
-    (tar --version) >/dev/null 2>&1 && continue
-    am__tar='tar chf - "$$tardir"'
-    am__tar_='tar chf - "$tardir"'
-    am__untar='tar xf -'
-    ;;
-  pax)
-    am__tar='pax -L -x $1 -w "$$tardir"'
-    am__tar_='pax -L -x $1 -w "$tardir"'
-    am__untar='pax -r'
-    ;;
-  cpio)
-    am__tar='find "$$tardir" -print | cpio -o -H $1 -L'
-    am__tar_='find "$tardir" -print | cpio -o -H $1 -L'
-    am__untar='cpio -i -H $1 -d'
-    ;;
-  none)
-    am__tar=false
-    am__tar_=false
-    am__untar=false
-    ;;
-  esac

-  # If the value was cached, stop now.  We just wanted to have am__tar
-  # and am__untar set.
-  test -n "${am_cv_prog_tar_$1}" && break
+m4_if([$1], [v7],
+  [am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'],
+
+  [m4_case([$1],
+    [ustar],
+     [# The POSIX 1988 'ustar' format is defined with fixed-size fields.
+      # There is notably a 21 bits limit for the UID and the GID.  In fact,
+      # the 'pax' utility can hang on bigger UID/GID (see automake bug#8343
+      # and bug#13588).
+      am_max_uid=2097151 # 2^21 - 1
+      am_max_gid=$am_max_uid
+      # The $UID and $GID variables are not portable, so we need to resort
+      # to the POSIX-mandated id(1) utility.  Errors in the 'id' calls
+      # below are definitely unexpected, so allow the users to see them
+      # (that is, avoid stderr redirection).
+      am_uid=`id -u || echo unknown`
+      am_gid=`id -g || echo unknown`
+      AC_MSG_CHECKING([whether UID '$am_uid' is supported by ustar format])
+      if test $am_uid -le $am_max_uid; then
+         AC_MSG_RESULT([yes])
+      else
+         AC_MSG_RESULT([no])
+         _am_tools=none
+      fi
+      AC_MSG_CHECKING([whether GID '$am_gid' is supported by ustar format])
+      if test $am_gid -le $am_max_gid; then
+         AC_MSG_RESULT([yes])
+      else
+        AC_MSG_RESULT([no])
+        _am_tools=none
+      fi],
+
+  [pax],
+    [],

-  # tar/untar a dummy directory, and stop if the command works
-  rm -rf conftest.dir
-  mkdir conftest.dir
-  echo GrepMe > conftest.dir/file
-  AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar])
+  [m4_fatal([Unknown tar format])])
+
+  AC_MSG_CHECKING([how to create a $1 tar archive])
+
+  # Go ahead even if we have the value already cached.  We do so because we
+  # need to set the values for the 'am__tar' and 'am__untar' variables.
+  _am_tools=${am_cv_prog_tar_$1-$_am_tools}
+
+  for _am_tool in $_am_tools; do
+    case $_am_tool in
+    gnutar)
+      for _am_tar in tar gnutar gtar; do
+        AM_RUN_LOG([$_am_tar --version]) && break
+      done
+      am__tar="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$$tardir"'
+      am__tar_="$_am_tar --format=m4_if([$1], [pax], [posix], [$1]) -chf - "'"$tardir"'
+      am__untar="$_am_tar -xf -"
+      ;;
+    plaintar)
+      # Must skip GNU tar: if it does not support --format= it doesn't create
+      # ustar tarball either.
+      (tar --version) >/dev/null 2>&1 && continue
+      am__tar='tar chf - "$$tardir"'
+      am__tar_='tar chf - "$tardir"'
+      am__untar='tar xf -'
+      ;;
+    pax)
+      am__tar='pax -L -x $1 -w "$$tardir"'
+      am__tar_='pax -L -x $1 -w "$tardir"'
+      am__untar='pax -r'
+      ;;
+    cpio)
+      am__tar='find "$$tardir" -print | cpio -o -H $1 -L'
+      am__tar_='find "$tardir" -print | cpio -o -H $1 -L'
+      am__untar='cpio -i -H $1 -d'
+      ;;
+    none)
+      am__tar=false
+      am__tar_=false
+      am__untar=false
+      ;;
+    esac
+
+    # If the value was cached, stop now.  We just wanted to have am__tar
+    # and am__untar set.
+    test -n "${am_cv_prog_tar_$1}" && break
+
+    # tar/untar a dummy directory, and stop if the command works.
+    rm -rf conftest.dir
+    mkdir conftest.dir
+    echo GrepMe > conftest.dir/file
+    AM_RUN_LOG([tardir=conftest.dir && eval $am__tar_ >conftest.tar])
+    rm -rf conftest.dir
+    if test -s conftest.tar; then
+      AM_RUN_LOG([$am__untar <conftest.tar])
+      AM_RUN_LOG([cat conftest.dir/file])
+      grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
+    fi
+  done
  rm -rf conftest.dir
-  if test -s conftest.tar; then
-    AM_RUN_LOG([$am__untar <conftest.tar])
-    grep GrepMe conftest.dir/file >/dev/null 2>&1 && break
-  fi
-done
-rm -rf conftest.dir

-AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool])
-AC_MSG_RESULT([$am_cv_prog_tar_$1])])
+  AC_CACHE_VAL([am_cv_prog_tar_$1], [am_cv_prog_tar_$1=$_am_tool])
+  AC_MSG_RESULT([$am_cv_prog_tar_$1])])
+
 AC_SUBST([am__tar])
 AC_SUBST([am__untar])
 ]) # _AM_PROG_TAR
--- a/lib/ffts/config.h.in
+++ b/lib/ffts/config.h.in
@ -9,9 +9,6 @@
 /* Define to FFT in single precision. */
 #undef FFTS_PREC_SINGLE

-/* Set ARM float abi. */
-#undef FLOAT_ABI
-
 /* Define to 1 if you have the declaration of `memalign', and to 0 if you
   don't. */
 #undef HAVE_DECL_MEMALIGN
@ -146,3 +143,5 @@
 /* Define to the type of an unsigned integer type of width exactly 64 bits if
   such a type exists and the standard includes do not define it. */
 #undef uint64_t
+
+// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
--- a/lib/ffts/configure
+++ b/lib/ffts/configure
@ -713,6 +713,10 @@ build_os
 build_vendor
 build_cpu
 build
+AM_BACKSLASH
+AM_DEFAULT_VERBOSITY
+AM_DEFAULT_V
+AM_V
 am__untar
 am__tar
 AMTAR
@ -777,6 +781,7 @@ SHELL'
 ac_subst_files=''
 ac_user_opts='
 enable_option_checking
+enable_silent_rules
 enable_dependency_tracking
 enable_shared
 enable_static
@ -1429,6 +1434,8 @@ Optional Features:
  --disable-option-checking  ignore unrecognized --enable/--with options
  --disable-FEATURE       do not include FEATURE (same as --enable-FEATURE=no)
  --enable-FEATURE[=ARG]  include FEATURE [ARG=yes]
+  --enable-silent-rules   less verbose build output (undo: "make V=1")
+  --disable-silent-rules  verbose build output (undo: "make V=0")
  --enable-dependency-tracking
                          do not reject slow dependency extractors
  --disable-dependency-tracking
@ -2608,7 +2615,7 @@ ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $
 ac_compiler_gnu=$ac_cv_c_compiler_gnu


-am__api_version='1.12'
+am__api_version='1.14'

 ac_aux_dir=
 for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do
@ -2821,8 +2828,8 @@ if test x"${MISSING+set}" != xset; then
  esac
 fi
 # Use eval to expand $SHELL
-if eval "$MISSING --run true"; then
-  am_missing_run="$MISSING --run "
+if eval "$MISSING --is-lightweight"; then
+  am_missing_run="$MISSING "
 else
  am_missing_run=
  { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: 'missing' script is too old or missing" >&5
@ -3062,6 +3069,45 @@ else
 fi
 rmdir .tst 2>/dev/null

+# Check whether --enable-silent-rules was given.
+if test "${enable_silent_rules+set}" = set; then :
+  enableval=$enable_silent_rules;
+fi
+
+case $enable_silent_rules in # (((
+  yes) AM_DEFAULT_VERBOSITY=0;;
+   no) AM_DEFAULT_VERBOSITY=1;;
+    *) AM_DEFAULT_VERBOSITY=1;;
+esac
+am_make=${MAKE-make}
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $am_make supports nested variables" >&5
+$as_echo_n "checking whether $am_make supports nested variables... " >&6; }
+if ${am_cv_make_support_nested_variables+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if $as_echo 'TRUE=$(BAR$(V))
+BAR0=false
+BAR1=true
+V=1
+am__doit:
+	@$(TRUE)
+.PHONY: am__doit' | $am_make -f - >/dev/null 2>&1; then
+  am_cv_make_support_nested_variables=yes
+else
+  am_cv_make_support_nested_variables=no
+fi
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_make_support_nested_variables" >&5
+$as_echo "$am_cv_make_support_nested_variables" >&6; }
+if test $am_cv_make_support_nested_variables = yes; then
+    AM_V='$(V)'
+  AM_DEFAULT_V='$(AM_DEFAULT_VERBOSITY)'
+else
+  AM_V=$AM_DEFAULT_VERBOSITY
+  AM_DEFAULT_V=$AM_DEFAULT_VERBOSITY
+fi
+AM_BACKSLASH='\'
+
 if test "`cd $srcdir && pwd`" != "`pwd`"; then
  # Use -I$(srcdir) only when $(srcdir) != ., so that make's output
  # is not polluted with repeated "-I."
@ -3125,6 +3171,10 @@ mkdir_p='$(MKDIR_P)'
 # in the wild :-(  We should find a proper way to deprecate it ...
 AMTAR='$${TAR-tar}'

+
+# We'll loop over all known methods to create a tar archive until one works.
+_am_tools='gnutar  pax cpio none'
+
 am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'


@ -3132,6 +3182,48 @@ am__tar='$${TAR-tar} chof - "$$tardir"' am__untar='$${TAR-tar} xf -'



+# POSIX will say in a future version that running "rm -f" with no argument
+# is OK; and we want to be able to make that assumption in our Makefile
+# recipes.  So use an aggressive probe to check that the usage we want is
+# actually supported "in the wild" to an acceptable degree.
+# See automake bug#10828.
+# To make any issue more visible, cause the running configure to be aborted
+# by default if the 'rm' program in use doesn't match our expectations; the
+# user can still override this though.
+if rm -f && rm -fr && rm -rf; then : OK; else
+  cat >&2 <<'END'
+Oops!
+
+Your 'rm' program seems unable to run without file operands specified
+on the command line, even when the '-f' option is present.  This is contrary
+to the behaviour of most rm programs out there, and not conforming with
+the upcoming POSIX standard: <http://austingroupbugs.net/view.php?id=542>
+
+Please tell bug-automake@gnu.org about your system, including the value
+of your $PATH and any error possibly output before this message.  This
+can help us improve future automake versions.
+
+END
+  if test x"$ACCEPT_INFERIOR_RM_PROGRAM" = x"yes"; then
+    echo 'Configuration will proceed anyway, since you have set the' >&2
+    echo 'ACCEPT_INFERIOR_RM_PROGRAM variable to "yes"' >&2
+    echo >&2
+  else
+    cat >&2 <<'END'
+Aborting the configuration process, to ensure you take notice of the issue.
+
+You can download and install GNU coreutils to get an 'rm' implementation
+that behaves properly: <http://www.gnu.org/software/coreutils/>.
+
+If you want to complete the configuration process using your problematic
+'rm' anyway, export the environment variable ACCEPT_INFERIOR_RM_PROGRAM
+to "yes", and re-run configure.
+
+END
+    as_fn_error $? "Your 'rm' program is bad, sorry." "$LINENO" 5
+  fi
+fi
+


 # AC_CONFIG_SRCDIR([include/common.h])
@ -4448,6 +4540,65 @@ ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
 ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
 ac_compiler_gnu=$ac_cv_c_compiler_gnu

+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC understands -c and -o together" >&5
+$as_echo_n "checking whether $CC understands -c and -o together... " >&6; }
+if ${am_cv_prog_cc_c_o+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+int
+main ()
+{
+
+  ;
+  return 0;
+}
+_ACEOF
+  # Make sure it works both with $CC and with simple cc.
+  # Following AC_PROG_CC_C_O, we do the test twice because some
+  # compilers refuse to overwrite an existing .o file with -o,
+  # though they will create one.
+  am_cv_prog_cc_c_o=yes
+  for am_i in 1 2; do
+    if { echo "$as_me:$LINENO: $CC -c conftest.$ac_ext -o conftest2.$ac_objext" >&5
+   ($CC -c conftest.$ac_ext -o conftest2.$ac_objext) >&5 2>&5
+   ac_status=$?
+   echo "$as_me:$LINENO: \$? = $ac_status" >&5
+   (exit $ac_status); } \
+         && test -f conftest2.$ac_objext; then
+      : OK
+    else
+      am_cv_prog_cc_c_o=no
+      break
+    fi
+  done
+  rm -f core conftest*
+  unset am_i
+fi
+{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $am_cv_prog_cc_c_o" >&5
+$as_echo "$am_cv_prog_cc_c_o" >&6; }
+if test "$am_cv_prog_cc_c_o" != yes; then
+   # Losing compiler, so override with the script.
+   # FIXME: It is wrong to rewrite CC.
+   # But if we don't then we get into trouble of one sort or another.
+   # A longer-term fix would be to have automake use am__CC in this case,
+   # and then we could set am__CC="\$(top_srcdir)/compile \$(CC)"
+   CC="$am_aux_dir/compile $CC"
+fi
+ac_ext=c
+ac_cpp='$CPP $CPPFLAGS'
+ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5'
+ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5'
+ac_compiler_gnu=$ac_cv_c_compiler_gnu
+
+
 depcc="$CC"   am_compiler_list=

 { $as_echo "$as_me:${as_lineno-$LINENO}: checking dependency style of $depcc" >&5
@ -15533,9 +15684,6 @@ else
 fi


-$as_echo "#define FLOAT_ABI \$float_abi" >>confdefs.h
-
-
 # Check whether --enable-jni was given.
 if test "${enable_jni+set}" = set; then :
  enableval=$enable_jni; have_jni=$enableval
@ -15747,7 +15895,7 @@ else
 JAVA_TEST=Test.java
 CLASS_TEST=Test.class
 cat << \EOF > $JAVA_TEST
-/* #line 15750 "configure" */
+/* #line 15898 "configure" */
 public class Test {
 }
 EOF
@ -18377,7 +18525,7 @@ $as_echo "$as_me: executing $ac_file commands" >&6;}

  case $ac_file$ac_mode in
    "depfiles":C) test x"$AMDEP_TRUE" != x"" || {
-  # Autoconf 2.62 quotes --file arguments for eval, but not when files
+  # Older Autoconf quotes --file arguments for eval, but not when files
  # are listed without --file.  Let's play safe and only enable the eval
  # if we detect the quoting.
  case $CONFIG_FILES in
@ -18428,7 +18576,7 @@ $as_echo X"$mf" |
    DEPDIR=`sed -n 's/^DEPDIR = //p' < "$mf"`
    test -z "$DEPDIR" && continue
    am__include=`sed -n 's/^am__include = //p' < "$mf"`
-    test -z "am__include" && continue
+    test -z "$am__include" && continue
    am__quote=`sed -n 's/^am__quote = //p' < "$mf"`
    # Find all dependency output files, they are included files with
    # $(DEPDIR) in their names.  We invoke sed twice because it is the
--- a/lib/ffts/include/ffts.h
+++ b/lib/ffts/include/ffts.h
@ -1,7 +1,7 @@
 /*
- 
+
 This file is part of FFTS.
-  
+
 Copyright (c) 2012, Anthony M. Blake
 All rights reserved.

@ -29,40 +29,82 @@

 */

-#ifndef __FFTS_H__
-#define __FFTS_H__
+#ifndef FFTS_H
+#define FFTS_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif

-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <stdint.h>
 #include <stddef.h>

 #ifdef __cplusplus
-extern "C"
-{
-#endif /* __cplusplus */
+extern "C" {
+#endif
+
+#if (defined(_WIN32) || defined(WIN32)) && defined(FFTS_SHARED)
+#  ifdef FFTS_BUILD
+#    define FFTS_API __declspec(dllexport)
+#  else
+#    define FFTS_API __declspec(dllimport)
+#  endif
+#else
+#  if (__GNUC__ >= 4) || defined(HAVE_GCC_VISIBILITY)
+#    define FFTS_API __attribute__ ((visibility("default")))
+#  else
+#    define FFTS_API
+#  endif
+#endif
+
+/* The direction of the transform
+   (i.e., the sign of the exponent in the transform.)
+*/
+#define FFTS_FORWARD (-1)
+#define FFTS_BACKWARD (+1)

 struct _ffts_plan_t;
 typedef struct _ffts_plan_t ffts_plan_t;

-ffts_plan_t *ffts_init_1d(size_t N, int sign);
-ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign);
-ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign);
+/* Complex data is stored in the interleaved format
+   (i.e., the real and imaginary parts composing each
+   element of complex data are stored adjacently in memory)
+
+   The multi-dimensional arrays passed are expected to be
+   stored as a single contiguous block in row-major order
+*/
+FFTS_API ffts_plan_t*
+ffts_init_1d(size_t N, int sign);

-// For real transforms, sign == -1 implies a real-to-complex forwards tranform,
-// and sign == 1 implies a complex-to-real backwards transform
-// The output of a real-to-complex transform is N/2+1 complex numbers, where the
-// redundant outputs have been omitted.
-ffts_plan_t *ffts_init_1d_real(size_t N, int sign);
-ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign);
-ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign);
+FFTS_API ffts_plan_t*
+ffts_init_2d(size_t N1, size_t N2, int sign);

-void ffts_execute(ffts_plan_t * , const void *input, void *output);
-void ffts_free(ffts_plan_t *);
+FFTS_API ffts_plan_t*
+ffts_init_nd(int rank, size_t *Ns, int sign);

-#ifdef __cplusplus
-}  /* extern "C" */
-#endif /* __cplusplus */
+/* For real transforms, sign == FFTS_FORWARD implies a real-to-complex
+   forwards transform, and sign == FFTS_BACKWARD implies a complex-to-real
+   backwards transform.
+
+   The output of a real-to-complex transform is N/2+1 complex numbers,
+   where the redundant outputs have been omitted.
+*/
+FFTS_API ffts_plan_t*
+ffts_init_1d_real(size_t N, int sign);
+
+FFTS_API ffts_plan_t*
+ffts_init_2d_real(size_t N1, size_t N2, int sign);

+FFTS_API ffts_plan_t*
+ffts_init_nd_real(int rank, size_t *Ns, int sign);
+
+FFTS_API void
+ffts_execute(ffts_plan_t *p, const void *input, void *output);
+
+FFTS_API void
+ffts_free(ffts_plan_t *p);
+
+#ifdef __cplusplus
+}
 #endif
+
+#endif /* FFTS_H */
--- a/lib/ffts/java/Makefile.in
+++ b/lib/ffts/java/Makefile.in
@ -1,7 +1,7 @@
-# Makefile.in generated by automake 1.12.4 from Makefile.am.
+# Makefile.in generated by automake 1.14 from Makefile.am.
 # @configure_input@

-# Copyright (C) 1994-2012 Free Software Foundation, Inc.
+# Copyright (C) 1994-2013 Free Software Foundation, Inc.

 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -20,23 +20,51 @@


 VPATH = @srcdir@
-am__make_dryrun = \
-  { \
-    am__dry=no; \
+am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
    case $$MAKEFLAGS in \
      *\\[\ \	]*) \
-        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
-          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
-      *) \
-        for am__flg in $$MAKEFLAGS; do \
-          case $$am__flg in \
-            *=*|--*) ;; \
-            *n*) am__dry=yes; break;; \
-          esac; \
-        done;; \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
    esac; \
-    test $$am__dry = yes; \
-  }
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
 pkgdatadir = $(datadir)/@PACKAGE@
 pkgincludedir = $(includedir)/@PACKAGE@
 pkglibdir = $(libdir)/@PACKAGE@
@ -56,7 +84,7 @@ POST_UNINSTALL = :
 build_triplet = @build@
 host_triplet = @host@
 subdir = java
-DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/depcomp
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_classpath.m4 \
@ -109,23 +137,49 @@ am__libffts_jni_la_SOURCES_DIST = jni/ffts_jni.c
@ENABLE_JNI_TRUE@am_libffts_jni_la_OBJECTS =  \
@ENABLE_JNI_TRUE@	libffts_jni_la-ffts_jni.lo
 libffts_jni_la_OBJECTS = $(am_libffts_jni_la_OBJECTS)
-libffts_jni_la_LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) \
-	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(libffts_jni_la_CFLAGS) \
-	$(CFLAGS) $(libffts_jni_la_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+libffts_jni_la_LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC \
+	$(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=link $(CCLD) \
+	$(libffts_jni_la_CFLAGS) $(CFLAGS) $(libffts_jni_la_LDFLAGS) \
+	$(LDFLAGS) -o $@
@ENABLE_JNI_TRUE@am_libffts_jni_la_rpath = -rpath $(libdir)
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
 DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
 depcomp = $(SHELL) $(top_srcdir)/depcomp
 am__depfiles_maybe = depfiles
 am__mv = mv -f
 COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
 	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
-	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
-	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 = 
 CCLD = $(CC)
-LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
-	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
-	$(LDFLAGS) -o $@
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 = 
 SOURCES = $(libffts_jni_la_SOURCES)
 DIST_SOURCES = $(am__libffts_jni_la_SOURCES_DIST)
 am__can_run_installinfo = \
@ -135,11 +189,29 @@ am__can_run_installinfo = \
  esac
 DATA = $(pkgdata_DATA)
 HEADERS = $(nodist_include_HEADERS)
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
 ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
@ -313,6 +385,7 @@ $(top_srcdir)/configure:  $(am__configure_deps)
 $(ACLOCAL_M4):  $(am__aclocal_m4_deps)
 	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
 $(am__aclocal_m4_deps):
+
 install-libLTLIBRARIES: $(lib_LTLIBRARIES)
 	@$(NORMAL_INSTALL)
 	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
@ -347,8 +420,9 @@ clean-libLTLIBRARIES:
 	  echo rm -f $${locs}; \
 	  rm -f $${locs}; \
 	}
+
 libffts_jni.la: $(libffts_jni_la_OBJECTS) $(libffts_jni_la_DEPENDENCIES) $(EXTRA_libffts_jni_la_DEPENDENCIES) 
-	$(libffts_jni_la_LINK) $(am_libffts_jni_la_rpath) $(libffts_jni_la_OBJECTS) $(libffts_jni_la_LIBADD) $(LIBS)
+	$(AM_V_CCLD)$(libffts_jni_la_LINK) $(am_libffts_jni_la_rpath) $(libffts_jni_la_OBJECTS) $(libffts_jni_la_LIBADD) $(LIBS)

 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
@ -359,32 +433,32 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/libffts_jni_la-ffts_jni.Plo@am__quote@

 .c.o:
-@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<

 .c.obj:
-@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`

 .c.lo:
-@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<

 libffts_jni_la-ffts_jni.lo: jni/ffts_jni.c
-@am__fastdepCC_TRUE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libffts_jni_la_CFLAGS) $(CFLAGS) -MT libffts_jni_la-ffts_jni.lo -MD -MP -MF $(DEPDIR)/libffts_jni_la-ffts_jni.Tpo -c -o libffts_jni_la-ffts_jni.lo `test -f 'jni/ffts_jni.c' || echo '$(srcdir)/'`jni/ffts_jni.c
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/libffts_jni_la-ffts_jni.Tpo $(DEPDIR)/libffts_jni_la-ffts_jni.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='jni/ffts_jni.c' object='libffts_jni_la-ffts_jni.lo' libtool=yes @AMDEPBACKSLASH@
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libffts_jni_la_CFLAGS) $(CFLAGS) -MT libffts_jni_la-ffts_jni.lo -MD -MP -MF $(DEPDIR)/libffts_jni_la-ffts_jni.Tpo -c -o libffts_jni_la-ffts_jni.lo `test -f 'jni/ffts_jni.c' || echo '$(srcdir)/'`jni/ffts_jni.c
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/libffts_jni_la-ffts_jni.Tpo $(DEPDIR)/libffts_jni_la-ffts_jni.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='jni/ffts_jni.c' object='libffts_jni_la-ffts_jni.lo' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libffts_jni_la_CFLAGS) $(CFLAGS) -c -o libffts_jni_la-ffts_jni.lo `test -f 'jni/ffts_jni.c' || echo '$(srcdir)/'`jni/ffts_jni.c
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libffts_jni_la_CFLAGS) $(CFLAGS) -c -o libffts_jni_la-ffts_jni.lo `test -f 'jni/ffts_jni.c' || echo '$(srcdir)/'`jni/ffts_jni.c

 mostlyclean-libtool:
 	-rm -f *.lo
@ -434,26 +508,15 @@ uninstall-nodist_includeHEADERS:
 	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
 	dir='$(DESTDIR)$(includedir)'; $(am__uninstall_files_from_dir)

-ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
-	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
-	mkid -fID $$unique
-tags: TAGS
-
-TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
-		$(TAGS_FILES) $(LISP)
+ID: $(am__tagged_files)
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
 	set x; \
 	here=`pwd`; \
-	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	$(am__define_uniq_tagged_files); \
 	shift; \
 	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
 	  test -n "$$unique" || unique=$$empty_fix; \
@ -465,15 +528,11 @@ TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
 	      $$unique; \
 	  fi; \
 	fi
-ctags: CTAGS
-CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
-		$(TAGS_FILES) $(LISP)
-	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	$(am__define_uniq_tagged_files); \
 	test -z "$(CTAGS_ARGS)$$unique" \
 	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
 	     $$unique
@ -482,9 +541,10 @@ GTAGS:
 	here=`$(am__cd) $(top_builddir) && pwd` \
 	  && $(am__cd) $(top_srcdir) \
 	  && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am

-cscopelist:  $(HEADERS) $(SOURCES) $(LISP)
-	list='$(SOURCES) $(HEADERS) $(LISP)'; \
+cscopelist-am: $(am__tagged_files)
+	list='$(am__tagged_files)'; \
 	case "$(srcdir)" in \
 	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
 	  *) sdir=$(subdir)/$(srcdir) ;; \
@ -646,9 +706,9 @@ uninstall-am: uninstall-libLTLIBRARIES uninstall-nodist_includeHEADERS \

 .MAKE: all check install install-am install-strip

-.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
-	clean-libLTLIBRARIES clean-libtool clean-local cscopelist \
-	ctags distclean distclean-compile distclean-generic \
+.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
+	clean-libLTLIBRARIES clean-libtool clean-local cscopelist-am \
+	ctags ctags-am distclean distclean-compile distclean-generic \
 	distclean-libtool distclean-tags distdir dvi dvi-am html \
 	html-am info info-am install install-am install-data \
 	install-data-am install-dvi install-dvi-am install-exec \
@ -659,7 +719,7 @@ uninstall-am: uninstall-libLTLIBRARIES uninstall-nodist_includeHEADERS \
 	installcheck installcheck-am installdirs maintainer-clean \
 	maintainer-clean-generic mostlyclean mostlyclean-compile \
 	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
-	tags uninstall uninstall-am uninstall-libLTLIBRARIES \
+	tags tags-am uninstall uninstall-am uninstall-libLTLIBRARIES \
 	uninstall-nodist_includeHEADERS uninstall-pkgdataDATA


--- a/lib/ffts/java/android/.classpath
+++ b/lib/ffts/java/android/.classpath
@ -0,0 +1,9 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="gen"/>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="com.android.ide.eclipse.adt.ANDROID_FRAMEWORK"/>
+	<classpathentry exported="true" kind="con" path="com.android.ide.eclipse.adt.LIBRARIES"/>
+	<classpathentry exported="true" kind="con" path="com.android.ide.eclipse.adt.DEPENDENCIES"/>
+	<classpathentry kind="output" path="bin/classes"/>
+</classpath>
--- a/lib/ffts/java/android/.project
+++ b/lib/ffts/java/android/.project
@ -0,0 +1,40 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>ffts-android</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>com.android.ide.eclipse.adt.ResourceManagerBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>com.android.ide.eclipse.adt.PreCompilerBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>com.android.ide.eclipse.adt.ApkBuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>com.android.ide.eclipse.adt.AndroidNature</nature>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+	<linkedResources>
+		<link>
+			<name>src</name>
+			<type>2</type>
+			<locationURI>PARENT-1-PROJECT_LOC/src</locationURI>
+		</link>
+	</linkedResources>
+</projectDescription>
--- a/lib/ffts/java/android/.settings/org.eclipse.jdt.core.prefs
+++ b/lib/ffts/java/android/.settings/org.eclipse.jdt.core.prefs
@ -0,0 +1,4 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.source=1.6
--- a/lib/ffts/java/android/.settings/org.eclipse.ltk.core.refactoring.prefs
+++ b/lib/ffts/java/android/.settings/org.eclipse.ltk.core.refactoring.prefs
@ -0,0 +1,2 @@
+eclipse.preferences.version=1
+org.eclipse.ltk.core.refactoring.enable.project.refactoring.history=false
--- a/lib/ffts/java/android/AndroidManifest.xml
+++ b/lib/ffts/java/android/AndroidManifest.xml
@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="utf-8"?>
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+      package="nz.waikato.ffts"
+      android:versionCode="1"
+      android:versionName="1.0">
+  <uses-sdk android:minSdkVersion="8" />
+</manifest> 
--- a/lib/ffts/java/android/ant.properties
+++ b/lib/ffts/java/android/ant.properties
@ -0,0 +1,18 @@
+# This file is used to override default values used by the Ant build system.
+#
+# This file must be checked into Version Control Systems, as it is
+# integral to the build system of your project.
+
+# This file is only used by the Ant script.
+
+# You can use this to override default values such as
+#  'source.dir' for the location of your java source folder and
+#  'out.dir' for the location of your output folder.
+source.dir=../src
+
+# You can also use it to define how the release builds are signed by declaring
+# the following properties:
+#  'key.store' for the location of your keystore and
+#  'key.alias' for the name of the key to use.
+# The password will be asked during the build when you use the 'release' target.
+
--- a/lib/ffts/java/android/build.xml
+++ b/lib/ffts/java/android/build.xml
@ -0,0 +1,92 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project name="ffts" default="help">
+
+    <!-- The local.properties file is created and updated by the 'android' tool.
+         It contains the path to the SDK. It should *NOT* be checked into
+         Version Control Systems. -->
+    <property file="local.properties" />
+
+    <!-- The ant.properties file can be created by you. It is only edited by the
+         'android' tool to add properties to it.
+         This is the place to change some Ant specific build properties.
+         Here are some properties you may want to change/update:
+
+         source.dir
+             The name of the source directory. Default is 'src'.
+         out.dir
+             The name of the output directory. Default is 'bin'.
+
+         For other overridable properties, look at the beginning of the rules
+         files in the SDK, at tools/ant/build.xml
+
+         Properties related to the SDK location or the project target should
+         be updated using the 'android' tool with the 'update' action.
+
+         This file is an integral part of the build system for your
+         application and should be checked into Version Control Systems.
+
+         -->
+    <property file="ant.properties" />
+
+    <!-- if sdk.dir was not set from one of the property files, then
+         get it from the ANDROID_HOME env var.
+         This must be done before we load project.properties since
+         the proguard config can use sdk.dir -->
+    <property environment="env" />
+    <condition property="sdk.dir" value="${env.ANDROID_HOME}">
+        <isset property="env.ANDROID_HOME" />
+    </condition>
+
+    <!-- The project.properties file is created and updated by the 'android'
+         tool, as well as ADT.
+
+         This contains project specific properties such as project target, and library
+         dependencies. Lower level build properties are stored in ant.properties
+         (or in .classpath for Eclipse projects).
+
+         This file is an integral part of the build system for your
+         application and should be checked into Version Control Systems. -->
+    <loadproperties srcFile="project.properties" />
+
+    <!-- quick check on sdk.dir -->
+    <fail
+            message="sdk.dir is missing. Make sure to generate local.properties using 'android update project' or to inject it through the ANDROID_HOME environment variable."
+            unless="sdk.dir"
+    />
+
+    <!--
+        Import per project custom build rules if present at the root of the project.
+        This is the place to put custom intermediary targets such as:
+            -pre-build
+            -pre-compile
+            -post-compile (This is typically used for code obfuscation.
+                           Compiled code location: ${out.classes.absolute.dir}
+                           If this is not done in place, override ${out.dex.input.absolute.dir})
+            -post-package
+            -post-build
+            -pre-clean
+    -->
+    <import file="custom_rules.xml" optional="true" />
+
+    <!-- Import the actual build file.
+
+         To customize existing targets, there are two options:
+         - Customize only one target:
+             - copy/paste the target into this file, *before* the
+               <import> task.
+             - customize it to your needs.
+         - Customize the whole content of build.xml
+             - copy/paste the content of the rules files (minus the top node)
+               into this file, replacing the <import> task.
+             - customize to your needs.
+
+         ***********************
+         ****** IMPORTANT ******
+         ***********************
+         In all cases you must update the value of version-tag below to read 'custom' instead of an integer,
+         in order to avoid having your file be overridden by tools such as "android update project"
+    -->
+    <!-- version-tag: 1 -->
+    <import file="${sdk.dir}/tools/ant/build.xml" />
+
+</project>
--- a/lib/ffts/java/android/jni/Android.mk
+++ b/lib/ffts/java/android/jni/Android.mk
@ -0,0 +1,25 @@
+LOCAL_PATH := $(call my-dir)
+
+TOP=../../..
+
+# Include the shared library
+#include $(CLEAR_VARS)
+#LOCAL_MODULE := ffts
+#LOCAL_SRC_FILES :=  ../../../src/.libs/libffts.so
+#include $(PREBUILT_SHARED_LIBRARY)
+
+# Include the static library in shared lib
+include $(CLEAR_VARS)
+LOCAL_MODULE := ffts
+LOCAL_SRC_FILES := $(TOP)/java/android/bin/lib/libffts.a
+LOCAL_EXPORT_C_INCLUDES := $(TOP)/include
+include $(PREBUILT_STATIC_LIBRARY)
+
+include $(CLEAR_VARS)
+LOCAL_MODULE := ffts_jni
+LOCAL_CFLAGS := -I$(TOP)/include -I$(TOP)/java/jni -I$(TOP) -Wno-pointer-to-int-cast -Wno-int-to-pointer-cast
+LOCAL_SRC_FILES := $(TOP)/java/jni/ffts_jni.c
+LOCAL_LDLIBS := -L$(SYSROOT)/usr/lib -llog 
+LOCAL_STATIC_LIBRARIES := ffts
+
+include $(BUILD_SHARED_LIBRARY)
--- a/lib/ffts/java/android/jni/Application.mk
+++ b/lib/ffts/java/android/jni/Application.mk
@ -0,0 +1,2 @@
+# requires NEON at the moment
+APP_ABI := armeabi-v7a
--- a/lib/ffts/java/android/proguard-project.txt
+++ b/lib/ffts/java/android/proguard-project.txt
@ -0,0 +1,20 @@
+# To enable ProGuard in your project, edit project.properties
+# to define the proguard.config property as described in that file.
+#
+# Add project specific ProGuard rules here.
+# By default, the flags in this file are appended to flags specified
+# in ${sdk.dir}/tools/proguard/proguard-android.txt
+# You can edit the include path and order by changing the ProGuard
+# include property in project.properties.
+#
+# For more details, see
+#   http://developer.android.com/guide/developing/tools/proguard.html
+
+# Add any project specific keep options here:
+
+# If your project uses WebView with JS, uncomment the following
+# and specify the fully qualified class name to the JavaScript interface
+# class:
+#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
+#   public *;
+#}
--- a/lib/ffts/java/android/project.properties
+++ b/lib/ffts/java/android/project.properties
@ -0,0 +1,15 @@
+# This file is automatically generated by Android Tools.
+# Do not modify this file -- YOUR CHANGES WILL BE ERASED!
+#
+# This file must be checked in Version Control Systems.
+#
+# To customize properties used by the Ant build system edit
+# "ant.properties", and override values to adapt the script to your
+# project structure.
+#
+# To enable ProGuard to shrink and obfuscate your code, uncomment this (available properties: sdk.dir, user.home):
+#proguard.config=${sdk.dir}/tools/proguard/proguard-android.txt:proguard-project.txt
+
+android.library=true
+# Project target.
+target=android-10
--- a/lib/ffts/java/jni/ffts_jni.c
+++ b/lib/ffts/java/jni/ffts_jni.c
@ -38,6 +38,8 @@
 // the classes ... but we can't build the project without the jni.
 #ifdef ANDROID
 #include <jni.h>
+#define NEEDS_ALIGNED
+#undef HAVE_DECL_POSIX_MEMALIGN
 #else
 #include "nz_ac_waikato_ffts_FFTS.h"
 #endif
@ -231,3 +233,5 @@ JNIEXPORT void JNICALL Java_nz_ac_waikato_ffts_FFTS_free

 	ffts_free(plan);
 }
+
+// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
--- a/lib/ffts/java/src/nz/ac/waikato/ffts/FFTS.java
+++ b/lib/ffts/java/src/nz/ac/waikato/ffts/FFTS.java
@ -0,0 +1,203 @@
+/*
+ *  This file is part of FFTS -- The Fastest Fourier Transform in the South
+ *
+ * Copyright (c) 2013, Michael Zucchi <notzed@gmail.com>
+ *
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in the
+ *     documentation and/or other materials provided with the distribution.
+ *   * Neither the name of the organization nor the
+ *     names of its contributors may be used to endorse or promote products
+ *     derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+package nz.ac.waikato.ffts;
+
+import java.nio.FloatBuffer;
+
+/**
+ * A java wrapper for ffts plans.
+ *
+ * A plan owns a native (C) resource and must currently be freed explicitly
+ * with {@link #free()}.  After a plan has been freed it can no longer be
+ * executed or freed again; such attempts throw NullPointerException.
+ *
+ * All sizes are counted in floats, so a complex element counts as two.
+ *
+ * @author notzed
+ */
+public class FFTS {
+
+	/**
+	 * C pointer to the native plan, or 0 once the plan has been freed.
+	 */
+	private long p;
+	/**
+	 * Minimum size of input
+	 */
+	final protected long inSize;
+	/**
+	 * Minimum size of output
+	 */
+	final protected long outSize;
+
+	/**
+	 * Wrap a native plan whose input and output have the same size.
+	 */
+	private FFTS(long p, long inSize) {
+		this(p, inSize, inSize);
+	}
+
+	/**
+	 * Wrap a native plan with distinct minimum input and output sizes.
+	 */
+	private FFTS(long p, long inSize, long outSize) {
+		this.p = p;
+		this.inSize = inSize;
+		// Must record the output size separately: real transforms have
+		// differently sized input and output, and execute() relies on
+		// outSize to validate the destination before calling native code.
+		this.outSize = outSize;
+	}
+	/**
+	 * The sign to use for a forward transform.
+	 */
+	public static final int FORWARD = -1;
+	/**
+	 * The sign to use for a backward transform.
+	 */
+	public static final int BACKWARD = 1;
+
+	/**
+	 * Create a FFT plan for a 1-dimensional complex transform.
+	 *
+	 * The src and dst parameters to execute() use complex data.
+	 *
+	 * @param sign The direction of the transform.
+	 * @param N The size of the transform.
+	 * @return A new plan requiring 2*N floats of input and of output.
+	 */
+	public static FFTS complex(int sign, int N) {
+		return new FFTS(complex_1d(N, sign), 2L * N);
+	}
+
+	/**
+	 * Create a FFT plan for a 2-dimensional complex transform.
+	 * @param sign The direction of the transform.
+	 * @param N1 The size of the transform.
+	 * @param N2 The size of the transform.
+	 * @return A new plan requiring 2*N1*N2 floats of input and of output.
+	 */
+	public static FFTS complex(int sign, int N1, int N2) {
+		// Widen before multiplying so large dimensions cannot wrap around.
+		return new FFTS(complex_2d(N1, N2, sign), 2L * N1 * N2);
+	}
+
+	/**
+	 * Create a FFT plan for an n-dimensional complex transform.
+	 * @param sign The direction of the transform.
+	 * @param Ns The size of each dimension; must not be empty.
+	 * @return A new plan requiring 2*product(Ns) floats of input and of output.
+	 */
+	public static FFTS complex(int sign, int... Ns) {
+		// Validate/measure the dimensions first so a bad array cannot
+		// leave behind a native plan that nobody holds a reference to.
+		long count = size(Ns);
+		return new FFTS(complex_nd(Ns, sign), count * 2);
+	}
+
+	/**
+	 * Create a FFT plan for a 1-dimensional real transform.
+	 *
+	 * The real side uses N floats and the complex side (N/2+1)*2 floats;
+	 * which one is the input depends on the direction.
+	 *
+	 * @param sign The direction of the transform.
+	 * @param N The size of the transform.
+	 * @return A new plan.
+	 */
+	public static FFTS real(int sign, int N) {
+		long realSize = N;
+		long complexSize = (realSize / 2 + 1) * 2;
+		return sign == FORWARD
+			? new FFTS(real_1d(N, sign), realSize, complexSize)
+			: new FFTS(real_1d(N, sign), complexSize, realSize);
+	}
+
+	/**
+	 * Create a FFT plan for a 2-dimensional real transform.
+	 * @param sign The direction of the transform.
+	 * @param N1 The size of the transform.
+	 * @param N2 The size of the transform.
+	 * @return A new plan.
+	 */
+	public static FFTS real(int sign, int N1, int N2) {
+		long realSize = (long) N1 * N2;
+		long complexSize = (realSize / 2 + 1) * 2;
+		return sign == FORWARD
+			? new FFTS(real_2d(N1, N2, sign), realSize, complexSize)
+			: new FFTS(real_2d(N1, N2, sign), complexSize, realSize);
+	}
+
+	/**
+	 * Create a FFT plan for an n-dimensional real transform.
+	 * @param sign The direction of the transform.
+	 * @param Ns The size of each dimension; must not be empty.
+	 * @return A new plan.
+	 */
+	public static FFTS real(int sign, int... Ns) {
+		long realSize = size(Ns);
+		long complexSize = (realSize / 2 + 1) * 2;
+		return sign == FORWARD
+			? new FFTS(real_nd(Ns, sign), realSize, complexSize)
+			: new FFTS(real_nd(Ns, sign), complexSize, realSize);
+	}
+
+	/**
+	 * Execute this plan with the given array data.
+	 *
+	 * @param src Source data, at least inSize floats.
+	 * @param dst Destination data, at least outSize floats.
+	 */
+	public void execute(float[] src, float[] dst) {
+		execute(src, 0, dst, 0);
+	}
+
+	/**
+	 * Execute this plan with the given array data.
+	 * @param src Source data.
+	 * @param soff Start offset into src array.
+	 * @param dst Destination data.
+	 * @param doff Start offset into dst array.
+	 * @throws ArrayIndexOutOfBoundsException if an offset is negative or
+	 * fewer than inSize/outSize floats remain after the offset.
+	 * @throws NullPointerException if the plan has been freed.
+	 */
+	public void execute(float[] src, int soff, float[] dst, int doff) {
+		// Negative offsets would otherwise pass the remaining-length test
+		// and reach native code unchecked.
+		if (soff < 0 || doff < 0)
+			throw new ArrayIndexOutOfBoundsException();
+		if (src.length - soff < inSize || dst.length - doff < outSize)
+			throw new ArrayIndexOutOfBoundsException();
+		if (p == 0)
+			throw new NullPointerException();
+
+		execute(p, inSize, src, soff, dst, doff);
+	}
+
+	/**
+	 * Execute this plan with the given nio buffers.  The buffers
+	 * must be derived from direct buffers.
+	 *
+	 * The buffer position and limits are ignored.
+	 *
+	 * @param src Source buffer with a capacity of at least inSize floats.
+	 * @param dst Destination buffer with a capacity of at least outSize floats.
+	 * @throws ArrayIndexOutOfBoundsException if either buffer is too small.
+	 * @throws NullPointerException if the plan has been freed.
+	 */
+	public void execute(FloatBuffer src, FloatBuffer dst) {
+		if (src.capacity() < inSize || dst.capacity() < outSize)
+			throw new ArrayIndexOutOfBoundsException();
+		if (p == 0)
+			throw new NullPointerException();
+
+		execute(p, inSize, src, dst);
+	}
+
+	/**
+	 * Free the plan.
+	 *
+	 * @throws NullPointerException if the plan has already been freed.
+	 */
+	public void free() {
+		if (p == 0)
+			throw new NullPointerException();
+		// Clear the handle before releasing it so that any later
+		// execute()/free() fails cleanly instead of passing a dangling
+		// pointer to native code.
+		long plan = p;
+		p = 0;
+		free(plan);
+	}
+
+	/*
+	 * Calculate the number of elements required to store one
+	 * set of n-dimensional data.  Ns must contain at least one entry.
+	 */
+	protected static long size(int[] Ns) {
+		long s = Ns[0];
+		for (int i = 1; i < Ns.length; i++)
+			s *= Ns[i];
+		return s;
+	}
+
+	static {
+		System.loadLibrary("ffts_jni");
+	}
+
+	/*
+	 * Native interface
+	 */
+	protected static native long complex_1d(int N, int sign);
+
+	protected static native long complex_2d(int N1, int N2, int sign);
+
+	protected static native long complex_nd(int[] Ns, int sign);
+
+	protected static native long real_1d(int N, int sign);
+
+	protected static native long real_2d(int N1, int N2, int sign);
+
+	protected static native long real_nd(int[] Ns, int sign);
+
+	protected static native void execute(long p, long size, float[] src, int soff, float[] dst, int doff);
+
+	protected static native void execute(long p, long size, FloatBuffer src, FloatBuffer dst);
+
+	protected static native void free(long p);
+}
--- a/lib/ffts/m4/ax_check_class.m4
+++ b/lib/ffts/m4/ax_check_class.m4
@ -0,0 +1,144 @@
+# ===========================================================================
+#      http://www.gnu.org/software/autoconf-archive/ax_check_class.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_CLASS
+#
+# DESCRIPTION
+#
+#   AX_CHECK_CLASS tests the existence of a given Java class, either in a
+#   jar or in a '.class' file.
+#
+#   *Warning*: its success or failure can depend on a proper setting of the
+#   CLASSPATH env. variable.
+#
+#   Note: This is part of the set of autoconf M4 macros for Java programs.
+#   It is VERY IMPORTANT that you download the whole set, some macros depend
+#   on other. Unfortunately, the autoconf archive does not support the
+#   concept of set of macros, so I had to break it for submission. The
+#   general documentation, as well as the sample configure.in, is included
+#   in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 7
+
+AU_ALIAS([AC_CHECK_CLASS], [AX_CHECK_CLASS])
+AC_DEFUN([AX_CHECK_CLASS],[
+dnl Test whether the Java class named by the first argument can be found.
+dnl When uudecode handles base64, an embedded precompiled Test.class is
+dnl decoded and run with the class name as its argument; otherwise the
+dnl check is delegated to AX_TRY_COMPILE_JAVA.  The second argument is
+dnl run when the result is yes, the third argument otherwise.
+AC_REQUIRE([AX_PROG_JAVA])
+ac_var_name=`echo $1 | sed 's/\./_/g'`
+dnl Normaly I'd use a AC_CACHE_CHECK here but since the variable name is
+dnl dynamic I need an extra level of extraction
+AC_MSG_CHECKING([for $1 class])
+dnl NOTE(review): the cache variable named on the next line starts with
+dnl ax_cv_class_ while the body assigns and reads ac_cv_class_ variables;
+dnl confirm which prefix is intended.
+AC_CACHE_VAL(ax_cv_class_$ac_var_name, [
+if test x$ac_cv_prog_uudecode_base64 = xyes; then
+dnl /**
+dnl  * Test.java: used to test dynamicaly if a class exists.
+dnl  */
+dnl public class Test
+dnl {
+dnl
+dnl public static void
+dnl main( String[] argv )
+dnl {
+dnl     Class lib;
+dnl     if (argv.length < 1)
+dnl      {
+dnl             System.err.println ("Missing argument");
+dnl             System.exit (77);
+dnl      }
+dnl     try
+dnl      {
+dnl             lib = Class.forName (argv[0]);
+dnl      }
+dnl     catch (ClassNotFoundException e)
+dnl      {
+dnl             System.exit (1);
+dnl      }
+dnl     lib = null;
+dnl     System.exit (0);
+dnl }
+dnl
+dnl }
+cat << \EOF > Test.uue
+begin-base64 644 Test.class
+yv66vgADAC0AKQcAAgEABFRlc3QHAAQBABBqYXZhL2xhbmcvT2JqZWN0AQAE
+bWFpbgEAFihbTGphdmEvbGFuZy9TdHJpbmc7KVYBAARDb2RlAQAPTGluZU51
+bWJlclRhYmxlDAAKAAsBAANlcnIBABVMamF2YS9pby9QcmludFN0cmVhbTsJ
+AA0ACQcADgEAEGphdmEvbGFuZy9TeXN0ZW0IABABABBNaXNzaW5nIGFyZ3Vt
+ZW50DAASABMBAAdwcmludGxuAQAVKExqYXZhL2xhbmcvU3RyaW5nOylWCgAV
+ABEHABYBABNqYXZhL2lvL1ByaW50U3RyZWFtDAAYABkBAARleGl0AQAEKEkp
+VgoADQAXDAAcAB0BAAdmb3JOYW1lAQAlKExqYXZhL2xhbmcvU3RyaW5nOylM
+amF2YS9sYW5nL0NsYXNzOwoAHwAbBwAgAQAPamF2YS9sYW5nL0NsYXNzBwAi
+AQAgamF2YS9sYW5nL0NsYXNzTm90Rm91bmRFeGNlcHRpb24BAAY8aW5pdD4B
+AAMoKVYMACMAJAoAAwAlAQAKU291cmNlRmlsZQEACVRlc3QuamF2YQAhAAEA
+AwAAAAAAAgAJAAUABgABAAcAAABtAAMAAwAAACkqvgSiABCyAAwSD7YAFBBN
+uAAaKgMyuAAeTKcACE0EuAAaAUwDuAAasQABABMAGgAdACEAAQAIAAAAKgAK
+AAAACgAAAAsABgANAA4ADgATABAAEwASAB4AFgAiABgAJAAZACgAGgABACMA
+JAABAAcAAAAhAAEAAQAAAAUqtwAmsQAAAAEACAAAAAoAAgAAAAQABAAEAAEA
+JwAAAAIAKA==
+====
+EOF
+                if $UUDECODE Test.uue; then
+                        :
+                else
+                        echo "configure: __oline__: uudecode had trouble decoding base 64 file 'Test.uue'" >&AS_MESSAGE_LOG_FD
+                        echo "configure: failed file was:" >&AS_MESSAGE_LOG_FD
+                        cat Test.uue >&AS_MESSAGE_LOG_FD
+                        ac_cv_prog_uudecode_base64=no
+                fi
+        rm -f Test.uue
+        if AC_TRY_COMMAND($JAVA $JAVAFLAGS Test $1) >/dev/null 2>&1; then
+                eval "ac_cv_class_$ac_var_name=yes"
+        else
+                eval "ac_cv_class_$ac_var_name=no"
+        fi
+        rm -f Test.class
+else
+        AX_TRY_COMPILE_JAVA([$1], , [eval "ac_cv_class_$ac_var_name=yes"],
+                [eval "ac_cv_class_$ac_var_name=no"])
+fi
+eval "ac_var_val=$`eval echo ac_cv_class_$ac_var_name`"
+dnl NOTE(review): the next line builds the source variable name from
+dnl ac_var_val, which holds the yes/no result, rather than from
+dnl ac_var_name; confirm this is what was meant.
+eval "HAVE_$ac_var_name=$`echo ac_cv_class_$ac_var_val`"
+HAVE_LAST_CLASS=$ac_var_val
+if test x$ac_var_val = xyes; then
+        ifelse([$2], , :, [$2])
+else
+        ifelse([$3], , :, [$3])
+fi
+])
+dnl for some reason the above statment didn't fall though here?
+dnl do scripts have variable scoping?
+eval "ac_var_val=$`eval echo ac_cv_class_$ac_var_name`"
+AC_MSG_RESULT($ac_var_val)
+])
--- a/lib/ffts/m4/ax_check_java_plugin.m4
+++ b/lib/ffts/m4/ax_check_java_plugin.m4
@ -0,0 +1,101 @@
+# ===========================================================================
+#   http://www.gnu.org/software/autoconf-archive/ax_check_java_plugin.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CHECK_JAVA_PLUGIN(<shell-variable>)
+#
+# DESCRIPTION
+#
+#   This macro sets <shell-variable> to empty on failure and to a compatible
+#   version of plugin.jar otherwise. Directories searched are /usr/java/*
+#   and /usr/local/java/*, which are assumed to be j{dk,re} installations.
+#   Apply the shell variable as you see fit. If sun changes things so
+#   <jre>/lib/plugin.jar is not the magic file it will stop working.
+#
+#   This macro assumes that unzip, zipinfo or pkzipc is available (and can
+#   list the contents of the jar archive). The first two are assumed to work
+#   similarly enough to the infozip versions. The pkzipc version is
+#   assumed to work if I understand the documentation on pkware's site but
+#   YMMV. I do not have access to pkware's version to test it.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Duncan Simpson <dps@simpson.demon.co.uk>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 6
+
+AU_ALIAS([DPS_CHECK_PLUGIN], [AX_CHECK_JAVA_PLUGIN])
+AC_DEFUN([AX_CHECK_JAVA_PLUGIN],
+[AC_REQUIRE([AC_PROG_AWK])
+AC_REQUIRE([AC_PROG_FGREP])
+dnl Find the first available archive lister by absolute path.  The case
+dnl statement below matches on a directory prefix, so a path-returning
+dnl check over a list of candidates is required here.
+AC_PATH_PROGS([ZIPINFO], [zipinfo unzip pkzipc])
+AC_MSG_CHECKING([for the java plugin])
+dnl Map the tool that was found to the command line that lists an archive.
+case "x$ZIPINFO" in
+[*/zipinfo)]
+	zipinf="zipinfo -1" ;;
+[*/unzip)]
+	zipinf="unzip -l";;
+[*/pkzipc)]
+	zipinf="pkzipc -view";;
+[x*)]
+	AC_MSG_RESULT([skiped, none of zipinfo, unzip and pkzipc found])
+	AC_SUBST($1,[])
+	zipinf="";;
+esac
+if test "x$zipinf" != "x"; then
+jplugin=""
+dnl Walk the candidate installations in reverse name order and keep the
+dnl first plugin.jar that contains netscape/javascript/JSObject.
+for jhome in `ls -dr /usr/java/* /usr/local/java/* 2> /dev/null`; do
+for jfile in lib/plugin.jar jre/lib/plugin.jar; do
+if test "x$jplugin" = "x" && test -f "$jhome/$jfile"; then
+eval "$zipinf $jhome/$jfile | $AWK '{ print \$NF; }' | $FGREP netscape/javascript/JSObject" >/dev/null 2>/dev/null
+if test $? -eq 0; then
+dnl Some version of gcj (and javac) refuse to work with some files
+dnl that pass this test. To stop this problem make sure that the compiler
+dnl still works with this jar file in the classpath
+cat << \EOF > Test.java
+/* [#]line __oline__ "configure" */
+public class Test {
+}
+EOF
+if eval "$JAVAC -classpath $jhome/$jfile Test.java 2>/dev/null >/dev/null" && test -f Test.class; then
+jplugin="$jhome/$jfile"
+fi
+rm -f Test.java Test.class
+fi; fi; done; done
+if test "x$jplugin" != "x"; then
+AC_SUBST($1,$jplugin)
+AC_MSG_RESULT($jplugin)
+else
+AC_MSG_RESULT([java plugin not found])
+AC_SUBST($1,[])
+fi
+fi
+])
--- a/lib/ffts/m4/ax_java_check_class.m4
+++ b/lib/ffts/m4/ax_java_check_class.m4
@ -0,0 +1,85 @@
+# ===========================================================================
+#    http://www.gnu.org/software/autoconf-archive/ax_java_check_class.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_JAVA_CHECK_CLASS(<class>,<action-if-found>,<action-if-not-found>)
+#
+# DESCRIPTION
+#
+#   Test if a Java class is available. Based on AX_PROG_JAVAC_WORKS. This
+#   version uses a cache variable which is both compiler, options and
+#   classpath dependent (so if you switch from javac to gcj it correctly
+#   notices and redoes the test).
+#
+#   The macro tries to compile a minimal program importing <class>. Some
+#   newer compilers moan about the failure to use this but fail or produce a
+#   class file anyway. All moaning is sunk to /dev/null since I only wanted
+#   to know if the class could be imported. This is a recommended followup
+#   to AX_CHECK_JAVA_PLUGIN with classpath appropriately adjusted.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Duncan Simpson <dps@simpson.demon.co.uk>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+AU_ALIAS([DPS_JAVA_CHECK_CLASS], [AX_JAVA_CHECK_CLASS])
+AC_DEFUN([AX_JAVA_CHECK_CLASS],[
+dnl Test whether the class named by the first argument can be imported by
+dnl compiling a minimal Test.java with JAVAC.  The result is cached in a
+dnl variable whose name is derived from the class, JAVAC and CLASSPATH.
+dnl The second argument runs when the class is available, the third when
+dnl it is not.
+dnl NOTE(review): SED is used below but is not set in this macro;
+dnl presumably the caller is expected to have run AC_PROG_SED -- confirm.
+m4_define([cache_val],[m4_translit(ax_cv_have_java_class_$1, " ." ,"__")])
+if test "x$CLASSPATH" != "x"; then
+xtra=" with classpath ${CLASSPATH}"
+xopts=`echo ${CLASSPATH} | ${SED} 's/^ *://'`
+xopts="-classpath $xopts"
+else xtra=""; xopts=""; fi
+cache_var="cache_val"AS_TR_SH([_Jc_${JAVAC}_Cp_${CLASSPATH}])
+AC_CACHE_CHECK([if the $1 class is avialable$xtra], [$cache_var], [
+JAVA_TEST=Test.java
+CLASS_TEST=Test.class
+cat << \EOF > $JAVA_TEST
+/* [#]xline __oline__ "configure" */
+import $1;
+public class Test {
+}
+EOF
+if AC_TRY_COMMAND($JAVAC $JAVACFLAGS $xopts $JAVA_TEST) >/dev/null 2>&1; then
+  eval "${cache_var}=yes"
+else
+  eval "${cache_var}=no"
+  echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+  cat $JAVA_TEST >&AS_MESSAGE_LOG_FD
+fi
+rm -f $JAVA_TEST $CLASS_TEST
+])
+dnl Run the caller supplied action; the trailing true/false sets the exit
+dnl status of the whole if statement to reflect the result.
+if eval 'test "x$'${cache_var}'" = "xyes"'; then
+$2
+true; else
+$3
+false; fi])
--- a/lib/ffts/m4/ax_prog_java.m4
+++ b/lib/ffts/m4/ax_prog_java.m4
@ -0,0 +1,115 @@
+# ===========================================================================
+#       http://www.gnu.org/software/autoconf-archive/ax_prog_java.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_PROG_JAVA
+#
+# DESCRIPTION
+#
+#   Here is a summary of the main macros:
+#
+#   AX_PROG_JAVAC: finds a Java compiler.
+#
+#   AX_PROG_JAVA: finds a Java virtual machine.
+#
+#   AX_CHECK_CLASS: finds if we have the given class (beware of CLASSPATH!).
+#
+#   AX_CHECK_RQRD_CLASS: finds if we have the given class and stops
+#   otherwise.
+#
+#   AX_TRY_COMPILE_JAVA: attempt to compile user given source.
+#
+#   AX_TRY_RUN_JAVA: attempt to compile and run user given source.
+#
+#   AX_JAVA_OPTIONS: adds Java configure options.
+#
+#   AX_PROG_JAVA tests an existing Java virtual machine. It uses the
+#   environment variable JAVA then tests in sequence various common Java
+#   virtual machines. For political reasons, it starts with the free ones.
+#   You *must* call [AX_PROG_JAVAC] before.
+#
+#   If you want to force a specific VM:
+#
+#   - at the configure.in level, set JAVA=yourvm before calling AX_PROG_JAVA
+#
+#     (but after AC_INIT)
+#
+#   - at the configure level, setenv JAVA
+#
+#   You can use the JAVA variable in your Makefile.in, with @JAVA@.
+#
+#   *Warning*: its success or failure can depend on a proper setting of the
+#   CLASSPATH env. variable.
+#
+#   TODO: allow to exclude virtual machines (rationale: most Java programs
+#   cannot run with some VM like kaffe).
+#
+#   Note: This is part of the set of autoconf M4 macros for Java programs.
+#   It is VERY IMPORTANT that you download the whole set, some macros depend
+#   on other. Unfortunately, the autoconf archive does not support the
+#   concept of set of macros, so I had to break it for submission.
+#
+#   A Web page, with a link to the latest CVS snapshot is at
+#   <http://www.internatif.org/bortzmeyer/autoconf-Java/>.
+#
+#   This is a sample configure.in Process this file with autoconf to produce
+#   a configure script.
+#
+#     AC_INIT(UnTag.java)
+#
+#     dnl Checks for programs.
+#     AC_CHECK_CLASSPATH
+#     AX_PROG_JAVAC
+#     AX_PROG_JAVA
+#
+#     dnl Checks for classes
+#     AX_CHECK_RQRD_CLASS(org.xml.sax.Parser)
+#     AX_CHECK_RQRD_CLASS(com.jclark.xml.sax.Driver)
+#
+#     AC_OUTPUT(Makefile)
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+AU_ALIAS([AC_PROG_JAVA], [AX_PROG_JAVA])
+AC_DEFUN([AX_PROG_JAVA],[
+dnl Locate a Java virtual machine unless JAVA is already set, then verify
+dnl that it works.  When JAVAPREFIX is set the search is restricted to
+dnl that installation; it is passed as the search path, which is the
+dnl fourth argument of AC_CHECK_PROGS.  The third argument is the value
+dnl used when nothing is found and must stay empty so the error below
+dnl can fire.  Both the prefix itself and its bin subdirectory are
+dnl searched.
+if test "x$JAVAPREFIX" = x; then
+        test "x$JAVA" = x && AC_CHECK_PROGS([JAVA], [kaffe java])
+else
+        test "x$JAVA" = x && AC_CHECK_PROGS([JAVA], [kaffe java], [], [$JAVAPREFIX/bin$PATH_SEPARATOR$JAVAPREFIX])
+fi
+test "x$JAVA" = x && AC_MSG_ERROR([no acceptable Java virtual machine found in \$PATH])
+AX_PROG_JAVA_WORKS
+AC_PROVIDE([$0])dnl
+])
--- a/lib/ffts/m4/ax_prog_java_cc.m4
+++ b/lib/ffts/m4/ax_prog_java_cc.m4
@ -0,0 +1,104 @@
+# ===========================================================================
+#      http://www.gnu.org/software/autoconf-archive/ax_prog_java_cc.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_PROG_JAVA_CC
+#
+# DESCRIPTION
+#
+#   Finds the appropriate java compiler on your path. By preference the java
+#   compiler is gcj, then jikes then javac.
+#
+#   The macro can take one argument specifying a space separated list of
+#   java compiler names.
+#
+#   For example:
+#
+#     AX_PROG_JAVA_CC(javac, gcj)
+#
+#   The macro also sets the compiler options variable: JAVA_CC_OPTS to
+#   something sensible:
+#
+#    - for GCJ it sets it to: @GCJ_OPTS@
+#      (if GCJ_OPTS is not yet defined then it is set to "-C")
+#
+#    - no other compiler has applicable options yet
+#
+#   Here's an example configure.in:
+#
+#     AC_INIT(Makefile.in)
+#     AX_PROG_JAVA_CC()
+#     AC_OUTPUT(Makefile)
+#     dnl End.
+#
+#   And here's the start of the Makefile.in:
+#
+#     PROJECT_ROOT      := @srcdir@
+#     # Tool definitions.
+#     JAVAC             := @JAVA_CC@
+#     JAVAC_OPTS        := @JAVA_CC_OPTS@
+#     JAR_TOOL          := @jar_tool@
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Nic Ferrier <nferrier@tapsellferrier.co.uk>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 4
+
+# AX_PROG_JAVA_CC([COMPILER ...])
+# --------------------------
+# COMPILER ... is a space separated list of java compilers to search for.
+# This just gives the user an opportunity to specify an alternative
+# search list for the java compiler.
+dnl Find a Java compiler and store it in JAVA_CC.  With an argument, the
+dnl given list of tools is searched; without one, gcj, javac and jikes
+dnl are tried in that order.  Configure aborts when none is found.
+dnl NOTE(review): the file header states the preference as gcj, jikes,
+dnl javac, which differs from the order coded below -- confirm which is
+dnl intended.
+AU_ALIAS([AC_PROG_JAVA_CC], [AX_PROG_JAVA_CC])
+AC_DEFUN([AX_PROG_JAVA_CC],
+[AC_ARG_VAR([JAVA_CC],                [java compiler command])dnl
+AC_ARG_VAR([JAVA_CC_FLAGS],           [java compiler flags])dnl
+m4_ifval([$1],
+      [AC_CHECK_TOOLS(JAVA_CC, [$1])],
+[AC_CHECK_TOOL(JAVA_CC, gcj)
+if test -z "$JAVA_CC"; then
+  AC_CHECK_TOOL(JAVA_CC, javac)
+fi
+if test -z "$JAVA_CC"; then
+  AC_CHECK_TOOL(JAVA_CC, jikes)
+fi
+])
+
+dnl Only gcj gets default options: GCJ_OPTS falls back to -C when empty.
+dnl NOTE(review): JAVA_CC_FLAGS is declared above but the variable
+dnl substituted here is JAVA_CC_OPTS; also the comparison below only
+dnl matches the bare name gcj, so a prefixed or absolute compiler name
+dnl would presumably skip this branch -- confirm.
+if test "$JAVA_CC" = "gcj"; then
+   if test "$GCJ_OPTS" = ""; then
+      AC_SUBST(GCJ_OPTS,-C)
+   fi
+   AC_SUBST(JAVA_CC_OPTS, @GCJ_OPTS@,
+        [Define the compilation options for GCJ])
+fi
+test -z "$JAVA_CC" && AC_MSG_ERROR([no acceptable java compiler found in \$PATH])
+])# AX_PROG_JAVA_CC
--- a/lib/ffts/m4/ax_prog_java_works.m4
+++ b/lib/ffts/m4/ax_prog_java_works.m4
@ -0,0 +1,134 @@
+# ===========================================================================
+#    http://www.gnu.org/software/autoconf-archive/ax_prog_java_works.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_PROG_JAVA_WORKS
+#
+# DESCRIPTION
+#
+#   Internal use ONLY.
+#
+#   Note: This is part of the set of autoconf M4 macros for Java programs.
+#   It is VERY IMPORTANT that you download the whole set, some macros depend
+#   on other. Unfortunately, the autoconf archive does not support the
+#   concept of set of macros, so I had to break it for submission. The
+#   general documentation, as well as the sample configure.in, is included
+#   in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Stephane Bortzmeyer <bortzmeyer@pasteur.fr>
+#
+#   This program is free software; you can redistribute it and/or modify it
+#   under the terms of the GNU General Public License as published by the
+#   Free Software Foundation; either version 2 of the License, or (at your
+#   option) any later version.
+#
+#   This program is distributed in the hope that it will be useful, but
+#   WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+#   Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License along
+#   with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+#   As a special exception, the respective Autoconf Macro's copyright owner
+#   gives unlimited permission to copy, distribute and modify the configure
+#   scripts that are the output of Autoconf when processing the Macro. You
+#   need not follow the terms of the GNU General Public License when using
+#   or distributing such scripts, even though portions of the text of the
+#   Macro appear in them. The GNU General Public License (GPL) does govern
+#   all other use of the material that constitutes the Autoconf Macro.
+#
+#   This special exception to the GPL applies to versions of the Autoconf
+#   Macro released by the Autoconf Archive. When you make and distribute a
+#   modified version of the Autoconf Macro, you may extend this special
+#   exception to the GPL to apply to your modified version as well.
+
+#serial 8
+
+AU_ALIAS([AC_PROG_JAVA_WORKS], [AX_PROG_JAVA_WORKS])
+AC_DEFUN([AX_PROG_JAVA_WORKS], [
+dnl Verify that the virtual machine in JAVA can run a trivial class.
+dnl Step 1: if uudecode exists, try to decode an embedded precompiled
+dnl Test.class and cache whether that worked.
+dnl Step 2: if no decoded class is available, make sure a Java compiler
+dnl has been checked so Test.java can be compiled instead.
+dnl Step 3: run the Test class with JAVA; configure aborts on failure.
+AC_PATH_PROG(UUDECODE, uudecode, [no])
+if test x$UUDECODE != xno; then
+AC_CACHE_CHECK([if uudecode can decode base 64 file], ac_cv_prog_uudecode_base64, [
+dnl /**
+dnl  * Test.java: used to test if java compiler works.
+dnl  */
+dnl public class Test
+dnl {
+dnl
+dnl public static void
+dnl main( String[] argv )
+dnl {
+dnl     System.exit (0);
+dnl }
+dnl
+dnl }
+cat << \EOF > Test.uue
+begin-base64 644 Test.class
+yv66vgADAC0AFQcAAgEABFRlc3QHAAQBABBqYXZhL2xhbmcvT2JqZWN0AQAE
+bWFpbgEAFihbTGphdmEvbGFuZy9TdHJpbmc7KVYBAARDb2RlAQAPTGluZU51
+bWJlclRhYmxlDAAKAAsBAARleGl0AQAEKEkpVgoADQAJBwAOAQAQamF2YS9s
+YW5nL1N5c3RlbQEABjxpbml0PgEAAygpVgwADwAQCgADABEBAApTb3VyY2VG
+aWxlAQAJVGVzdC5qYXZhACEAAQADAAAAAAACAAkABQAGAAEABwAAACEAAQAB
+AAAABQO4AAyxAAAAAQAIAAAACgACAAAACgAEAAsAAQAPABAAAQAHAAAAIQAB
+AAEAAAAFKrcAErEAAAABAAgAAAAKAAIAAAAEAAQABAABABMAAAACABQ=
+====
+EOF
+if $UUDECODE Test.uue; then
+        ac_cv_prog_uudecode_base64=yes
+else
+        echo "configure: __oline__: uudecode had trouble decoding base 64 file 'Test.uue'" >&AS_MESSAGE_LOG_FD
+        echo "configure: failed file was:" >&AS_MESSAGE_LOG_FD
+        cat Test.uue >&AS_MESSAGE_LOG_FD
+        ac_cv_prog_uudecode_base64=no
+fi
+rm -f Test.uue])
+fi
+dnl Without a usable decoded class, discard any partial Test.class and
+dnl fall back to compiling the test program from source.
+if test x$ac_cv_prog_uudecode_base64 != xyes; then
+        rm -f Test.class
+        AC_MSG_WARN([I have to compile Test.class from scratch])
+        if test x$ac_cv_prog_javac_works = xno; then
+                AC_MSG_ERROR([Cannot compile java source. $JAVAC does not work properly])
+        fi
+        if test x$ac_cv_prog_javac_works = x; then
+                AX_PROG_JAVAC
+        fi
+fi
+AC_CACHE_CHECK(if $JAVA works, ac_cv_prog_java_works, [
+JAVA_TEST=Test.java
+CLASS_TEST=Test.class
+TEST=Test
+changequote(, )dnl
+cat << \EOF > $JAVA_TEST
+/* [#]line __oline__ "configure" */
+public class Test {
+public static void main (String args[]) {
+        System.exit (0);
+} }
+EOF
+changequote([, ])dnl
+if test x$ac_cv_prog_uudecode_base64 != xyes; then
+        if AC_TRY_COMMAND($JAVAC $JAVACFLAGS $JAVA_TEST) && test -s $CLASS_TEST; then
+                :
+        else
+          echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+          cat $JAVA_TEST >&AS_MESSAGE_LOG_FD
+          AC_MSG_ERROR(The Java compiler $JAVAC failed (see config.log, check the CLASSPATH?))
+        fi
+fi
+if AC_TRY_COMMAND($JAVA $JAVAFLAGS $TEST) >/dev/null 2>&1; then
+  ac_cv_prog_java_works=yes
+else
+  echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+  cat $JAVA_TEST >&AS_MESSAGE_LOG_FD
+  AC_MSG_ERROR(The Java VM $JAVA failed (see config.log, check the CLASSPATH?))
+fi
+rm -fr $JAVA_TEST $CLASS_TEST Test.uue
+])
+AC_PROVIDE([$0])dnl
+]
+)
--- a/lib/ffts/m4/ax_prog_javadoc.m4
+++ b/lib/ffts/m4/ax_prog_javadoc.m4
@ -0,0 +1,52 @@
+# ===========================================================================
+#      http://www.gnu.org/software/autoconf-archive/ax_prog_javadoc.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_PROG_JAVADOC
+#
+# DESCRIPTION
+#
+#   AX_PROG_JAVADOC tests for an existing javadoc generator. It uses the
+#   environment variable JAVADOC if set, and otherwise tests in sequence
+#   various common javadoc generators.
+#
+#   If you want to force a specific generator:
+#
+#   - at the configure.in level, set JAVADOC=yourgenerator before calling
+#   AX_PROG_JAVADOC
+#
+#   - at the configure level, setenv JAVADOC
+#
+#   You can use the JAVADOC variable in your Makefile.in, with @JAVADOC@.
+#
+#   Note: This macro depends on the autoconf M4 macros for Java programs. It
+#   is VERY IMPORTANT that you download that whole set, some macros depend
+#   on other. Unfortunately, the autoconf archive does not support the
+#   concept of set of macros, so I had to break it for submission.
+#
+#   The general documentation of those macros, as well as the sample
+#   configure.in, is included in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Egon Willighagen <e.willighagen@science.ru.nl>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 7
+
+AU_ALIAS([AC_PROG_JAVADOC], [AX_PROG_JAVADOC])
+# AX_PROG_JAVADOC
+#
+# Leaves JAVADOC alone when the caller already set it; otherwise looks for a
+# program named "javadoc". Without JAVAPREFIX the default search path is
+# used; with JAVAPREFIX set, the search is restricted to $JAVAPREFIX/bin.
+# The prefix is passed as the fourth argument of AC_CHECK_PROGS (the search
+# path). Passing it as the third argument would make it the value assigned
+# to JAVADOC when nothing is found, which hides the failure below.
+# Aborts configure when no generator is found.
+AC_DEFUN([AX_PROG_JAVADOC],[
+if test "x$JAVAPREFIX" = x; then
+        test "x$JAVADOC" = x && AC_CHECK_PROGS(JAVADOC, javadoc)
+else
+        test "x$JAVADOC" = x && AC_CHECK_PROGS(JAVADOC, javadoc, , $JAVAPREFIX/bin)
+fi
+test "x$JAVADOC" = x && AC_MSG_ERROR([no acceptable javadoc generator found in \$PATH])
+AC_PROVIDE([$0])dnl
+])
--- a/lib/ffts/m4/ax_prog_javah.m4
+++ b/lib/ffts/m4/ax_prog_javah.m4
@ -0,0 +1,43 @@
+# ===========================================================================
+#       http://www.gnu.org/software/autoconf-archive/ax_prog_javah.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_PROG_JAVAH
+#
+# DESCRIPTION
+#
+#   AX_PROG_JAVAH tests the availability of the javah header generator and
+#   looks for the jni.h header file. If available, JAVAH is set to the full
+#   path of javah and CPPFLAGS is updated accordingly.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Luc Maisonobe <luc@spaceroots.org>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 5
+
+AU_ALIAS([AC_PROG_JAVAH], [AX_PROG_JAVAH])
+# AX_PROG_JAVAH
+#
+# Locates javah with AC_PATH_PROG. When it is found and <jni.h> cannot be
+# preprocessed with the current CPPFLAGS, two include directories are
+# derived and appended to CPPFLAGS:
+#   - the javah path with its last two components replaced by "/include";
+#   - a sub-directory of that named after $build_os, truncated at the first
+#     digit or dash, with "cygwin" mapped to "win32".
+# If <jni.h> is then found, the extended CPPFLAGS are kept (the saved value
+# is overwritten before the final restore); otherwise a warning is issued
+# and the original CPPFLAGS are restored.
+AC_DEFUN([AX_PROG_JAVAH],[
+AC_REQUIRE([AC_CANONICAL_SYSTEM])dnl
+AC_REQUIRE([AC_PROG_CPP])dnl
+AC_PATH_PROG(JAVAH,javah)
+if test x"`eval 'echo $ac_cv_path_JAVAH'`" != x ; then
+  AC_TRY_CPP([#include <jni.h>],,[
+    ac_save_CPPFLAGS="$CPPFLAGS"
+changequote(, )dnl
+    ac_dir=`echo $ac_cv_path_JAVAH | sed 's,\(.*\)/[^/]*/[^/]*$,\1/include,'`
+    ac_machdep=`echo $build_os | sed 's,[-0-9].*,,' | sed 's,cygwin,win32,'`
+changequote([, ])dnl
+    CPPFLAGS="$ac_save_CPPFLAGS -I$ac_dir -I$ac_dir/$ac_machdep"
+    AC_TRY_CPP([#include <jni.h>],
+               ac_save_CPPFLAGS="$CPPFLAGS",
+               AC_MSG_WARN([unable to include <jni.h>]))
+    CPPFLAGS="$ac_save_CPPFLAGS"])
+fi])
--- a/lib/ffts/m4/ax_try_compile_java.m4
+++ b/lib/ffts/m4/ax_try_compile_java.m4
@ -0,0 +1,55 @@
+# ===========================================================================
+#    http://www.gnu.org/software/autoconf-archive/ax_try_compile_java.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_TRY_COMPILE_JAVA
+#
+# DESCRIPTION
+#
+#   AX_TRY_COMPILE_JAVA attempts to compile user-given source.
+#
+#   *Warning*: its success or failure can depend on a proper setting of the
+#   CLASSPATH env. variable.
+#
+#   Note: This is part of the set of autoconf M4 macros for Java programs.
+#   It is VERY IMPORTANT that you download the whole set, some macros depend
+#   on other. Unfortunately, the autoconf archive does not support the
+#   concept of set of macros, so I had to break it for submission. The
+#   general documentation, as well as the sample configure.in, is included
+#   in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Devin Weaver <ktohg@tritarget.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 7
+
+AU_ALIAS([AC_TRY_COMPILE_JAVA], [AX_TRY_COMPILE_JAVA])
+# AX_TRY_COMPILE_JAVA([IMPORT], [CLASS-BODY], [ACTION-IF-OK], [ACTION-IF-FAILED])
+#
+# Writes Test.java holding an optional "import IMPORT;" line followed by a
+# public class Test whose body is CLASS-BODY, then compiles it with
+# $JAVAC $JAVACFLAGS. The check succeeds when the compiler exits cleanly and
+# a non-empty Test.class exists; ACTION-IF-OK is then run (or ":" if empty).
+# On failure the source is copied to the configure log and, when given,
+# ACTION-IF-FAILED is run after removing Test*. Test* is removed at the end
+# in every case.
+AC_DEFUN([AX_TRY_COMPILE_JAVA],[
+AC_REQUIRE([AX_PROG_JAVAC])dnl
+cat << \EOF > Test.java
+/* [#]line __oline__ "configure" */
+ifelse([$1], , , [import $1;])
+public class Test {
+[$2]
+}
+EOF
+if AC_TRY_COMMAND($JAVAC $JAVACFLAGS Test.java) && test -s Test.class
+then
+dnl Don't remove the temporary files here, so they can be examined.
+  ifelse([$3], , :, [$3])
+else
+  echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+  cat Test.java >&AS_MESSAGE_LOG_FD
+ifelse([$4], , , [  rm -fr Test*
+  $4
+])dnl
+fi
+rm -fr Test*])
--- a/lib/ffts/m4/ax_try_run_java.m4
+++ b/lib/ffts/m4/ax_try_run_java.m4
@ -0,0 +1,56 @@
+# ===========================================================================
+#      http://www.gnu.org/software/autoconf-archive/ax_try_run_java.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_TRY_RUN_JAVA
+#
+# DESCRIPTION
+#
+#   AX_TRY_RUN_JAVA attempts to compile and run user-given source.
+#
+#   *Warning*: its success or failure can depend on a proper setting of the
+#   CLASSPATH env. variable.
+#
+#   Note: This is part of the set of autoconf M4 macros for Java programs.
+#   It is VERY IMPORTANT that you download the whole set, some macros depend
+#   on other. Unfortunately, the autoconf archive does not support the
+#   concept of set of macros, so I had to break it for submission. The
+#   general documentation, as well as the sample configure.in, is included
+#   in the AX_PROG_JAVA macro.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Devin Weaver <ktohg@tritarget.com>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved. This file is offered as-is, without any
+#   warranty.
+
+#serial 1
+
+AU_ALIAS([AC_TRY_RUN_JAVA], [AX_TRY_RUN_JAVA])
+# AX_TRY_RUN_JAVA([IMPORT], [CLASS-BODY], [ACTION-IF-OK], [ACTION-IF-FAILED])
+#
+# Writes Test.java holding an optional "import IMPORT;" line followed by a
+# public class Test whose body is CLASS-BODY, compiles it with
+# $JAVAC $JAVACFLAGS and runs the result with $JAVA $JAVAFLAGS. The check
+# succeeds when compilation works, Test.class is non-empty and the program
+# exits with status 0; ACTION-IF-OK is then run (or ":" if empty).
+# On failure the source is copied to the configure log and, when given,
+# ACTION-IF-FAILED is run after removing Test*. Test* is removed at the end
+# in every case.
+#
+# The generated line uses the Java keyword "import"; "include" is not valid
+# Java and made every check with a non-empty IMPORT fail to compile.
+AC_DEFUN([AX_TRY_RUN_JAVA],[
+AC_REQUIRE([AX_PROG_JAVAC])dnl
+AC_REQUIRE([AX_PROG_JAVA])dnl
+cat << \EOF > Test.java
+/* [#]line __oline__ "configure" */
+ifelse([$1], , , [import $1;])
+public class Test {
+[$2]
+}
+EOF
+if AC_TRY_COMMAND($JAVAC $JAVACFLAGS Test.java) && test -s Test.class && ($JAVA $JAVAFLAGS Test; exit) 2>/dev/null
+then
+dnl Don't remove the temporary files here, so they can be examined.
+  ifelse([$3], , :, [$3])
+else
+  echo "configure: failed program was:" >&AS_MESSAGE_LOG_FD
+  cat Test.java >&AS_MESSAGE_LOG_FD
+ifelse([$4], , , [  rm -fr Test*
+  $4
+])dnl
+fi
+rm -fr Test*])
--- a/lib/ffts/src/Makefile.am
+++ b/lib/ffts/src/Makefile.am
@ -20,10 +20,10 @@ libffts_la_SOURCES += vfp.s
 else
 if HAVE_NEON

+libffts_la_SOURCES += neon.s
+
 if DYNAMIC_DISABLED
 libffts_la_SOURCES += neon_static_f.s neon_static_i.s
-else
-libffts_la_SOURCES += neon.s
 endif

 else 
--- a/lib/ffts/src/Makefile.in
+++ b/lib/ffts/src/Makefile.in
@ -1,7 +1,7 @@
-# Makefile.in generated by automake 1.12.4 from Makefile.am.
+# Makefile.in generated by automake 1.14 from Makefile.am.
 # @configure_input@

-# Copyright (C) 1994-2012 Free Software Foundation, Inc.
+# Copyright (C) 1994-2013 Free Software Foundation, Inc.

 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -16,23 +16,51 @@


 VPATH = @srcdir@
-am__make_dryrun = \
-  { \
-    am__dry=no; \
+am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
    case $$MAKEFLAGS in \
      *\\[\ \	]*) \
-        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
-          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
-      *) \
-        for am__flg in $$MAKEFLAGS; do \
-          case $$am__flg in \
-            *=*|--*) ;; \
-            *n*) am__dry=yes; break;; \
-          esac; \
-        done;; \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
    esac; \
-    test $$am__dry = yes; \
-  }
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
+    esac; \
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
 pkgdatadir = $(datadir)/@PACKAGE@
 pkgincludedir = $(includedir)/@PACKAGE@
 pkglibdir = $(libdir)/@PACKAGE@
@ -54,12 +82,12 @@ host_triplet = @host@
@DYNAMIC_DISABLED_TRUE@am__append_1 = ffts_static.c
@DYNAMIC_DISABLED_FALSE@am__append_2 = codegen.c
@HAVE_VFP_TRUE@am__append_3 = vfp.s 
-@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_4 = neon_static_f.s neon_static_i.s
-@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_5 = neon.s
+@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_4 = neon.s
+@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__append_5 = neon_static_f.s neon_static_i.s
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__append_6 = sse.s
 subdir = src
-DIST_COMMON = $(libffts_include_HEADERS) $(srcdir)/Makefile.am \
-	$(srcdir)/Makefile.in $(top_srcdir)/depcomp
+DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
+	$(top_srcdir)/depcomp $(libffts_include_HEADERS)
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_classpath.m4 \
 	$(top_srcdir)/m4/ax_check_java_home.m4 \
@ -111,14 +139,14 @@ am__libffts_la_SOURCES_DIST = ffts.c ffts_small.c ffts_nd.c \
 	codegen_sse.h ffts.h ffts_nd.h ffts_real.h ffts_real_nd.h \
 	ffts_small.h ffts_static.h macros-alpha.h macros-altivec.h \
 	macros-neon.h macros-sse.h macros.h neon.h neon_float.h \
-	patterns.h types.h vfp.h ffts_static.c codegen.c vfp.s \
-	neon_static_f.s neon_static_i.s neon.s sse.s
+	patterns.h types.h vfp.h ffts_static.c codegen.c vfp.s neon.s \
+	neon_static_f.s neon_static_i.s sse.s
@DYNAMIC_DISABLED_TRUE@am__objects_1 = ffts_static.lo
@DYNAMIC_DISABLED_FALSE@am__objects_2 = codegen.lo
@HAVE_VFP_TRUE@am__objects_3 = vfp.lo
-@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_4 = neon_static_f.lo \
+@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_4 = neon.lo
+@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_5 = neon_static_f.lo \
@DYNAMIC_DISABLED_TRUE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@	neon_static_i.lo
-@DYNAMIC_DISABLED_FALSE@@HAVE_NEON_TRUE@@HAVE_VFP_FALSE@am__objects_5 = neon.lo
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@am__objects_6 =  \
@HAVE_NEON_FALSE@@HAVE_SSE_TRUE@@HAVE_VFP_FALSE@	sse.lo
 am_libffts_la_OBJECTS = ffts.lo ffts_small.lo ffts_nd.lo ffts_real.lo \
@ -126,22 +154,52 @@ am_libffts_la_OBJECTS = ffts.lo ffts_small.lo ffts_nd.lo ffts_real.lo \
 	$(am__objects_3) $(am__objects_4) $(am__objects_5) \
 	$(am__objects_6)
 libffts_la_OBJECTS = $(am_libffts_la_OBJECTS)
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
 DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
 depcomp = $(SHELL) $(top_srcdir)/depcomp
 am__depfiles_maybe = depfiles
 am__mv = mv -f
 COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
 	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
-	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
-	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 = 
 CCLD = $(CC)
-LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
-	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
-	$(LDFLAGS) -o $@
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 = 
 CCASCOMPILE = $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
-LTCCASCOMPILE = $(LIBTOOL) $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
-	--mode=compile $(CCAS) $(AM_CCASFLAGS) $(CCASFLAGS)
+LTCCASCOMPILE = $(LIBTOOL) $(AM_V_lt) $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CCAS) $(AM_CCASFLAGS) \
+	$(CCASFLAGS)
+AM_V_CCAS = $(am__v_CCAS_@AM_V@)
+am__v_CCAS_ = $(am__v_CCAS_@AM_DEFAULT_V@)
+am__v_CCAS_0 = @echo "  CCAS    " $@;
+am__v_CCAS_1 = 
 SOURCES = $(libffts_la_SOURCES)
 DIST_SOURCES = $(am__libffts_la_SOURCES_DIST)
 am__can_run_installinfo = \
@ -150,11 +208,29 @@ am__can_run_installinfo = \
    *) (install-info --version) >/dev/null 2>&1;; \
  esac
 HEADERS = $(libffts_include_HEADERS)
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
 ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
@ -328,6 +404,7 @@ $(top_srcdir)/configure:  $(am__configure_deps)
 $(ACLOCAL_M4):  $(am__aclocal_m4_deps)
 	cd $(top_builddir) && $(MAKE) $(AM_MAKEFLAGS) am--refresh
 $(am__aclocal_m4_deps):
+
 install-libLTLIBRARIES: $(lib_LTLIBRARIES)
 	@$(NORMAL_INSTALL)
 	@list='$(lib_LTLIBRARIES)'; test -n "$(libdir)" || list=; \
@ -362,8 +439,9 @@ clean-libLTLIBRARIES:
 	  echo rm -f $${locs}; \
 	  rm -f $${locs}; \
 	}
+
 libffts.la: $(libffts_la_OBJECTS) $(libffts_la_DEPENDENCIES) $(EXTRA_libffts_la_DEPENDENCIES) 
-	$(LINK) -rpath $(libdir) $(libffts_la_OBJECTS) $(libffts_la_LIBADD) $(LIBS)
+	$(AM_V_CCLD)$(LINK) -rpath $(libdir) $(libffts_la_OBJECTS) $(libffts_la_LIBADD) $(LIBS)

 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
@ -381,34 +459,34 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/patterns.Plo@am__quote@

 .c.o:
-@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<

 .c.obj:
-@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`

 .c.lo:
-@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<

 .s.o:
-	$(CCASCOMPILE) -c -o $@ $<
+	$(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ $<

 .s.obj:
-	$(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`
+	$(AM_V_CCAS)$(CCASCOMPILE) -c -o $@ `$(CYGPATH_W) '$<'`

 .s.lo:
-	$(LTCCASCOMPILE) -c -o $@ $<
+	$(AM_V_CCAS)$(LTCCASCOMPILE) -c -o $@ $<

 mostlyclean-libtool:
 	-rm -f *.lo
@ -437,26 +515,15 @@ uninstall-libffts_includeHEADERS:
 	files=`for p in $$list; do echo $$p; done | sed -e 's|^.*/||'`; \
 	dir='$(DESTDIR)$(libffts_includedir)'; $(am__uninstall_files_from_dir)

-ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
-	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
-	mkid -fID $$unique
-tags: TAGS
-
-TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
-		$(TAGS_FILES) $(LISP)
+ID: $(am__tagged_files)
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
 	set x; \
 	here=`pwd`; \
-	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	$(am__define_uniq_tagged_files); \
 	shift; \
 	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
 	  test -n "$$unique" || unique=$$empty_fix; \
@ -468,15 +535,11 @@ TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
 	      $$unique; \
 	  fi; \
 	fi
-ctags: CTAGS
-CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
-		$(TAGS_FILES) $(LISP)
-	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	$(am__define_uniq_tagged_files); \
 	test -z "$(CTAGS_ARGS)$$unique" \
 	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
 	     $$unique
@ -485,9 +548,10 @@ GTAGS:
 	here=`$(am__cd) $(top_builddir) && pwd` \
 	  && $(am__cd) $(top_srcdir) \
 	  && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am

-cscopelist:  $(HEADERS) $(SOURCES) $(LISP)
-	list='$(SOURCES) $(HEADERS) $(LISP)'; \
+cscopelist-am: $(am__tagged_files)
+	list='$(am__tagged_files)'; \
 	case "$(srcdir)" in \
 	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
 	  *) sdir=$(subdir)/$(srcdir) ;; \
@ -644,20 +708,20 @@ uninstall-am: uninstall-libLTLIBRARIES \

 .MAKE: install-am install-strip

-.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
-	clean-libLTLIBRARIES clean-libtool cscopelist ctags distclean \
-	distclean-compile distclean-generic distclean-libtool \
-	distclean-tags distdir dvi dvi-am html html-am info info-am \
-	install install-am install-data install-data-am install-dvi \
-	install-dvi-am install-exec install-exec-am install-html \
-	install-html-am install-info install-info-am \
-	install-libLTLIBRARIES install-libffts_includeHEADERS \
-	install-man install-pdf install-pdf-am install-ps \
-	install-ps-am install-strip installcheck installcheck-am \
-	installdirs maintainer-clean maintainer-clean-generic \
-	mostlyclean mostlyclean-compile mostlyclean-generic \
-	mostlyclean-libtool pdf pdf-am ps ps-am tags uninstall \
-	uninstall-am uninstall-libLTLIBRARIES \
+.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
+	clean-libLTLIBRARIES clean-libtool cscopelist-am ctags \
+	ctags-am distclean distclean-compile distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am install-info \
+	install-info-am install-libLTLIBRARIES \
+	install-libffts_includeHEADERS install-man install-pdf \
+	install-pdf-am install-ps install-ps-am install-strip \
+	installcheck installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags tags-am uninstall uninstall-am uninstall-libLTLIBRARIES \
 	uninstall-libffts_includeHEADERS


--- a/lib/ffts/src/arch/.gitignore
+++ b/lib/ffts/src/arch/.gitignore
@ -0,0 +1,6 @@
+/Makefile
+/Makefile.in
+/.deps
+/.libs
+/*.la
+/*.lo
--- a/lib/ffts/src/arch/ChangeLog
+++ b/lib/ffts/src/arch/ChangeLog
--- a/lib/ffts/src/arch/LICENSE
+++ b/lib/ffts/src/arch/LICENSE
@ -0,0 +1,21 @@
+Copyright (c) 2001, 2002, 2003 Ximian, Inc and the individuals listed
+on the ChangeLog entries.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/lib/ffts/src/arch/Makefile.am
+++ b/lib/ffts/src/arch/Makefile.am
@ -0,0 +1,11 @@
+# Architecture directories listed for distribution.
+DIST_SUBDIRS = x86 ppc sparc arm arm64 s390x amd64 ia64 mips
+
+AM_CPPFLAGS = $(GLIB_CFLAGS) -I$(top_srcdir)
+
+# SUBDIRS is only set (to $(arch_target)) when the ARM conditional is true.
+if ARM
+# arm needs to build some stuff even in JIT mode
+SUBDIRS = $(arch_target)
+endif
+
+EXTRA_DIST = ChangeLog
+
--- a/lib/ffts/src/arch/README
+++ b/lib/ffts/src/arch/README
@ -0,0 +1,7 @@
+mono_arch
+=========
+
+Part of Mono project, https://github.com/mono
+
+These are C macros that are useful when generating native code on various platforms.
+This code is MIT X11 licensed.
--- a/lib/ffts/src/arch/arm/.gitattributes
+++ b/lib/ffts/src/arch/arm/.gitattributes
@ -0,0 +1 @@
+/arm-wmmx.h -crlf
--- a/lib/ffts/src/arch/arm/.gitignore
+++ b/lib/ffts/src/arch/arm/.gitignore
@ -0,0 +1,15 @@
+/Makefile
+/Makefile.in
+/.deps
+/.libs
+/*.o
+/*.la
+/*.lo
+/*.lib
+/*.obj
+/*.exe
+/*.dll
+/arm_dpimacros.h
+/arm_fpamacros.h
+/arm_vfpmacros.h
+/fixeol.sh
--- a/lib/ffts/src/arch/arm/Makefile.am
+++ b/lib/ffts/src/arch/arm/Makefile.am
@ -0,0 +1,27 @@
+
+AM_CPPFLAGS = $(GLIB_CFLAGS) -I$(top_srcdir)
+
+noinst_LTLIBRARIES = libmonoarch-arm.la
+
+BUILT_SOURCES = arm_dpimacros.h arm_vfpmacros.h
+
+
+libmonoarch_arm_la_SOURCES = $(BUILT_SOURCES) \
+	arm-codegen.c \
+	arm-codegen.h \
+	arm-dis.c \
+	arm-dis.h
+
+# arm_dpimacros.h is the standard output of dpiops.sh, run from the source
+# directory. The output goes to a temporary file that is renamed afterwards,
+# so a failed run does not leave a partial header under the final name.
+# "&&" stops the script from running in the wrong directory if cd fails.
+arm_dpimacros.h: dpiops.sh mov_macros.th dpi_macros.th cmp_macros.th
+	(cd $(srcdir) && bash ./dpiops.sh) > $@t
+	mv $@t $@
+
+# arm_vfpmacros.h is produced the same way from vfpops.sh.
+arm_vfpmacros.h: vfpops.sh vfpm_macros.th vfp_macros.th
+	(cd $(srcdir) && bash ./vfpops.sh) > $@t
+	mv $@t $@
+
+CLEANFILES = $(BUILT_SOURCES)
+
+EXTRA_DIST = dpiops.sh mov_macros.th dpi_macros.th cmp_macros.th \
+	vfpm_macros.th vfp_macros.th arm-vfp-codegen.h vfpops.sh
+
--- a/lib/ffts/src/arch/arm/arm-codegen.c
+++ b/lib/ffts/src/arch/arm/arm-codegen.c
@ -0,0 +1,193 @@
+/*
+ * arm-codegen.c
+ * Copyright (c) 2002 Sergey Chaban <serge@wildwestsoftware.com>
+ */
+
+#include "arm-codegen.h"
+
+
+/*
+ * Emit the standard prologue at p: copy SP into IP, push A1-A4, push IP and
+ * LR, then reserve local_size bytes of stack. Sizes that fit in 8 bits use a
+ * single immediate subtract; larger sizes are loaded into IP first.
+ * Returns p after emission (the ARM_* emitters come from arm-codegen.h).
+ */
+arminstr_t* arm_emit_std_prologue(arminstr_t* p, unsigned int local_size) {
+	ARM_MOV_REG_REG(p, ARMREG_IP, ARMREG_SP);
+
+	/* spill the argument registers */
+	ARM_PUSH(p,   (1 << ARMREG_A1)
+	            | (1 << ARMREG_A2)
+	            | (1 << ARMREG_A3)
+	            | (1 << ARMREG_A4));
+
+	ARM_PUSH(p, (1U << ARMREG_IP) | (1U << ARMREG_LR));
+
+	/* nothing to reserve */
+	if (local_size == 0)
+		return p;
+
+	if ((local_size & (~0xFF)) != 0) {
+		/* size needs more than 8 bits: go through IP, then reload IP */
+		/* TODO: optimize */
+		p = arm_mov_reg_imm32(p, ARMREG_IP, local_size);
+		ARM_SUB_REG_REG(p, ARMREG_SP, ARMREG_SP, ARMREG_IP);
+		ARM_ADD_REG_IMM8(p, ARMREG_IP, ARMREG_IP, sizeof(armword_t));
+		ARM_LDR_REG_REG(p, ARMREG_IP, ARMREG_SP, ARMREG_IP);
+	} else {
+		ARM_SUB_REG_IMM8(p, ARMREG_SP, ARMREG_SP, local_size);
+	}
+
+	return p;
+}
+
+/*
+ * Emit the standard epilogue at p: release local_size bytes of stack, then
+ * pop SP, PC and the low ten bits of pop_regs without write-back.
+ * Returns p after emission.
+ */
+arminstr_t* arm_emit_std_epilogue(arminstr_t* p, unsigned int local_size, int pop_regs) {
+	if (local_size != 0) {
+		if ((local_size & (~0xFF)) != 0) {
+			/* size needs more than 8 bits: materialise it in IP */
+			/* TODO: optimize */
+			p = arm_mov_reg_imm32(p, ARMREG_IP, local_size);
+			ARM_ADD_REG_REG(p, ARMREG_SP, ARMREG_SP, ARMREG_IP);
+		} else {
+			ARM_ADD_REG_IMM8(p, ARMREG_SP, ARMREG_SP, local_size);
+		}
+	}
+
+	ARM_POP_NWB(p, (1 << ARMREG_SP) | (1 << ARMREG_PC) | (pop_regs & 0x3FF));
+
+	return p;
+}
+
+
+/*
+ * Lean prologue: like the standard one but A1-A4 are not pushed. IP, LR and
+ * the low ten bits of push_regs (registers up to R10) are saved, then
+ * local_size bytes of stack are reserved. Returns p after emission.
+ */
+arminstr_t* arm_emit_lean_prologue(arminstr_t* p, unsigned int local_size, int push_regs) {
+	ARM_MOV_REG_REG(p, ARMREG_IP, ARMREG_SP);
+	ARM_PUSH(p, (1U << ARMREG_IP) | (1U << ARMREG_LR) | (push_regs & 0x3FF));
+
+	/* nothing to reserve */
+	if (local_size == 0)
+		return p;
+
+	if ((local_size & (~0xFF)) != 0) {
+		/* TODO: optimize */
+		p = arm_mov_reg_imm32(p, ARMREG_IP, local_size);
+		ARM_SUB_REG_REG(p, ARMREG_SP, ARMREG_SP, ARMREG_IP);
+		/*
+		 * Reload IP from the stack.
+		 * NOTE(review): the load address is SP + local_size + one word;
+		 * confirm against ARM_PUSH's layout that this is the slot
+		 * holding IP rather than a neighbouring register.
+		 */
+		ARM_ADD_REG_IMM8(p, ARMREG_IP, ARMREG_IP, sizeof(armword_t));
+		ARM_LDR_REG_REG(p, ARMREG_IP, ARMREG_SP, ARMREG_IP);
+	} else {
+		ARM_SUB_REG_IMM8(p, ARMREG_SP, ARMREG_SP, local_size);
+	}
+
+	return p;
+}
+
+/*
+ * Bit scan forward: returns the 1-based position of the least significant
+ * set bit of val, or 0 when val is zero.
+ */
+int arm_bsf(armword_t val) {
+	int pos = 1;
+	armword_t probe = 1;
+
+	if (val == 0) return 0;
+
+	/* walk a single-bit probe upwards until it hits a set bit */
+	while ((pos <= 8 * sizeof(armword_t)) && ((val & probe) == 0)) {
+		++pos;
+		probe <<= 1;
+	}
+
+	return pos;
+}
+
+
+/*
+ * Returns 1 when val has at most one bit set, 0 otherwise.
+ * Note that val == 0 also yields 1.
+ */
+int arm_is_power_of_2(armword_t val) {
+	const armword_t without_lowest_bit = val & (val - 1);
+	return (without_lowest_bit == 0);
+}
+
+
+/*
+ * Tries every even shift from 0 to 30 and checks whether val (or its
+ * complement) fits entirely inside ARM_SCALE(0xFF, shift).
+ *
+ * returns:
+ *   1 - unable to represent
+ *   non-negative even number (the shift) - MOV-representable
+ *   negative even number (-shift - 2)    - MVN-representable
+ */
+int calc_arm_mov_const_shift(armword_t val) {
+	armword_t mask;
+	int rot;
+
+	for (rot = 0; rot < 32; rot += 2) {
+		mask = ARM_SCALE(0xFF, rot);
+
+		/* all set bits of val fall inside the window */
+		if ((val & (~mask)) == 0)
+			return rot;
+
+		/* all set bits of ~val fall inside the window */
+		if (((~val) & (~mask)) == 0)
+			return -rot - 2;
+	}
+
+	return 1;
+}
+
+
+/*
+ * Returns 1 when val passes arm_is_power_of_2() or when
+ * calc_arm_mov_const_shift() reports it as MOV-representable
+ * (a non-negative result other than 1); returns 0 otherwise.
+ */
+int is_arm_const(armword_t val) {
+	int shift;
+
+	if (arm_is_power_of_2(val))
+		return 1;
+
+	shift = calc_arm_mov_const_shift(val);
+	return (shift >= 0 && shift != 1);
+}
+
+
+/*
+ * Counts how many 8-bit windows, each starting at an even bit position at
+ * or below the lowest remaining set bit, are needed to clear every set bit
+ * of val. Returns 0 for val == 0.
+ */
+int arm_const_steps(armword_t val) {
+	int shift, steps = 0;
+
+	while (val != 0) {
+		/* even bit position at or just below the lowest set bit */
+		shift = (arm_bsf(val) - 1) & (~1);
+		/* unsigned literal: shifting a signed 0xFF by 24 or more overflows int */
+		val &= ~(0xFFU << shift);
+		++steps;
+	}
+	return steps;
+}
+
+
+/*
+ * ARM cannot load arbitrary 32-bit constants directly into registers;
+ * widely used work-around for this is to store constants into a
+ * PC-addressable pool and use LDR instruction with PC-relative address
+ * to load constant into register. Easiest way to implement this is to
+ * embed constant inside a function with unconditional branch around it.
+ * The above method is not used at the moment.
+ * This routine always emits sequence of instructions to generate
+ * requested constant. In the worst case it takes 4 instructions to
+ * synthesize a constant - 1 MOV and 3 subsequent ORRs (or, when the
+ * complement needs fewer steps, 1 MVN and subsequent SUBs).
+ *
+ * p     - emission pointer; the updated pointer is returned
+ * reg   - destination register
+ * imm32 - constant to materialise
+ * cond  - condition code applied to every emitted instruction
+ */
+arminstr_t* arm_mov_reg_imm32_cond(arminstr_t* p, int reg, armword_t imm32, int cond) {
+	int mov_op;
+	int step_op;
+	int snip;
+	int shift = calc_arm_mov_const_shift(imm32);
+
+	/* calc_arm_mov_const_shift() returns 1 only for "not representable" */
+	if ((shift & 0x80000001) != 1) {
+		if (shift >= 0) {
+			ARM_MOV_REG_IMM_COND(p, reg, imm32 >> ((32 - shift) & 31), shift, cond);
+		} else {
+			ARM_MVN_REG_IMM_COND(p, reg, (imm32 ^ (~0)) >> ((32 + 2 + shift) & 31), (-shift - 2), cond);
+		}
+	} else {
+		mov_op = ARMOP_MOV;
+		step_op = ARMOP_ORR;
+
+		/* build the complement instead when that takes fewer steps */
+		if (arm_const_steps(imm32) > arm_const_steps(~imm32)) {
+			mov_op = ARMOP_MVN;
+			step_op = ARMOP_SUB;
+			imm32 = ~imm32;
+		}
+
+		shift = (arm_bsf(imm32) - 1) & (~1);
+		/* unsigned literal: shifting a signed 0xFF by 24 or more overflows int */
+		snip = imm32 & (0xFFU << shift);
+		ARM_EMIT(p, ARM_DEF_DPI_IMM_COND((unsigned)snip >> shift, (32 - shift) >> 1, reg, 0, 0, mov_op, cond));
+
+		/* peel off the remaining 8-bit pieces, lowest first */
+		while ((imm32 ^= snip) != 0) {
+			shift = (arm_bsf(imm32) - 1) & (~1);
+			snip = imm32 & (0xFFU << shift);
+			ARM_EMIT(p, ARM_DEF_DPI_IMM_COND((unsigned)snip >> shift, (32 - shift) >> 1, reg, reg, 0, step_op, cond));
+		}
+	}
+
+	return p;
+}
+
+
+/* Unconditional variant: forwards to arm_mov_reg_imm32_cond() with ARMCOND_AL. */
+arminstr_t* arm_mov_reg_imm32(arminstr_t* p, int reg, armword_t imm32) {
+	return arm_mov_reg_imm32_cond(p, reg, imm32, ARMCOND_AL);
+}
+
+
+
--- a/lib/ffts/src/arch/arm/arm-codegen.h
+++ b/lib/ffts/src/arch/arm/arm-codegen.h
--- a/lib/ffts/src/arch/arm/arm-dis.c
+++ b/lib/ffts/src/arch/arm/arm-dis.c
@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2002 Sergey Chaban <serge@wildwestsoftware.com>
+ */
+
+
+#include <stdarg.h>
+
+#include "arm-dis.h"
+#include "arm-codegen.h"
+
+
+/* Shared disassembler instance used by the _armdis_*() wrappers;
+ * allocated on first use by init_gdisasm(). */
+static ARMDis* gdisasm = NULL;
+
+/* When non-zero, dump_reg() prints names from reg_alias[] for every
+ * register outside r4..r10. */
+static int use_reg_alias = 1;
+
+/* Condition-code suffixes, indexed by the cond field of an instruction.
+ * Entry 14 is empty, so that condition adds no suffix. */
+const static char* cond[] = {
+	"eq", "ne", "cs", "cc", "mi", "pl", "vs", "vc",
+	"hi", "ls", "ge", "lt", "gt", "le", "", "nv"
+};
+
+/* Data-processing mnemonics, indexed by the opcode field in dump_dpi(). */
+const static char* ops[] = {
+	"and", "eor", "sub", "rsb", "add", "adc", "sbc", "rsc",
+	"tst", "teq", "cmp", "cmn", "orr", "mov", "bic", "mvn"
+};
+
+/* Shift-type names, indexed by the shift type field of a register operand. */
+const static char* shift_types[] = {"lsl", "lsr", "asr", "ror"};
+
+/* Multiply mnemonics, indexed by the opcode field in dump_mul();
+ * "?" marks opcodes that have no mnemonic here. */
+const static char* mul_ops[] = {
+	"mul", "mla", "?", "?", "umull", "umlal", "smull", "smlal"
+};
+
+/* Register names used by dump_reg() when aliasing is enabled; indexed by
+ * register number 0..15. */
+const static char* reg_alias[] = {
+	"a1", "a2", "a3", "a4",
+	"r4", "r5", "r6", "r7", "r8", "r9", "r10",
+	"fp", "ip", "sp", "lr", "pc"
+};
+
+/* PSR field names, indexed by i.msr.all.fld in dump_msr().
+ * NOTE(review): only 5 entries - confirm fld can never exceed 4. */
+const static char* msr_fld[] = {"f", "c", "x", "?", "s"};
+
+
+/* private functions prototypes (to keep compiler happy) */
+
+/* output and operand helpers */
+void chk_out(ARMDis* dis);
+void dump_reg(ARMDis* dis, int reg);
+void dump_creg(ARMDis* dis, int creg);
+void dump_reglist(ARMDis* dis, int reg_list);
+void init_gdisasm(void);
+
+/* one printer per instruction class; armdis_decode() selects among them
+ * by testing the raw instruction word against the ARM_*_MASK/ARM_*_TAG pairs */
+void dump_br(ARMDis* dis, ARMInstr i);
+void dump_cdp(ARMDis* dis, ARMInstr i);
+void dump_cdt(ARMDis* dis, ARMInstr i);
+void dump_crt(ARMDis* dis, ARMInstr i);
+void dump_dpi(ARMDis* dis, ARMInstr i);
+void dump_hxfer(ARMDis* dis, ARMInstr i);
+void dump_mrs(ARMDis* dis, ARMInstr i);
+void dump_mrt(ARMDis* dis, ARMInstr i);
+void dump_msr(ARMDis* dis, ARMInstr i);
+void dump_mul(ARMDis* dis, ARMInstr i);
+void dump_swi(ARMDis* dis, ARMInstr i);
+void dump_swp(ARMDis* dis, ARMInstr i);
+void dump_wxfer(ARMDis* dis, ARMInstr i);
+void dump_clz(ARMDis* dis, ARMInstr i);
+
+
+/*
+void out(ARMDis* dis, const char* format, ...) {
+	va_list arglist;
+	va_start(arglist, format);
+	fprintf(dis->dis_out, format, arglist);
+	va_end(arglist);
+}
+*/
+
+
+/* Make sure dis has an output stream: a missing stream falls back to
+ * stdout. A NULL dis is ignored. */
+void chk_out(ARMDis* dis) {
+	if (dis == NULL) {
+		return;
+	}
+
+	if (dis->dis_out == NULL) {
+		dis->dis_out = stdout;
+	}
+}
+
+
+/* Direct the output of dis to f. Passing f == NULL selects stdout (via
+ * chk_out()). A NULL dis is ignored. */
+void armdis_set_output(ARMDis* dis, FILE* f) {
+	if (dis == NULL) {
+		return;
+	}
+
+	dis->dis_out = f;
+	chk_out(dis);
+}
+
+/* Return the stream dis currently writes to, or NULL when dis is NULL. */
+FILE* armdis_get_output(ARMDis* dis) {
+	if (dis == NULL) {
+		return NULL;
+	}
+
+	return dis->dis_out;
+}
+
+
+
+
+/* Print the name of a core register. Only the low four bits of reg are
+ * used. Registers 4..10 are always printed as "rN"; the others are
+ * printed through reg_alias[] while use_reg_alias is set. */
+void dump_reg(ARMDis* dis, int reg) {
+	const int idx = reg & 0xF;
+	const int numbered_only = (idx >= 4 && idx <= 10);
+
+	if (use_reg_alias && !numbered_only) {
+		fprintf(dis->dis_out, "%s", reg_alias[idx]);
+		return;
+	}
+
+	fprintf(dis->dis_out, "r%d", idx);
+}
+
+/* Print a coprocessor register as "cN", using only the low four bits of
+ * creg. A NULL dis is ignored. */
+void dump_creg(ARMDis* dis, int creg) {
+	if (dis == NULL) {
+		return;
+	}
+
+	fprintf(dis->dis_out, "c%d", creg & 0xF);
+}
+
+/*
+ * Print the registers selected by the low 16 bits of reg_list (bit N
+ * selects register N). A run of three or more consecutive registers is
+ * printed as "first-last", a run of two as "first, second"; separate
+ * runs are joined with ", ".
+ */
+void dump_reglist(ARMDis* dis, int reg_list) {
+	/* i: current register, m1: its bit mask, n: number of runs printed so far */
+	int i = 0, j, n = 0;
+	int m1 = 1, m2, rn;
+	while (i < 16) {
+		if ((reg_list & m1) != 0) {
+			if (n != 0) fprintf(dis->dis_out, ", ");
+			n++;
+			dump_reg(dis, i);
+			/* rn = how many registers directly after i are also selected */
+			for (j = i+1, rn = 0, m2 = m1<<1; j < 16; ++j, m2<<=1) {
+				if ((reg_list & m2) != 0) ++rn;
+				else break;
+			}
+			/* move to the last register of the run */
+			i+=rn;
+			if (rn > 1) {
+				fprintf(dis->dis_out, "-");
+				dump_reg(dis, i);
+			} else if (rn == 1) {
+				fprintf(dis->dis_out, ", ");
+				dump_reg(dis, i);
+			}
+			/* step the mask and the index past the whole run */
+			m1<<=(rn+1);
+			i++;
+		} else {
+			++i;
+			m1<<=1;
+		}
+	}
+}
+
+
+/*
+ * Print a branch: "b" or "bl" plus condition suffix, the raw offset field
+ * in hex, then a comment with the instruction address and the computed
+ * target.
+ */
+void dump_br(ARMDis* dis, ARMInstr i) {
+	/* target = address + 8 + offset * 4; shifting left by 8 and then right
+	 * by 6 as a signed int sign-extends the offset, which assumes it
+	 * occupies 24 bits - TODO confirm against the ARMInstr definition.
+	 * NOTE(review): (int)dis->pi truncates the pointer on hosts where
+	 * pointers are wider than int. */
+	fprintf(dis->dis_out, "b%s%s\t%x\t; %p -> %#x",
+	    (i.br.link == 1) ? "l" : "",
+	    cond[i.br.cond], i.br.offset, dis->pi, (int)dis->pi + 4*2 + ((int)(i.br.offset << 8) >> 6));
+}
+
+
+/*
+ * Print a data-processing instruction: mnemonic, condition suffix,
+ * optional "s" flag, then Rd, Rn and the second operand as applicable.
+ */
+void dump_dpi(ARMDis* dis, ARMInstr i) {
+	fprintf(dis->dis_out, "%s%s", ops[i.dpi.all.opcode], cond[i.dpi.all.cond]);
+
+	/* the "s" suffix is only printed for opcodes outside the TST..CMN range */
+	if ((i.dpi.all.opcode < ARMOP_TST || i.dpi.all.opcode > ARMOP_CMN) && (i.dpi.all.s != 0)) {
+		fprintf(dis->dis_out, "s");
+	}
+
+	fprintf(dis->dis_out, "\t");
+
+	if ((i.dpi.all.opcode < ARMOP_TST) || (i.dpi.all.opcode > ARMOP_CMN)) {
+		/* for comparison operations Rd is ignored */
+		dump_reg(dis, i.dpi.all.rd);
+		fprintf(dis->dis_out, ", ");
+	}
+
+	if ((i.dpi.all.opcode != ARMOP_MOV) && (i.dpi.all.opcode != ARMOP_MVN)) {
+		/* for MOV/MVN Rn is ignored */
+		dump_reg(dis, i.dpi.all.rn);
+		fprintf(dis->dis_out, ", ");
+	}
+
+	if (i.dpi.all.type == 1) {
+		/* immediate */
+		if (i.dpi.op2_imm.rot != 0) {
+			/* prints the 8-bit value, the rotation (field * 2), and the
+			 * value ARM_SCALE() computes from the two */
+			fprintf(dis->dis_out, "#%d, %d\t; 0x%x", i.dpi.op2_imm.imm, i.dpi.op2_imm.rot << 1,
+			        ARM_SCALE(i.dpi.op2_imm.imm, (i.dpi.op2_imm.rot << 1)) );
+		} else {
+			fprintf(dis->dis_out, "#%d\t; 0x%x", i.dpi.op2_imm.imm, i.dpi.op2_imm.imm);
+		}
+	} else {
+		/* reg-reg */
+		if (i.dpi.op2_reg.tag == 0) {
+			/* op2 is reg shift by imm */
+			dump_reg(dis, i.dpi.op2_reg_imm.r2.rm);
+			/* a zero shift amount is not printed */
+			if (i.dpi.op2_reg_imm.imm.shift != 0) {
+				fprintf(dis->dis_out, " %s #%d", shift_types[i.dpi.op2_reg_imm.r2.type], i.dpi.op2_reg_imm.imm.shift);
+			}
+		} else {
+			/* op2 is reg shift by reg */
+			dump_reg(dis, i.dpi.op2_reg_reg.r2.rm);
+			fprintf(dis->dis_out, " %s ", shift_types[i.dpi.op2_reg_reg.r2.type]);
+			dump_reg(dis, i.dpi.op2_reg_reg.reg.rs);
+		}
+
+	}
+}
+
+/*
+ * Print a word/byte load or store: "ldr"/"str" plus condition, "b" and
+ * "t" suffixes, the transferred register, and the address operand in
+ * pre- or post-indexed form.
+ */
+void dump_wxfer(ARMDis* dis, ARMInstr i) {
+	/* "t" is printed only for loads that have the write-back bit set */
+	fprintf(dis->dis_out, "%s%s%s%s\t",
+		(i.wxfer.all.ls == 0) ? "str" : "ldr",
+		cond[i.generic.cond],
+		(i.wxfer.all.b == 0) ? "" : "b",
+		(i.wxfer.all.ls != 0 && i.wxfer.all.wb != 0) ? "t" : "");
+	dump_reg(dis, i.wxfer.all.rd);
+	fprintf(dis->dis_out, ", [");
+	dump_reg(dis, i.wxfer.all.rn);
+	/* p == 0: the bracket closes right after the base register */
+	fprintf(dis->dis_out, "%s, ", (i.wxfer.all.p == 0) ? "]" : "");
+
+	if (i.wxfer.all.type == 0) { /* imm */
+		/* u == 0 means the offset is subtracted */
+		fprintf(dis->dis_out, "#%s%d", (i.wxfer.all.u == 0) ? "-" : "", i.wxfer.all.op2_imm);
+	} else {
+		dump_reg(dis, i.wxfer.op2_reg_imm.r2.rm);
+		if (i.wxfer.op2_reg_imm.imm.shift != 0) {
+			fprintf(dis->dis_out, " %s #%d", shift_types[i.wxfer.op2_reg_imm.r2.type], i.wxfer.op2_reg_imm.imm.shift);
+		}
+	}
+
+	if (i.wxfer.all.p != 0) {
+		/* close pre-index instr, also check for write-back */
+		fprintf(dis->dis_out, "]%s", (i.wxfer.all.wb != 0) ? "!" : "");
+	}
+}
+
+/*
+ * Print a halfword / signed-byte load or store: "ldr"/"str" plus
+ * condition, optional "s", then "h" or "b", the transferred register,
+ * and the address operand in pre- or post-indexed form.
+ */
+void dump_hxfer(ARMDis* dis, ARMInstr i) {
+	fprintf(dis->dis_out, "%s%s%s%s\t",
+		(i.hxfer.ls == 0) ? "str" : "ldr",
+		cond[i.generic.cond],
+		(i.hxfer.s != 0) ? "s" : "",
+		(i.hxfer.h != 0) ? "h" : "b");
+	dump_reg(dis, i.hxfer.rd);
+	fprintf(dis->dis_out, ", [");
+	dump_reg(dis, i.hxfer.rn);
+	/* p == 0: the bracket closes right after the base register */
+	fprintf(dis->dis_out, "%s, ", (i.hxfer.p == 0) ? "]" : "");
+
+	if (i.hxfer.type != 0) { /* imm */
+		/* the immediate is split: imm_hi holds the upper part and the rm
+		 * field the low four bits */
+		fprintf(dis->dis_out, "#%s%d", (i.hxfer.u == 0) ? "-" : "", (i.hxfer.imm_hi << 4) | i.hxfer.rm);
+	} else {
+		dump_reg(dis, i.hxfer.rm);
+	}
+
+	if (i.hxfer.p != 0) {
+		/* close pre-index instr, also check for write-back */
+		fprintf(dis->dis_out, "]%s", (i.hxfer.wb != 0) ? "!" : "");
+	}
+}
+
+
+/* Print a multiple-register transfer: "ldm"/"stm" with condition and the
+ * two addressing-mode letters, the base register (with "!" when
+ * write-back is set) and the register list in braces. */
+void dump_mrt(ARMDis* dis, ARMInstr i) {
+	FILE* out = dis->dis_out;
+	const char* mnemonic = (i.mrt.ls != 0) ? "ldm" : "stm";
+	const char* direction = (i.mrt.u != 0) ? "i" : "d";
+	const char* timing = (i.mrt.p != 0) ? "b" : "a";
+
+	fprintf(out, "%s%s%s%s\t", mnemonic, cond[i.mrt.cond], direction, timing);
+	dump_reg(dis, i.mrt.rn);
+	fprintf(out, "%s, {", (i.mrt.wb == 0) ? "" : "!");
+	dump_reglist(dis, i.mrt.reg_list);
+	fprintf(out, "}");
+}
+
+
+/* Print a swap instruction as "swp<cond>[b] Rd, Rm, [Rn]". */
+void dump_swp(ARMDis* dis, ARMInstr i) {
+	FILE* out = dis->dis_out;
+	const char* byte_suffix = (i.swp.b == 0) ? "" : "b";
+
+	fprintf(out, "swp%s%s ", cond[i.swp.cond], byte_suffix);
+	dump_reg(dis, i.swp.rd);
+	fprintf(out, ", ");
+	dump_reg(dis, i.swp.rm);
+	fprintf(out, ", [");
+	dump_reg(dis, i.swp.rn);
+	fprintf(out, "]");
+}
+
+
+/*
+ * Print a multiply instruction: mnemonic from mul_ops[], condition,
+ * optional "s", then the operand registers in the order the opcode
+ * requires. Opcodes without a case below are printed as raw data.
+ */
+void dump_mul(ARMDis* dis, ARMInstr i) {
+	fprintf(dis->dis_out, "%s%s%s\t", mul_ops[i.mul.opcode], cond[i.mul.cond], (i.mul.s != 0) ? "s" : "");
+	switch (i.mul.opcode) {
+	case ARMOP_MUL:
+		/* three operands: rd, rm, rs */
+		dump_reg(dis, i.mul.rd);
+		fprintf(dis->dis_out, ", ");
+		dump_reg(dis, i.mul.rm);
+		fprintf(dis->dis_out, ", ");
+		dump_reg(dis, i.mul.rs);
+		break;
+	case ARMOP_MLA:
+		/* four operands: rd, rm, rs, rn */
+		dump_reg(dis, i.mul.rd);
+		fprintf(dis->dis_out, ", ");
+		dump_reg(dis, i.mul.rm);
+		fprintf(dis->dis_out, ", ");
+		dump_reg(dis, i.mul.rs);
+		fprintf(dis->dis_out, ", ");
+		dump_reg(dis, i.mul.rn);
+		break;
+	case ARMOP_UMULL:
+	case ARMOP_UMLAL:
+	case ARMOP_SMULL:
+	case ARMOP_SMLAL:
+		/* long forms: rd, rn, rm, rs */
+		dump_reg(dis, i.mul.rd);
+		fprintf(dis->dis_out, ", ");
+		dump_reg(dis, i.mul.rn);
+		fprintf(dis->dis_out, ", ");
+		dump_reg(dis, i.mul.rm);
+		fprintf(dis->dis_out, ", ");
+		dump_reg(dis, i.mul.rs);
+		break;
+	default:
+		/* the mnemonic ("?" for the unnamed opcodes) has already been
+		 * printed above; the raw word follows it */
+		fprintf(dis->dis_out, "DCD 0x%x\t; <unknown>", i.raw);
+		break;
+	}
+}
+
+
+/* Print a coprocessor data operation as
+ * "cdp<cond> p<cpn>, <op>, cRd, cRn, cRm[, <op2>]"; op2 is omitted when
+ * it is zero. */
+void dump_cdp(ARMDis* dis, ARMInstr i) {
+	FILE* out = dis->dis_out;
+
+	fprintf(out, "cdp%s\tp%d, %d, ", cond[i.generic.cond], i.cdp.cpn, i.cdp.op);
+	dump_creg(dis, i.cdp.crd);
+	fprintf(out, ", ");
+	dump_creg(dis, i.cdp.crn);
+	fprintf(out, ", ");
+	dump_creg(dis, i.cdp.crm);
+
+	if (i.cdp.op2 == 0) {
+		return;
+	}
+
+	fprintf(out, ", %d", i.cdp.op2);
+}
+
+
+/*
+ * Print a coprocessor data transfer: "ldc"/"stc" plus condition and
+ * optional "l", the coprocessor number, the coprocessor register and the
+ * address operand "[Rn...]".
+ *
+ * The address operand is opened with "[" before the base register so
+ * that it balances the "]" printed below in both the post-indexed
+ * (p == 0) and pre-indexed (p != 0) forms.
+ */
+void dump_cdt(ARMDis* dis, ARMInstr i) {
+	fprintf(dis->dis_out, "%s%s%s\tp%d, ", (i.cdt.ls == 0) ? "stc" : "ldc",
+	        cond[i.generic.cond], (i.cdt.n != 0) ? "l" : "", i.cdt.cpn);
+	dump_creg(dis, i.cdt.crd);
+	fprintf(dis->dis_out, ", [");
+	dump_reg(dis, i.cdt.rn);
+
+	if (i.cdt.p == 0) {
+		/* post-indexed: the bracket closes right after the base register */
+		fprintf(dis->dis_out, "]");
+	}
+
+	/* a zero offset is not printed */
+	if (i.cdt.offs != 0) {
+		fprintf(dis->dis_out, ", #%d", i.cdt.offs);
+	}
+
+	if (i.cdt.p != 0) {
+		/* pre-indexed: close the bracket, "!" marks write-back */
+		fprintf(dis->dis_out, "]%s", (i.cdt.wb != 0) ? "!" : "");
+	}
+}
+
+
+/*
+ * Print a coprocessor register transfer as
+ * "<mnemonic><cond> p<cpn>, <op1>, Rd, cRn, cRm[, <op2>]"; op2 is
+ * omitted when it is zero.
+ *
+ * NOTE(review): the mnemonic is "mrc" when ls == 0 and "mcr" otherwise.
+ * In the ARM encoding the load bit set denotes MRC - confirm that the ls
+ * field is that bit and whether the two strings are swapped.
+ */
+void dump_crt(ARMDis* dis, ARMInstr i) {
+	fprintf(dis->dis_out, "%s%s\tp%d, %d, ", (i.crt.ls == 0) ? "mrc" : "mcr",
+	        cond[i.generic.cond], i.crt.cpn, i.crt.op1);
+	dump_reg(dis, i.crt.rd);
+	fprintf(dis->dis_out, ", ");
+	dump_creg(dis, i.crt.crn);
+	fprintf(dis->dis_out, ", ");
+	dump_creg(dis, i.crt.crm);
+
+	if (i.crt.op2 != 0) {
+		fprintf(dis->dis_out, ", %d", i.crt.op2);
+	}
+}
+
+
+/*
+ * Print an MSR instruction: "msr<cond>", the PSR name followed by "_",
+ * then either a field name from msr_fld[] and the source register, or
+ * "f" and an immediate.
+ *
+ * NOTE(review): msr_fld[] has 5 entries; confirm i.msr.all.fld cannot
+ * index past it. Also confirm that sel == 0 really selects the "s" PSR,
+ * and that the ", " printed between "psr_" and the field name is intended.
+ */
+void dump_msr(ARMDis* dis, ARMInstr i) {
+	fprintf(dis->dis_out, "msr%s\t%spsr_, ", cond[i.generic.cond],
+	        (i.msr.all.sel == 0) ? "s" : "c");
+	if (i.msr.all.type == 0) {
+		/* reg */
+		fprintf(dis->dis_out, "%s, ", msr_fld[i.msr.all.fld]);
+		dump_reg(dis, i.msr.all.rm);
+	} else {
+		/* imm */
+		/* the value printed is imm shifted left by the rot field */
+		fprintf(dis->dis_out, "f, #%d", i.msr.op2_imm.imm << i.msr.op2_imm.rot);
+	}
+}
+
+
+/* Print an MRS instruction as "mrs<cond> Rd, <s|c>psr"; "s" is chosen
+ * when the sel field is zero. */
+void dump_mrs(ARMDis* dis, ARMInstr i) {
+	FILE* out = dis->dis_out;
+	const char* which = (i.mrs.sel != 0) ? "c" : "s";
+
+	fprintf(out, "mrs%s\t", cond[i.generic.cond]);
+	dump_reg(dis, i.mrs.rd);
+	fprintf(out, ", %spsr", which);
+}
+
+
+/* Print a software interrupt as "swi<cond> <num>". */
+void dump_swi(ARMDis* dis, ARMInstr i) {
+	const char* suffix = cond[i.generic.cond];
+
+	fprintf(dis->dis_out, "swi%s\t%d", suffix, i.swi.num);
+}
+
+
+/*
+ * Print a count-leading-zeros instruction as "clz Rd, Rm".
+ *
+ * No newline is written here: like the other dump_*() printers this
+ * leaves the line open, and armdis_decode() terminates it after every
+ * instruction. Printing one here as well produced a blank line after
+ * each clz.
+ */
+void dump_clz(ARMDis* dis, ARMInstr i) {
+	fprintf(dis->dis_out, "clz\t");
+	dump_reg(dis, i.clz.rd);
+	fprintf(dis->dis_out, ", ");
+	dump_reg(dis, i.clz.rm);
+}
+
+
+
+/*
+ * Disassemble a block of instructions to the output stream of dis.
+ *
+ * dis:  disassembler state; the call does nothing when it is NULL.
+ * p:    address of the first instruction word.
+ * size: length of the block in bytes; it is divided by
+ *       sizeof(arminstr_t) to get the number of instructions.
+ *
+ * Each instruction is printed on its own line as address, raw word and
+ * decoded text. Words that match none of the known classes are printed
+ * as "DCD".
+ */
+void armdis_decode(ARMDis* dis, void* p, int size) {
+	int i;
+	arminstr_t* pi = (arminstr_t*)p;
+	ARMInstr instr;
+
+	if (dis == NULL) return;
+
+	/* make sure there is a stream to write to (defaults to stdout) */
+	chk_out(dis);
+
+	/* bytes -> instruction count */
+	size/=sizeof(arminstr_t);
+
+	for (i=0; i<size; ++i) {
+		fprintf(dis->dis_out, "%p:\t%08x\t", pi, *pi);
+		/* remember the instruction address; dump_br() uses it to
+		 * compute the branch target */
+		dis->pi = pi;
+		instr.raw = *pi++;
+
+		/* The first matching mask/tag pair wins, so the order of these
+		 * tests is significant wherever the patterns overlap. */
+		if ((instr.raw & ARM_BR_MASK) == ARM_BR_TAG) {
+			dump_br(dis, instr);
+		} else if ((instr.raw & ARM_SWP_MASK) == ARM_SWP_TAG) {
+			dump_swp(dis, instr);
+		} else if ((instr.raw & ARM_MUL_MASK) == ARM_MUL_TAG) {
+			dump_mul(dis, instr);
+		} else if ((instr.raw & ARM_CLZ_MASK) == ARM_CLZ_TAG) {
+			dump_clz(dis, instr);
+		} else if ((instr.raw & ARM_WXFER_MASK) == ARM_WXFER_TAG) {
+			dump_wxfer(dis, instr);
+		} else if ((instr.raw & ARM_HXFER_MASK) == ARM_HXFER_TAG) {
+			dump_hxfer(dis, instr);
+		} else if ((instr.raw & ARM_DPI_MASK) == ARM_DPI_TAG) {
+			dump_dpi(dis, instr);
+		} else if ((instr.raw & ARM_MRT_MASK) == ARM_MRT_TAG) {
+			dump_mrt(dis, instr);
+		} else if ((instr.raw & ARM_CDP_MASK) == ARM_CDP_TAG) {
+			dump_cdp(dis, instr);
+		} else if ((instr.raw & ARM_CDT_MASK) == ARM_CDT_TAG) {
+			dump_cdt(dis, instr);
+		} else if ((instr.raw & ARM_CRT_MASK) == ARM_CRT_TAG) {
+			dump_crt(dis, instr);
+		} else if ((instr.raw & ARM_MSR_MASK) == ARM_MSR_TAG) {
+			dump_msr(dis, instr);
+		} else if ((instr.raw & ARM_MRS_MASK) == ARM_MRS_TAG) {
+			dump_mrs(dis, instr);
+		} else if ((instr.raw & ARM_SWI_MASK) == ARM_SWI_TAG) {
+			dump_swi(dis, instr);
+		} else {
+			fprintf(dis->dis_out, "DCD 0x%x\t; <unknown>", instr.raw);
+		}
+
+		/* terminate the line for whichever printer ran */
+		fprintf(dis->dis_out, "\n");
+	}
+}
+
+
+/* Open dump_name for writing and make it the output of dis. If the file
+ * cannot be opened, fopen() yields NULL and armdis_set_output() falls
+ * back to stdout. Nothing happens when dis or dump_name is NULL. */
+void armdis_open(ARMDis* dis, const char* dump_name) {
+	FILE* stream;
+
+	if (dis == NULL || dump_name == NULL) {
+		return;
+	}
+
+	stream = fopen(dump_name, "w");
+	armdis_set_output(dis, stream);
+}
+
+
+/*
+ * Close the output stream of dis and clear it, unless the stream is
+ * stdout or stderr (those are left open and left assigned).
+ *
+ * A NULL dis is ignored, matching the other armdis_*() entry points.
+ * This matters for _armdis_close(): if the allocation in init_gdisasm()
+ * fails, the shared instance it passes here is NULL.
+ */
+void armdis_close(ARMDis* dis) {
+	if (dis == NULL) {
+		return;
+	}
+
+	if (dis->dis_out != NULL && dis->dis_out != stdout && dis->dis_out != stderr) {
+		fclose(dis->dis_out);
+		dis->dis_out = NULL;
+	}
+}
+
+
+/*
+ * Convenience wrapper: open dump_name as the output of dis, disassemble
+ * size bytes starting at p, then close the output again. The three
+ * steps must run in this order.
+ */
+void armdis_dump(ARMDis* dis, const char* dump_name, void* p, int size) {
+	armdis_open(dis, dump_name);
+	armdis_decode(dis, p, size);
+	armdis_close(dis);
+}
+
+
+/* Initialise dis so that it writes to stdout. A NULL dis is ignored. */
+void armdis_init(ARMDis* dis) {
+	if (dis == NULL) {
+		return;
+	}
+
+	/* a NULL stream makes armdis_set_output() select stdout */
+	armdis_set_output(dis, NULL);
+}
+
+
+
+
+/* Allocate and initialise the shared disassembler on first use; later
+ * calls leave the existing instance untouched. */
+void init_gdisasm() {
+	if (gdisasm != NULL) {
+		return;
+	}
+
+	gdisasm = (ARMDis*)malloc(sizeof(ARMDis));
+	/* armdis_init() ignores a NULL pointer, so a failed allocation
+	 * simply leaves gdisasm NULL */
+	armdis_init(gdisasm);
+}
+
+/*
+ * The _armdis_*() functions below mirror the armdis_*() API but operate
+ * on the shared instance gdisasm, which each of them creates on first
+ * use through init_gdisasm().
+ */
+
+/* Set the output stream of the shared disassembler (NULL selects stdout). */
+void _armdis_set_output(FILE* f) {
+	init_gdisasm();
+	armdis_set_output(gdisasm, f);
+}
+
+/* Return the output stream of the shared disassembler. */
+FILE* _armdis_get_output() {
+	init_gdisasm();
+	return armdis_get_output(gdisasm);
+}
+
+/* Disassemble size bytes at p with the shared disassembler. */
+void _armdis_decode(void* p, int size) {
+	init_gdisasm();
+	armdis_decode(gdisasm, p, size);
+}
+
+/* Open dump_name as the output of the shared disassembler. */
+void _armdis_open(const char* dump_name) {
+	init_gdisasm();
+	armdis_open(gdisasm, dump_name);
+}
+
+/* Close the output of the shared disassembler. */
+void _armdis_close() {
+	init_gdisasm();
+	armdis_close(gdisasm);
+}
+
+/* Open dump_name, disassemble size bytes at p into it, and close it. */
+void _armdis_dump(const char* dump_name, void* p, int size) {
+	init_gdisasm();
+	armdis_dump(gdisasm, dump_name, p, size);
+}
+
--- a/lib/ffts/src/arch/arm/arm-dis.h
+++ b/lib/ffts/src/arch/arm/arm-dis.h
@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2002 Sergey Chaban <serge@wildwestsoftware.com>
+ */
+
+#ifndef ARM_DIS
+#define ARM_DIS
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* State of one disassembler instance. */
+typedef struct _ARMDis {
+	FILE* dis_out;	/* stream the disassembly is written to */
+	void* pi;	/* address of the instruction currently being decoded */
+} ARMDis;
+
+
+/* Variants that operate on a single shared disassembler instance. */
+void _armdis_set_output(FILE* f);
+FILE* _armdis_get_output(void);
+void _armdis_decode(void* p, int size);
+void _armdis_open(const char* dump_name);
+void _armdis_close(void);
+void _armdis_dump(const char* dump_name, void* p, int size);
+
+
+/* Variants that operate on a caller-supplied instance. In the decode and
+ * dump functions, p is the address of the code and size its length in
+ * bytes. */
+void armdis_init(ARMDis* dis);
+void armdis_set_output(ARMDis* dis, FILE* f);
+FILE* armdis_get_output(ARMDis* dis);
+void armdis_decode(ARMDis* dis, void* p, int size);
+void armdis_open(ARMDis* dis, const char* dump_name);
+void armdis_close(ARMDis* dis);
+void armdis_dump(ARMDis* dis, const char* dump_name, void* p, int size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ARM_DIS */
--- a/lib/ffts/src/arch/arm/arm-vfp-codegen.h
+++ b/lib/ffts/src/arch/arm/arm-vfp-codegen.h
@ -0,0 +1,247 @@
+//
+// Copyright 2011 Xamarin Inc
+//
+
+#ifndef __MONO_ARM_VFP_CODEGEN_H__
+#define __MONO_ARM_VFP_CODEGEN_H__
+
+#include "arm-codegen.h"
+
+/*
+ * VFP register numbers, coprocessor numbers, operation encodings and
+ * system register selectors used by the ARM_DEF_VFP_* macros.
+ */
+enum {
+	/* VFP registers */
+	ARM_VFP_F0,
+	ARM_VFP_F1,
+	ARM_VFP_F2,
+	ARM_VFP_F3,
+	ARM_VFP_F4,
+	ARM_VFP_F5,
+	ARM_VFP_F6,
+	ARM_VFP_F7,
+	ARM_VFP_F8,
+	ARM_VFP_F9,
+	ARM_VFP_F10,
+	ARM_VFP_F11,
+	ARM_VFP_F12,
+	ARM_VFP_F13,
+	ARM_VFP_F14,
+	ARM_VFP_F15,
+	ARM_VFP_F16,
+	ARM_VFP_F17,
+	ARM_VFP_F18,
+	ARM_VFP_F19,
+	ARM_VFP_F20,
+	ARM_VFP_F21,
+	ARM_VFP_F22,
+	ARM_VFP_F23,
+	ARM_VFP_F24,
+	ARM_VFP_F25,
+	ARM_VFP_F26,
+	ARM_VFP_F27,
+	ARM_VFP_F28,
+	ARM_VFP_F29,
+	ARM_VFP_F30,
+	ARM_VFP_F31,
+
+	/* double registers: Dn has the number of the even single register
+	 * F(2n), so the encoding macros can split any register number into
+	 * (reg >> 1) and (reg & 1) */
+	ARM_VFP_D0 = ARM_VFP_F0,
+	ARM_VFP_D1 = ARM_VFP_F2,
+	ARM_VFP_D2 = ARM_VFP_F4,
+	ARM_VFP_D3 = ARM_VFP_F6,
+	ARM_VFP_D4 = ARM_VFP_F8,
+	ARM_VFP_D5 = ARM_VFP_F10,
+	ARM_VFP_D6 = ARM_VFP_F12,
+	ARM_VFP_D7 = ARM_VFP_F14,
+	ARM_VFP_D8 = ARM_VFP_F16,
+	ARM_VFP_D9 = ARM_VFP_F18,
+	ARM_VFP_D10 = ARM_VFP_F20,
+	ARM_VFP_D11 = ARM_VFP_F22,
+	ARM_VFP_D12 = ARM_VFP_F24,
+	ARM_VFP_D13 = ARM_VFP_F26,
+	ARM_VFP_D14 = ARM_VFP_F28,
+	ARM_VFP_D15 = ARM_VFP_F30,
+
+	/* coprocessor numbers for single and double precision */
+	ARM_VFP_COPROC_SINGLE = 10,
+	ARM_VFP_COPROC_DOUBLE = 11,
+
+/* ARM_VFP_OP places its four arguments at bits 23, 21, 20 and 6.
+ * ARM_VFP_OP2 sets all four of those bits and adds Fn at bit 16 and N at
+ * bit 7. */
+#define ARM_VFP_OP(p,q,r,s) (((p) << 23) | ((q) << 21) | ((r) << 20) | ((s) << 6))
+#define ARM_VFP_OP2(Fn,N) (ARM_VFP_OP (1,1,1,1) | ((Fn) << 16) | ((N) << 7))
+
+	/* operations built with ARM_VFP_OP */
+	ARM_VFP_MUL = ARM_VFP_OP (0,1,0,0),
+	ARM_VFP_NMUL = ARM_VFP_OP (0,1,0,1),
+	ARM_VFP_ADD = ARM_VFP_OP (0,1,1,0),
+	ARM_VFP_SUB = ARM_VFP_OP (0,1,1,1),
+	ARM_VFP_DIV = ARM_VFP_OP (1,0,0,0),
+
+	/* operations built with ARM_VFP_OP2 */
+	ARM_VFP_CPY = ARM_VFP_OP2 (0,0),
+	ARM_VFP_ABS = ARM_VFP_OP2 (0,1),
+	ARM_VFP_NEG = ARM_VFP_OP2 (1,0),
+	ARM_VFP_SQRT = ARM_VFP_OP2 (1,1),
+	ARM_VFP_CMP = ARM_VFP_OP2 (4,0),
+	ARM_VFP_CMPE = ARM_VFP_OP2 (4,1),
+	ARM_VFP_CMPZ = ARM_VFP_OP2 (5,0),
+	ARM_VFP_CMPEZ = ARM_VFP_OP2 (5,1),
+	ARM_VFP_CVT = ARM_VFP_OP2 (7,1),
+	ARM_VFP_UITO = ARM_VFP_OP2 (8,0),
+	ARM_VFP_SITO = ARM_VFP_OP2 (8,1),
+	ARM_VFP_TOUI = ARM_VFP_OP2 (12,0),
+	ARM_VFP_TOSI = ARM_VFP_OP2 (13,0),
+	ARM_VFP_TOUIZ = ARM_VFP_OP2 (12,1),
+	ARM_VFP_TOSIZ = ARM_VFP_OP2 (13,1),
+
+	/* system register selectors for ARM_FMXR/ARM_FMRX; stored shifted
+	 * left by one because ARM_DEF_VFP_CPT encodes (Fn) >> 1 */
+	ARM_VFP_SID = 0,
+	ARM_VFP_SCR = 1 << 1,
+	ARM_VFP_EXC = 8 << 1
+};
+
+/*
+ * Instruction word for a two-source VFP operation.
+ * cp is the coprocessor number, op one of the ARM_VFP_* operation
+ * values. Each register number is split into its upper bits
+ * (reg >> 1) and its lowest bit (reg & 1), which go to separate fields:
+ * Fd -> bits 12.. and bit 22, Fn -> bits 16.. and bit 7,
+ * Fm -> bits 0.. and bit 5.
+ * The expansion is not wrapped in parentheses; use it as a complete
+ * macro argument.
+ */
+#define ARM_DEF_VFP_DYADIC(cond,cp,op,Fd,Fn,Fm)	\
+	(14 << 24)				|	\
+	((cp) << 8)				|	\
+	(op)					|	\
+	(((Fd) >> 1) << 12)			|	\
+	(((Fd) & 1) << 22)			|	\
+	(((Fn) >> 1) << 16)			|	\
+	(((Fn) & 1) << 7)			|	\
+	(((Fm) >> 1) << 0)			|	\
+	(((Fm) & 1) << 5)			|	\
+	ARM_DEF_COND(cond)
+
+/*
+ * Instruction word for a one-source VFP operation; same layout as
+ * ARM_DEF_VFP_DYADIC without the Fn fields.
+ */
+#define ARM_DEF_VFP_MONADIC(cond,cp,op,Fd,Fm)	\
+	(14 << 24)				|	\
+	((cp) << 8)				|	\
+	(op)					|	\
+	(((Fd) >> 1) << 12)			|	\
+	(((Fd) & 1) << 22)			|	\
+	(((Fm) >> 1) << 0)			|	\
+	(((Fm) & 1) << 5)			|	\
+	ARM_DEF_COND(cond)
+
+/*
+ * Instruction word for a VFP load/store.
+ *   cp       coprocessor number (bits 8..)
+ *   post     value for bit 24
+ *   ls       value for bit 20 (the callers pass ARMOP_LDR or ARMOP_STR)
+ *   wback    value for bit 21
+ *   basereg  base register (bits 16..)
+ *   Fd       FP register, split into (Fd >> 1) at bits 12.. and
+ *            (Fd & 1) at bit 22
+ *   offset   byte offset; its magnitude divided by 4 goes into the low
+ *            bits and bit 23 is set when the offset is non-negative
+ * The wback term appears once (it used to be OR-ed in twice, which was
+ * redundant). The expansion is not wrapped in parentheses; use it as a
+ * complete macro argument.
+ */
+#define ARM_DEF_VFP_LSF(cond,cp,post,ls,wback,basereg,Fd,offset)	\
+	((offset) >= 0? (offset)>>2: -(offset)>>2)	|	\
+	(6 << 25)					|	\
+	((cp) << 8)					|	\
+	(((Fd) >> 1) << 12)				|	\
+	(((Fd) & 1) << 22)				|	\
+	((basereg) << 16)				|	\
+	((ls) << 20)					|	\
+	((wback) << 21)					|	\
+	(((offset) >= 0) << 23)				|	\
+	((post) << 24)					|	\
+	ARM_DEF_COND(cond)
+
+/*
+ * Instruction word for a transfer between an ARM register and a VFP
+ * register: op goes to bits 21.., L to bit 20, Rd (the ARM register) to
+ * bits 12.., and Fn is split into (Fn >> 1) at bits 16.. and (Fn & 1) at
+ * bit 7.
+ * The expansion is not wrapped in parentheses; use it as a complete
+ * macro argument.
+ */
+#define ARM_DEF_VFP_CPT(cond,cp,op,L,Fn,Rd)	\
+	(14 << 24)				|	\
+	(1 << 4)				|	\
+	((cp) << 8)				|	\
+	((op) << 21)				|	\
+	((L) << 20)				|	\
+	((Rd) << 12)				|	\
+	(((Fn) >> 1) << 16)			|	\
+	(((Fn) & 1) << 7)			|	\
+	ARM_DEF_COND(cond)
+
+/* FP load and stores */
+/* Single-register forms: freg is loaded from / stored to base + offset
+ * (offset in bytes). They pass post = 1 and wback = 0 to
+ * ARM_DEF_VFP_LSF. The names without _COND use ARMCOND_AL. */
+#define ARM_FLDS_COND(p,freg,base,offset,cond)	\
+	ARM_EMIT((p), ARM_DEF_VFP_LSF((cond),ARM_VFP_COPROC_SINGLE,1,ARMOP_LDR,0,(base),(freg),(offset)))
+#define ARM_FLDS(p,freg,base,offset)	\
+	ARM_FLDS_COND(p,freg,base,offset,ARMCOND_AL)
+
+#define ARM_FLDD_COND(p,freg,base,offset,cond)	\
+	ARM_EMIT((p), ARM_DEF_VFP_LSF((cond),ARM_VFP_COPROC_DOUBLE,1,ARMOP_LDR,0,(base),(freg),(offset)))
+#define ARM_FLDD(p,freg,base,offset)	\
+	ARM_FLDD_COND(p,freg,base,offset,ARMCOND_AL)
+
+#define ARM_FSTS_COND(p,freg,base,offset,cond)	\
+	ARM_EMIT((p), ARM_DEF_VFP_LSF((cond),ARM_VFP_COPROC_SINGLE,1,ARMOP_STR,0,(base),(freg),(offset)))
+#define ARM_FSTS(p,freg,base,offset)	\
+	ARM_FSTS_COND(p,freg,base,offset,ARMCOND_AL)
+
+#define ARM_FSTD_COND(p,freg,base,offset,cond)	\
+	ARM_EMIT((p), ARM_DEF_VFP_LSF((cond),ARM_VFP_COPROC_DOUBLE,1,ARMOP_STR,0,(base),(freg),(offset)))
+#define ARM_FSTD(p,freg,base,offset)	\
+	ARM_FSTD_COND(p,freg,base,offset,ARMCOND_AL)
+
+/* Multiple-register forms for doubles: post = 0, and the offset argument
+ * is (nregs * 2) << 2, so after the division by 4 in ARM_DEF_VFP_LSF the
+ * low field holds nregs * 2. */
+#define ARM_FLDMD_COND(p,first_reg,nregs,base,cond)							\
+	ARM_EMIT((p), ARM_DEF_VFP_LSF((cond),ARM_VFP_COPROC_DOUBLE,0,ARMOP_LDR,0,(base),(first_reg),((nregs) * 2) << 2))
+
+#define ARM_FLDMD(p,first_reg,nregs,base)		\
+	ARM_FLDMD_COND(p,first_reg,nregs,base,ARMCOND_AL)
+
+#define ARM_FSTMD_COND(p,first_reg,nregs,base,cond)							\
+	ARM_EMIT((p), ARM_DEF_VFP_LSF((cond),ARM_VFP_COPROC_DOUBLE,0,ARMOP_STR,0,(base),(first_reg),((nregs) * 2) << 2))
+
+#define ARM_FSTMD(p,first_reg,nregs,base)		\
+	ARM_FSTMD_COND(p,first_reg,nregs,base,ARMCOND_AL)
+
+#include <mono/arch/arm/arm_vfpmacros.h>
+
+/* coprocessor register transfer */
+/* All of these emit ARM_DEF_VFP_CPT with ARMCOND_AL. The last two
+ * numeric arguments are op and L: L = 0 in the macros that take the FP
+ * register first, L = 1 in those that take the ARM register first. */
+
+/* single precision register <-> ARM register (op 0) */
+#define ARM_FMSR(p,freg,reg)	\
+	ARM_EMIT((p), ARM_DEF_VFP_CPT(ARMCOND_AL,ARM_VFP_COPROC_SINGLE,0,0,(freg),(reg)))
+#define ARM_FMRS(p,reg,freg)	\
+	ARM_EMIT((p), ARM_DEF_VFP_CPT(ARMCOND_AL,ARM_VFP_COPROC_SINGLE,0,1,(freg),(reg)))
+
+/* double precision register halves <-> ARM register: op 0 for the
+ * ...L forms, op 1 for the ...H forms */
+#define ARM_FMDLR(p,freg,reg)	\
+	ARM_EMIT((p), ARM_DEF_VFP_CPT(ARMCOND_AL,ARM_VFP_COPROC_DOUBLE,0,0,(freg),(reg)))
+#define ARM_FMRDL(p,reg,freg)	\
+	ARM_EMIT((p), ARM_DEF_VFP_CPT(ARMCOND_AL,ARM_VFP_COPROC_DOUBLE,0,1,(freg),(reg)))
+#define ARM_FMDHR(p,freg,reg)	\
+	ARM_EMIT((p), ARM_DEF_VFP_CPT(ARMCOND_AL,ARM_VFP_COPROC_DOUBLE,1,0,(freg),(reg)))
+#define ARM_FMRDH(p,reg,freg)	\
+	ARM_EMIT((p), ARM_DEF_VFP_CPT(ARMCOND_AL,ARM_VFP_COPROC_DOUBLE,1,1,(freg),(reg)))
+
+/* VFP system register <-> ARM register (op 7); the system register is
+ * one of ARM_VFP_SID / ARM_VFP_SCR / ARM_VFP_EXC */
+#define ARM_FMXR(p,freg,reg)	\
+	ARM_EMIT((p), ARM_DEF_VFP_CPT(ARMCOND_AL,ARM_VFP_COPROC_SINGLE,7,0,(freg),(reg)))
+#define ARM_FMRX(p,reg,fcreg)	\
+	ARM_EMIT((p), ARM_DEF_VFP_CPT(ARMCOND_AL,ARM_VFP_COPROC_SINGLE,7,1,(fcreg),(reg)))
+
+/* ARM_FMRX of ARM_VFP_SCR with R15 as the destination register */
+#define ARM_FMSTAT(p)   \
+	ARM_FMRX((p),ARMREG_R15,ARM_VFP_SCR)
+
+/*
+ * Instruction word for a two-register coprocessor transfer: Fm in
+ * bits 0.., M at bit 5, cp at bits 8.., rd at bits 12.., rn at bits 16..
+ * The expansion is not wrapped in parentheses; use it as a complete
+ * macro argument.
+ */
+#define ARM_DEF_MCRR(cond,cp,rn,rd,Fm,M) \
+	((Fm) << 0) |					   \
+	(1 << 4)   |					   \
+	((M) << 5) |					   \
+	((cp) << 8) |					   \
+	((rd) << 12) |					   \
+	((rn) << 16) |					   \
+	((2) << 21) |					   \
+	(12 << 24) |					   \
+	ARM_DEF_COND(cond)
+
+/* Emit ARM_DEF_MCRR for double register dm and ARM registers rd and rn;
+ * dm is split into (dm >> 1) for Fm and (dm & 1) for M. */
+#define ARM_FMDRR(p,rd,rn,dm)   \
+	ARM_EMIT((p), ARM_DEF_MCRR(ARMCOND_AL,ARM_VFP_COPROC_DOUBLE,(rn),(rd),(dm) >> 1, (dm) & 1))
+
+/*
+ * Instruction word for the transfer of a double register to two ARM
+ * registers: Dm in bits 0.., D at bit 5, cp at bits 8.., rd at
+ * bits 12.., rn at bits 16..
+ * D was previously accepted but never encoded; it now goes to bit 5,
+ * the same position ARM_DEF_MCRR uses for its M argument. For the
+ * ARM_VFP_Dn register numbers (all even) D is 0, so the words generated
+ * for them are unchanged.
+ * The expansion is not wrapped in parentheses; use it as a complete
+ * macro argument.
+ */
+#define ARM_DEF_FMRRD(cond,cp,rn,rd,Dm,D)		\
+	((Dm) << 0) |					   \
+	(1 << 4)   |					   \
+	((D) << 5) |					   \
+	((cp) << 8) |					   \
+	((rd) << 12) |					   \
+	((rn) << 16) |					   \
+	((0xc5) << 20) |					   \
+	ARM_DEF_COND(cond)
+
+/* Emit ARM_DEF_FMRRD for double register dm and ARM registers rd and rn;
+ * dm is split into (dm >> 1) for Dm and (dm & 1) for D. */
+#define ARM_FMRRD(p,rd,rn,dm)   \
+	ARM_EMIT((p), ARM_DEF_FMRRD(ARMCOND_AL,ARM_VFP_COPROC_DOUBLE,(rn),(rd),(dm) >> 1, (dm) & 1))
+
+/*
+ * Integer to floating point conversions. The four ARM_DEF_* words share
+ * one layout: cond at bits 28.., D at bit 22, Dd at bits 12.., M at
+ * bit 5, Fm at bits 0.. They differ in the value at bits 8.. (0xa in
+ * the ...S macros, 0xb in the ...D macros) and in bit 7 (set in the
+ * FSITO* macros, clear in the FUITO* macros).
+ * The ARM_F* emitters split each register number into (reg >> 1) and
+ * (reg & 1) and always use ARMCOND_AL.
+ * The ARM_DEF_* expansions are not wrapped in parentheses; use them as
+ * a complete macro argument.
+ */
+#define ARM_DEF_FUITOS(cond,Dd,D,Fm,M) ((cond) << 28) | ((0x1d) << 23) | ((D) << 22) | ((0x3) << 20) | ((8) << 16) | ((Dd) << 12) | ((0xa) << 8) | ((1) << 6) | ((M) << 5) | ((Fm) << 0)
+
+#define ARM_FUITOS(p,dreg,sreg) \
+	ARM_EMIT((p), ARM_DEF_FUITOS (ARMCOND_AL, (dreg) >> 1, (dreg) & 1, (sreg) >> 1, (sreg) & 1))
+
+#define ARM_DEF_FUITOD(cond,Dd,D,Fm,M) ((cond) << 28) | ((0x1d) << 23) | ((D) << 22) | ((0x3) << 20) | ((8) << 16) | ((Dd) << 12) | ((0xb) << 8) | ((1) << 6) | ((M) << 5) | ((Fm) << 0)
+
+#define ARM_FUITOD(p,dreg,sreg) \
+	ARM_EMIT((p), ARM_DEF_FUITOD (ARMCOND_AL, (dreg) >> 1, (dreg) & 1, (sreg) >> 1, (sreg) & 1))
+
+#define ARM_DEF_FSITOS(cond,Dd,D,Fm,M) ((cond) << 28) | ((0x1d) << 23) | ((D) << 22) | ((0x3) << 20) | ((8) << 16) | ((Dd) << 12) | ((0xa) << 8) | ((1) << 7) | ((1) << 6) | ((M) << 5) | ((Fm) << 0)
+
+#define ARM_FSITOS(p,dreg,sreg) \
+	ARM_EMIT((p), ARM_DEF_FSITOS (ARMCOND_AL, (dreg) >> 1, (dreg) & 1, (sreg) >> 1, (sreg) & 1))
+
+#define ARM_DEF_FSITOD(cond,Dd,D,Fm,M) ((cond) << 28) | ((0x1d) << 23) | ((D) << 22) | ((0x3) << 20) | ((8) << 16) | ((Dd) << 12) | ((0xb) << 8) | ((1) << 7) | ((1) << 6) | ((M) << 5) | ((Fm) << 0)
+
+#define ARM_FSITOD(p,dreg,sreg) \
+	ARM_EMIT((p), ARM_DEF_FSITOD (ARMCOND_AL, (dreg) >> 1, (dreg) & 1, (sreg) >> 1, (sreg) & 1))
+
+#endif /* __MONO_ARM_VFP_CODEGEN_H__ */
+
--- a/lib/ffts/src/arch/arm/arm-wmmx.h
+++ b/lib/ffts/src/arch/arm/arm-wmmx.h
@ -0,0 +1,177 @@
+/*
+ * ARM CodeGen
+ * XScale WirelessMMX extensions
+ * Copyright 2002 Wild West Software
+ */
+
+#ifndef __WMMX_H__
+#define __WMMX_H__ 1
+
+#if 0
+#include <arm-codegen.h>
+#endif
+
+/* WM_ASM: inline-assembly hook. Uses ARM_IASM when the including code
+ * provides it, otherwise falls back to __emit. */
+#if defined(ARM_IASM)
+#	define WM_ASM(_expr) ARM_IASM(_expr)
+#else
+#	define WM_ASM(_expr) __emit (_expr)
+#endif
+
+/* WM_EMIT: code-generation hook. Forwards to ARM_EMIT when it is
+ * defined; otherwise it expands to nothing, so the ARM_WM_* codegen
+ * macros become no-ops. */
+#if defined(ARM_EMIT)
+#	define WM_EMIT(p, i) ARM_EMIT(p, i)
+#else
+#	define WM_EMIT(p, i) 
+#endif
+
+/* Condition codes (4-bit values) and the bit position WM_DEF_CC shifts
+ * them to. HS and LO are alternative names for CS and CC. */
+enum {
+	WM_CC_EQ = 0x0,
+	WM_CC_NE = 0x1,
+	WM_CC_CS = 0x2,
+	WM_CC_HS = WM_CC_CS,
+	WM_CC_CC = 0x3,
+	WM_CC_LO = WM_CC_CC,
+	WM_CC_MI = 0x4,
+	WM_CC_PL = 0x5,
+	WM_CC_VS = 0x6,
+	WM_CC_VC = 0x7,
+	WM_CC_HI = 0x8,
+	WM_CC_LS = 0x9,
+	WM_CC_GE = 0xA,
+	WM_CC_LT = 0xB,
+	WM_CC_GT = 0xC,
+	WM_CC_LE = 0xD,
+	WM_CC_AL = 0xE,
+	WM_CC_NV = 0xF,
+	/* bit position of the condition field in an instruction word */
+	WM_CC_SHIFT = 28
+};
+
+/* WM_DEF_CC: place a condition code in the condition field of an
+ * instruction word. Uses ARM_DEF_COND when available; the fallback masks
+ * the code to 4 bits and shifts it by WM_CC_SHIFT. The argument is
+ * parenthesised so that an expression such as (a | b) is masked as a
+ * whole. */
+#if defined(ARM_DEF_COND)
+#	define WM_DEF_CC(_cc) ARM_DEF_COND(_cc)
+#else
+#	define WM_DEF_CC(_cc) (((_cc) & 0xF) << WM_CC_SHIFT)
+#endif
+
+
+/* Register numbers: WM_Rn are ARM core registers, WM_wRn are WMMX data
+ * registers. Both sets use the values 0..15. */
+enum {
+	WM_R0	= 0x0,
+	WM_R1	= 0x1,
+	WM_R2	= 0x2,
+	WM_R3	= 0x3,
+	WM_R4	= 0x4,
+	WM_R5	= 0x5,
+	WM_R6	= 0x6,
+	WM_R7	= 0x7,
+	WM_R8	= 0x8,
+	WM_R9	= 0x9,
+	WM_R10	= 0xA,
+	WM_R11	= 0xB,
+	WM_R12	= 0xC,
+	WM_R13	= 0xD,
+	WM_R14	= 0xE,
+	WM_R15	= 0xF,
+
+	WM_wR0	= 0x0,
+	WM_wR1	= 0x1,
+	WM_wR2	= 0x2,
+	WM_wR3	= 0x3,
+	WM_wR4	= 0x4,
+	WM_wR5	= 0x5,
+	WM_wR6	= 0x6,
+	WM_wR7	= 0x7,
+	WM_wR8	= 0x8,
+	WM_wR9	= 0x9,
+	WM_wR10	= 0xA,
+	WM_wR11	= 0xB,
+	WM_wR12	= 0xC,
+	WM_wR13	= 0xD,
+	WM_wR14	= 0xE,
+	WM_wR15	= 0xF
+};
+
+
+/*
+ * Qualifiers (passed as _q to the DEF_WM_* macros):
+ *	H - 16-bit (HalfWord) SIMD
+ *	W - 32-bit (Word) SIMD
+ *	D - 64-bit (Double)
+ *
+ * NOTE(review): the enumerators below are named B/H/D, not the H/W/D
+ * listed above. Presumably WM_B is an 8-bit (Byte) qualifier and either
+ * the list or the names are out of step - confirm against the WMMX
+ * documentation before relying on the values.
+ */
+enum {
+	WM_B = 0,
+	WM_H = 1,
+	WM_D = 2
+};
+
+/*
+ * B.2.3 Transfers From Coprocessor Register (MRC)
+ * Table B-5
+ *
+ * One (OP2, CPNUM) pair per instruction. None of the macros in this
+ * part of the header reference these constants.
+ */
+enum {
+	WM_TMRC_OP2      = 0,
+	WM_TMRC_CPNUM    = 1,
+
+	WM_TMOVMSK_OP2   = 1,
+	WM_TMOVMSK_CPNUM = 0,
+
+	WM_TANDC_OP2     = 1,
+	WM_TANDC_CPNUM   = 1,
+
+	WM_TORC_OP2      = 2,
+	WM_TORC_CPNUM    = 1,
+
+	WM_TEXTRC_OP2    = 3,
+	WM_TEXTRC_CPNUM  = 1,
+
+	WM_TEXTRM_OP2    = 3,
+	WM_TEXTRM_CPNUM  = 0
+};
+
+
+/*
+ * TANDC<B,H,W>{Cond} R15
+ * Performs AND across the fields of the SIMD PSR register (wCASF) and sends the result
+ * to CPSR; can be performed after a Byte, Half-word or Word operation that sets the flags.
+ * NOTE: R15 is omitted from the macro declaration;
+ *
+ * The macros are spelled TNADC although the instruction is TANDC; the
+ * spelling is part of their names and is kept as is.
+ */
+/* instruction word: condition field + qualifier at bit 0x16 (22) + the
+ * fixed opcode bits; not wrapped in parentheses, so use it as a complete
+ * macro argument */
+#define DEF_WM_TNADC_CC(_q, _cc) WM_DEF_CC((_cc)) + ((_q) << 0x16) + 0xE13F130
+
+#define _WM_TNADC_CC(_q, _cc) WM_ASM(DEF_WM_TNADC_CC(_q, _cc))
+#define ARM_WM_TNADC_CC(_p, _q, _cc) WM_EMIT(_p, DEF_WM_TNADC_CC(_q, _cc))
+
+/* inline assembly */
+#define _WM_TNADC(_q) _WM_TNADC_CC((_q), WM_CC_AL)
+#define _WM_TNADCB() _WM_TNADC(WM_B)
+#define _WM_TNADCH() _WM_TNADC(WM_H)
+#define _WM_TNADCD() _WM_TNADC(WM_D)
+
+/* codegen */
+#define ARM_WM_TNADC(_p, _q) ARM_WM_TNADC_CC((_p), (_q), WM_CC_AL)
+#define ARM_WM_TNADCB(_p) ARM_WM_TNADC(_p, WM_B)
+#define ARM_WM_TNADCH(_p) ARM_WM_TNADC(_p, WM_H)
+#define ARM_WM_TNADCD(_p) ARM_WM_TNADC(_p, WM_D)
+
+
+/*
+ * TBCST<B,H,W>{Cond} wRd, Rn
+ * Broadcasts a value from the ARM Source reg (Rn) to every SIMD position
+ * in the WMMX Destination reg (wRd).
+ */
+/* instruction word: condition field + qualifier at bit 6 + wRd at
+ * bit 16 + Rn at bit 12 + the fixed opcode bits; not wrapped in
+ * parentheses, so use it as a complete macro argument */
+#define DEF_WM_TBCST_CC(_q, _cc, _wrd, _rn) \
+	WM_DEF_CC((_cc)) + ((_q) << 6) + ((_wrd) << 16) + ((_rn) << 12) + 0xE200010
+
+#define _WM_TBCST_CC(_q, _cc, _wrd, _rn) WM_ASM(DEF_WM_TBCST_CC(_q, _cc, _wrd, _rn))
+#define ARM_WM_TBCST_CC(_p, _q, _cc, _wrd, _rn) WM_EMIT(_p, DEF_WM_TBCST_CC(_q, _cc, _wrd, _rn))
+
+/* inline */
+/* The B/H/D forms pass the qualifier AND both registers on to _WM_TBCST,
+ * which takes three arguments. */
+#define _WM_TBCST(_q, _wrd, _rn) _WM_TBCST_CC(_q, WM_CC_AL, _wrd, _rn)
+#define _WM_TBCSTB(_wrd, _rn) _WM_TBCST(WM_B, _wrd, _rn)
+#define _WM_TBCSTH(_wrd, _rn) _WM_TBCST(WM_H, _wrd, _rn)
+#define _WM_TBCSTD(_wrd, _rn) _WM_TBCST(WM_D, _wrd, _rn)
+
+/* codegen */
+/* The B/H/D forms go through ARM_WM_TBCST (the code-generation macro),
+ * not the inline-assembly _WM_TBCST, and forward the code pointer, the
+ * qualifier and both registers. */
+#define ARM_WM_TBCST(_p, _q, _wrd, _rn) ARM_WM_TBCST_CC(_p, _q, WM_CC_AL, _wrd, _rn)
+#define ARM_WM_TBCSTB(_p, _wrd, _rn) ARM_WM_TBCST(_p, WM_B, _wrd, _rn)
+#define ARM_WM_TBCSTH(_p, _wrd, _rn) ARM_WM_TBCST(_p, WM_H, _wrd, _rn)
+#define ARM_WM_TBCSTD(_p, _wrd, _rn) ARM_WM_TBCST(_p, WM_D, _wrd, _rn)
+
+
+#endif /* __WMMX_H__ */
--- a/lib/ffts/src/arch/arm/cmp_macros.th
+++ b/lib/ffts/src/arch/arm/cmp_macros.th
@ -0,0 +1,56 @@
+/*
+ * Template for the comparison-style macros: every occurrence of <Op> is
+ * replaced with an opcode name when the template is expanded. Each form
+ * comes as an ARM_<Op>_* code-generation macro (first argument p) and,
+ * unless ARM_NOIASM is defined, a _<Op>_* inline-assembly macro. All of
+ * them use the _S_ variants of the DPIOP macros and pass 0 as the
+ * destination register. Names without _COND use ARMCOND_AL.
+ */
+/* PSR := <Op> Rn, (imm8 ROR 2*rot) */
+#define ARM_<Op>_REG_IMM_COND(p, rn, imm8, rot, cond) \
+	ARM_DPIOP_S_REG_IMM8ROT_COND(p, ARMOP_<Op>, 0, rn, imm8, rot, cond)
+#define ARM_<Op>_REG_IMM(p, rn, imm8, rot) \
+	ARM_<Op>_REG_IMM_COND(p, rn, imm8, rot, ARMCOND_AL)
+
+#ifndef ARM_NOIASM
+#define _<Op>_REG_IMM_COND(rn, imm8, rot, cond) \
+	ARM_IASM_DPIOP_S_REG_IMM8ROT_COND(ARMOP_<Op>, 0, rn, imm8, rot, cond)
+#define _<Op>_REG_IMM(rn, imm8, rot) \
+	_<Op>_REG_IMM_COND(rn, imm8, rot, ARMCOND_AL)
+#endif
+
+
+/* PSR := <Op> Rn, imm8 */
+/* same as the rotated form with rot = 0 */
+#define ARM_<Op>_REG_IMM8_COND(p, rn, imm8, cond) \
+	ARM_<Op>_REG_IMM_COND(p, rn, imm8, 0, cond)
+#define ARM_<Op>_REG_IMM8(p, rn, imm8) \
+	ARM_<Op>_REG_IMM8_COND(p, rn, imm8, ARMCOND_AL)
+
+#ifndef ARM_NOIASM
+#define _<Op>_REG_IMM8_COND(rn, imm8, cond) \
+	_<Op>_REG_IMM_COND(rn, imm8, 0, cond)
+#define _<Op>_REG_IMM8(rn, imm8) \
+	_<Op>_REG_IMM8_COND(rn, imm8, ARMCOND_AL)
+#endif
+
+
+/* PSR := <Op> Rn, Rm */
+#define ARM_<Op>_REG_REG_COND(p, rn, rm, cond) \
+	ARM_DPIOP_S_REG_REG_COND(p, ARMOP_<Op>, 0, rn, rm, cond)
+#define ARM_<Op>_REG_REG(p, rn, rm) \
+	ARM_<Op>_REG_REG_COND(p, rn, rm, ARMCOND_AL)
+
+#ifndef ARM_NOIASM
+#define _<Op>_REG_REG_COND(rn, rm, cond) \
+	ARM_IASM_DPIOP_S_REG_REG_COND(ARMOP_<Op>, 0, rn, rm, cond)
+#define _<Op>_REG_REG(rn, rm) \
+	_<Op>_REG_REG_COND(rn, rm, ARMCOND_AL)
+#endif
+
+
+/* PSR := <Op> Rn, (Rm <shift_type> imm8) */
+#define ARM_<Op>_REG_IMMSHIFT_COND(p, rn, rm, shift_type, imm_shift, cond) \
+	ARM_DPIOP_S_REG_IMMSHIFT_COND(p, ARMOP_<Op>, 0, rn, rm, shift_type, imm_shift, cond)
+#define ARM_<Op>_REG_IMMSHIFT(p, rn, rm, shift_type, imm_shift) \
+	ARM_<Op>_REG_IMMSHIFT_COND(p, rn, rm, shift_type, imm_shift, ARMCOND_AL)
+
+#ifndef ARM_NOIASM
+#define _<Op>_REG_IMMSHIFT_COND(rn, rm, shift_type, imm_shift, cond) \
+	ARM_IASM_DPIOP_S_REG_IMMSHIFT_COND(ARMOP_<Op>, 0, rn, rm, shift_type, imm_shift, cond)
+#define _<Op>_REG_IMMSHIFT(rn, rm, shift_type, imm_shift) \
+	_<Op>_REG_IMMSHIFT_COND(rn, rm, shift_type, imm_shift, ARMCOND_AL)
+#endif
+
+
--- a/lib/ffts/src/arch/arm/dpi_macros.th
+++ b/lib/ffts/src/arch/arm/dpi_macros.th
@ -0,0 +1,112 @@
+/* -- <Op> -- */
+
+/* Rd := Rn <Op> (imm8 ROR rot) ; rot is power of 2 */
+/* ARM_* forms take the code pointer p first; the <Op>S forms use the
+ * _S_ DPIOP macros; names without _COND use ARMCOND_AL. */
+#define ARM_<Op>_REG_IMM_COND(p, rd, rn, imm8, rot, cond) \
+	ARM_DPIOP_REG_IMM8ROT_COND(p, ARMOP_<Op>, rd, rn, imm8, rot, cond)
+#define ARM_<Op>_REG_IMM(p, rd, rn, imm8, rot) \
+	ARM_<Op>_REG_IMM_COND(p, rd, rn, imm8, rot, ARMCOND_AL)
+#define ARM_<Op>S_REG_IMM_COND(p, rd, rn, imm8, rot, cond) \
+	ARM_DPIOP_S_REG_IMM8ROT_COND(p, ARMOP_<Op>, rd, rn, imm8, rot, cond)
+#define ARM_<Op>S_REG_IMM(p, rd, rn, imm8, rot) \
+	ARM_<Op>S_REG_IMM_COND(p, rd, rn, imm8, rot, ARMCOND_AL)
+
+/* inline-assembly counterparts, omitted when ARM_NOIASM is defined */
+#ifndef ARM_NOIASM
+#define _<Op>_REG_IMM_COND(rd, rn, imm8, rot, cond) \
+	ARM_IASM_DPIOP_REG_IMM8ROT_COND(ARMOP_<Op>, rd, rn, imm8, rot, cond)
+#define _<Op>_REG_IMM(rd, rn, imm8, rot) \
+	_<Op>_REG_IMM_COND(rd, rn, imm8, rot, ARMCOND_AL)
+#define _<Op>S_REG_IMM_COND(rd, rn, imm8, rot, cond) \
+	ARM_IASM_DPIOP_S_REG_IMM8ROT_COND(ARMOP_<Op>, rd, rn, imm8, rot, cond)
+#define _<Op>S_REG_IMM(rd, rn, imm8, rot) \
+	_<Op>S_REG_IMM_COND(rd, rn, imm8, rot, ARMCOND_AL)
+#endif
+
+
+/* Rd := Rn <Op> imm8 */
+/* implemented through the rotated-immediate forms with rot = 0 */
+#define ARM_<Op>_REG_IMM8_COND(p, rd, rn, imm8, cond) \
+	ARM_<Op>_REG_IMM_COND(p, rd, rn, imm8, 0, cond)
+#define ARM_<Op>_REG_IMM8(p, rd, rn, imm8) \
+	ARM_<Op>_REG_IMM8_COND(p, rd, rn, imm8, ARMCOND_AL)
+#define ARM_<Op>S_REG_IMM8_COND(p, rd, rn, imm8, cond) \
+	ARM_<Op>S_REG_IMM_COND(p, rd, rn, imm8, 0, cond)
+#define ARM_<Op>S_REG_IMM8(p, rd, rn, imm8) \
+	ARM_<Op>S_REG_IMM8_COND(p, rd, rn, imm8, ARMCOND_AL)
+
+/* inline-assembly counterparts, omitted when ARM_NOIASM is defined */
+#ifndef ARM_NOIASM
+#define _<Op>_REG_IMM8_COND(rd, rn, imm8, cond) \
+	_<Op>_REG_IMM_COND(rd, rn, imm8, 0, cond)
+#define _<Op>_REG_IMM8(rd, rn, imm8) \
+	_<Op>_REG_IMM8_COND(rd, rn, imm8, ARMCOND_AL)
+#define _<Op>S_REG_IMM8_COND(rd, rn, imm8, cond) \
+	_<Op>S_REG_IMM_COND(rd, rn, imm8, 0, cond)
+#define _<Op>S_REG_IMM8(rd, rn, imm8) \
+	_<Op>S_REG_IMM8_COND(rd, rn, imm8, ARMCOND_AL)
+#endif
+
+
+/* Rd := Rn <Op> Rm */
+/* the <Op>S forms use the _S_ DPIOP macros; names without _COND use
+ * ARMCOND_AL */
+#define ARM_<Op>_REG_REG_COND(p, rd, rn, rm, cond) \
+	ARM_DPIOP_REG_REG_COND(p, ARMOP_<Op>, rd, rn, rm, cond)
+#define ARM_<Op>_REG_REG(p, rd, rn, rm) \
+	ARM_<Op>_REG_REG_COND(p, rd, rn, rm, ARMCOND_AL)
+#define ARM_<Op>S_REG_REG_COND(p, rd, rn, rm, cond) \
+	ARM_DPIOP_S_REG_REG_COND(p, ARMOP_<Op>, rd, rn, rm, cond)
+#define ARM_<Op>S_REG_REG(p, rd, rn, rm) \
+	ARM_<Op>S_REG_REG_COND(p, rd, rn, rm, ARMCOND_AL)
+
+/* inline-assembly counterparts, omitted when ARM_NOIASM is defined */
+#ifndef ARM_NOIASM
+#define _<Op>_REG_REG_COND(rd, rn, rm, cond) \
+	ARM_IASM_DPIOP_REG_REG_COND(ARMOP_<Op>, rd, rn, rm, cond)
+#define _<Op>_REG_REG(rd, rn, rm) \
+	_<Op>_REG_REG_COND(rd, rn, rm, ARMCOND_AL)
+#define _<Op>S_REG_REG_COND(rd, rn, rm, cond) \
+	ARM_IASM_DPIOP_S_REG_REG_COND(ARMOP_<Op>, rd, rn, rm, cond)
+#define _<Op>S_REG_REG(rd, rn, rm) \
+	_<Op>S_REG_REG_COND(rd, rn, rm, ARMCOND_AL)
+#endif
+
+
+/* Rd := Rn <Op> (Rm <shift_type> imm_shift) */
+/* the <Op>S forms use the _S_ DPIOP macros; names without _COND use
+ * ARMCOND_AL */
+#define ARM_<Op>_REG_IMMSHIFT_COND(p, rd, rn, rm, shift_type, imm_shift, cond) \
+	ARM_DPIOP_REG_IMMSHIFT_COND(p, ARMOP_<Op>, rd, rn, rm, shift_type, imm_shift, cond)
+#define ARM_<Op>_REG_IMMSHIFT(p, rd, rn, rm, shift_type, imm_shift) \
+	ARM_<Op>_REG_IMMSHIFT_COND(p, rd, rn, rm, shift_type, imm_shift, ARMCOND_AL)
+#define ARM_<Op>S_REG_IMMSHIFT_COND(p, rd, rn, rm, shift_type, imm_shift, cond) \
+	ARM_DPIOP_S_REG_IMMSHIFT_COND(p, ARMOP_<Op>, rd, rn, rm, shift_type, imm_shift, cond)
+#define ARM_<Op>S_REG_IMMSHIFT(p, rd, rn, rm, shift_type, imm_shift) \
+	ARM_<Op>S_REG_IMMSHIFT_COND(p, rd, rn, rm, shift_type, imm_shift, ARMCOND_AL)
+
+/* inline-assembly counterparts, omitted when ARM_NOIASM is defined */
+#ifndef ARM_NOIASM
+#define _<Op>_REG_IMMSHIFT_COND(rd, rn, rm, shift_type, imm_shift, cond) \
+	ARM_IASM_DPIOP_REG_IMMSHIFT_COND(ARMOP_<Op>, rd, rn, rm, shift_type, imm_shift, cond)
+#define _<Op>_REG_IMMSHIFT(rd, rn, rm, shift_type, imm_shift) \
+	_<Op>_REG_IMMSHIFT_COND(rd, rn, rm, shift_type, imm_shift, ARMCOND_AL)
+#define _<Op>S_REG_IMMSHIFT_COND(rd, rn, rm, shift_type, imm_shift, cond) \
+	ARM_IASM_DPIOP_S_REG_IMMSHIFT_COND(ARMOP_<Op>, rd, rn, rm, shift_type, imm_shift, cond)
+#define _<Op>S_REG_IMMSHIFT(rd, rn, rm, shift_type, imm_shift) \
+	_<Op>S_REG_IMMSHIFT_COND(rd, rn, rm, shift_type, imm_shift, ARMCOND_AL)
+#endif
+
+
+/* Rd := Rn <Op> (Rm <shift_type> Rs) */
+/* The <Op>S forms use the _S_ DPIOP macros; names without _COND use
+ * ARMCOND_AL. The _COND bodies forward the shift_type parameter; they
+ * used to reference an undefined name, shift_t, so they only compiled
+ * if the caller happened to have an identifier of that name in scope. */
+#define ARM_<Op>_REG_REGSHIFT_COND(p, rd, rn, rm, shift_type, rs, cond) \
+	ARM_DPIOP_REG_REGSHIFT_COND(p, ARMOP_<Op>, rd, rn, rm, shift_type, rs, cond)
+#define ARM_<Op>_REG_REGSHIFT(p, rd, rn, rm, shift_type, rs) \
+	ARM_<Op>_REG_REGSHIFT_COND(p, rd, rn, rm, shift_type, rs, ARMCOND_AL)
+#define ARM_<Op>S_REG_REGSHIFT_COND(p, rd, rn, rm, shift_type, rs, cond) \
+	ARM_DPIOP_S_REG_REGSHIFT_COND(p, ARMOP_<Op>, rd, rn, rm, shift_type, rs, cond)
+#define ARM_<Op>S_REG_REGSHIFT(p, rd, rn, rm, shift_type, rs) \
+	ARM_<Op>S_REG_REGSHIFT_COND(p, rd, rn, rm, shift_type, rs, ARMCOND_AL)
+
+/* inline-assembly counterparts, omitted when ARM_NOIASM is defined */
+#ifndef ARM_NOIASM
+#define _<Op>_REG_REGSHIFT_COND(rd, rn, rm, shift_type, rs, cond) \
+	ARM_IASM_DPIOP_REG_REGSHIFT_COND(ARMOP_<Op>, rd, rn, rm, shift_type, rs, cond)
+#define _<Op>_REG_REGSHIFT(rd, rn, rm, shift_type, rs) \
+	_<Op>_REG_REGSHIFT_COND(rd, rn, rm, shift_type, rs, ARMCOND_AL)
+#define _<Op>S_REG_REGSHIFT_COND(rd, rn, rm, shift_type, rs, cond) \
+	ARM_IASM_DPIOP_S_REG_REGSHIFT_COND(ARMOP_<Op>, rd, rn, rm, shift_type, rs, cond)
+#define _<Op>S_REG_REGSHIFT(rd, rn, rm, shift_type, rs) \
+	_<Op>S_REG_REGSHIFT_COND(rd, rn, rm, shift_type, rs, ARMCOND_AL)
+#endif
+
+
--- a/lib/ffts/src/arch/arm/dpiops.sh
+++ b/lib/ffts/src/arch/arm/dpiops.sh
@ -0,0 +1,30 @@
+#!/bin/sh
+
+OPCODES="AND EOR SUB RSB ADD ADC SBC RSC ORR BIC"
+CMP_OPCODES="TST TEQ CMP CMN"
+MOV_OPCODES="MOV MVN"
+
+# $1: opcode list
+# $2: template
+gen() {
+	for i in $1; do
+		sed "s/<Op>/$i/g" $2.th
+	done
+}
+
+
+
+echo -e "/* Macros for DPI ops, auto-generated from template */\n"
+
+echo -e "\n/* mov/mvn */\n"
+gen "$MOV_OPCODES" mov_macros
+
+echo -e "\n/* DPIs, arithmetic and logical */\n"
+gen "$OPCODES" dpi_macros
+
+echo -e "\n\n"
+
+echo -e "\n/* DPIs, comparison */\n"
+gen "$CMP_OPCODES" cmp_macros
+
+echo -e "\n/* end generated */\n"
--- a/lib/ffts/src/arch/arm/mov_macros.th
+++ b/lib/ffts/src/arch/arm/mov_macros.th
@ -0,0 +1,121 @@
+/*
+ * Rd := imm8 ROR rot
+ *
+ * <Op> is substituted per opcode when this template is expanded.  The
+ * literal 0 is passed to the encoder right after the destination register
+ * (presumably the unused Rn field -- TODO confirm against
+ * ARM_DPIOP_REG_IMM8ROT_COND).
+ */
+#define ARM_<Op>_REG_IMM_COND(p, reg, imm8, rot, cond) \
+	ARM_DPIOP_REG_IMM8ROT_COND(p, ARMOP_<Op>, reg, 0, imm8, rot, cond)
+#define ARM_<Op>_REG_IMM(p, reg, imm8, rot) \
+	ARM_<Op>_REG_IMM_COND(p, reg, imm8, rot, ARMCOND_AL)
+/* S: same operands, routed through the ARM_DPIOP_S_* encoder */
+#define ARM_<Op>S_REG_IMM_COND(p, reg, imm8, rot, cond) \
+	ARM_DPIOP_S_REG_IMM8ROT_COND(p, ARMOP_<Op>, reg, 0, imm8, rot, cond)
+#define ARM_<Op>S_REG_IMM(p, reg, imm8, rot) \
+	ARM_<Op>S_REG_IMM_COND(p, reg, imm8, rot, ARMCOND_AL)
+
+/* IASM forms: no leading `p` argument; omitted when ARM_NOIASM is defined */
+#ifndef ARM_NOIASM
+#define _<Op>_REG_IMM_COND(reg, imm8, rot, cond) \
+	ARM_IASM_DPIOP_REG_IMM8ROT_COND(ARMOP_<Op>, reg, 0, imm8, rot, cond)
+#define _<Op>_REG_IMM(reg, imm8, rot) \
+	_<Op>_REG_IMM_COND(reg, imm8, rot, ARMCOND_AL)
+/* S */
+#define _<Op>S_REG_IMM_COND(reg, imm8, rot, cond) \
+	ARM_IASM_DPIOP_S_REG_IMM8ROT_COND(ARMOP_<Op>, reg, 0, imm8, rot, cond)
+#define _<Op>S_REG_IMM(reg, imm8, rot) \
+	_<Op>S_REG_IMM_COND(reg, imm8, rot, ARMCOND_AL)
+#endif
+
+
+/*
+ * Rd := imm8
+ *
+ * Convenience form of the IMM8ROT encoding with the rotation argument
+ * fixed to 0.
+ */
+#define ARM_<Op>_REG_IMM8_COND(p, reg, imm8, cond) \
+	ARM_DPIOP_REG_IMM8ROT_COND(p, ARMOP_<Op>, reg, 0, imm8, 0, cond)
+#define ARM_<Op>_REG_IMM8(p, reg, imm8) \
+	ARM_<Op>_REG_IMM8_COND(p, reg, imm8, ARMCOND_AL)
+/* S: same operands, routed through the ARM_DPIOP_S_* encoder */
+#define ARM_<Op>S_REG_IMM8_COND(p, reg, imm8, cond) \
+	ARM_DPIOP_S_REG_IMM8ROT_COND(p, ARMOP_<Op>, reg, 0, imm8, 0, cond)
+#define ARM_<Op>S_REG_IMM8(p, reg, imm8) \
+	ARM_<Op>S_REG_IMM8_COND(p, reg, imm8, ARMCOND_AL)
+
+/* IASM forms: no leading `p` argument; omitted when ARM_NOIASM is defined */
+#ifndef ARM_NOIASM
+#define _<Op>_REG_IMM8_COND(reg, imm8, cond) \
+	ARM_IASM_DPIOP_REG_IMM8ROT_COND(ARMOP_<Op>, reg, 0, imm8, 0, cond)
+#define _<Op>_REG_IMM8(reg, imm8) \
+	_<Op>_REG_IMM8_COND(reg, imm8, ARMCOND_AL)
+/* S */
+#define _<Op>S_REG_IMM8_COND(reg, imm8, cond) \
+	ARM_IASM_DPIOP_S_REG_IMM8ROT_COND(ARMOP_<Op>, reg, 0, imm8, 0, cond)
+#define _<Op>S_REG_IMM8(reg, imm8) \
+	_<Op>S_REG_IMM8_COND(reg, imm8, ARMCOND_AL)
+#endif
+
+
+/*
+ * Rd := Rm
+ *
+ * Register-to-register form; the literal 0 is passed to the encoder
+ * between rd and rm (presumably the unused Rn field -- TODO confirm
+ * against ARM_DPIOP_REG_REG_COND).
+ */
+#define ARM_<Op>_REG_REG_COND(p, rd, rm, cond) \
+	ARM_DPIOP_REG_REG_COND(p, ARMOP_<Op>, rd, 0, rm, cond)
+#define ARM_<Op>_REG_REG(p, rd, rm) \
+	ARM_<Op>_REG_REG_COND(p, rd, rm, ARMCOND_AL)
+/* S: same operands, routed through the ARM_DPIOP_S_* encoder */
+#define ARM_<Op>S_REG_REG_COND(p, rd, rm, cond) \
+	ARM_DPIOP_S_REG_REG_COND(p, ARMOP_<Op>, rd, 0, rm, cond)
+#define ARM_<Op>S_REG_REG(p, rd, rm) \
+	ARM_<Op>S_REG_REG_COND(p, rd, rm, ARMCOND_AL)
+
+/* IASM forms: no leading `p` argument; omitted when ARM_NOIASM is defined */
+#ifndef ARM_NOIASM
+#define _<Op>_REG_REG_COND(rd, rm, cond) \
+	ARM_IASM_DPIOP_REG_REG_COND(ARMOP_<Op>, rd, 0, rm, cond)
+#define _<Op>_REG_REG(rd, rm) \
+	_<Op>_REG_REG_COND(rd, rm, ARMCOND_AL)
+/* S */
+#define _<Op>S_REG_REG_COND(rd, rm, cond) \
+	ARM_IASM_DPIOP_S_REG_REG_COND(ARMOP_<Op>, rd, 0, rm, cond)
+#define _<Op>S_REG_REG(rd, rm) \
+	_<Op>S_REG_REG_COND(rd, rm, ARMCOND_AL)
+#endif
+
+
+/*
+ * Rd := Rm <shift_type> imm_shift
+ *
+ * Single-operand counterpart of the two-operand IMMSHIFT macros: the
+ * encoder argument that follows rd (rn in the two-operand form) is
+ * fixed to 0 here.
+ */
+#define ARM_<Op>_REG_IMMSHIFT_COND(p, rd, rm, shift_type, imm_shift, cond) \
+	ARM_DPIOP_REG_IMMSHIFT_COND(p, ARMOP_<Op>, rd, 0, rm, shift_type, imm_shift, cond)
+#define ARM_<Op>_REG_IMMSHIFT(p, rd, rm, shift_type, imm_shift) \
+	ARM_<Op>_REG_IMMSHIFT_COND(p, rd, rm, shift_type, imm_shift, ARMCOND_AL)
+/* S: same operands, routed through the ARM_DPIOP_S_* encoder */
+#define ARM_<Op>S_REG_IMMSHIFT_COND(p, rd, rm, shift_type, imm_shift, cond) \
+	ARM_DPIOP_S_REG_IMMSHIFT_COND(p, ARMOP_<Op>, rd, 0, rm, shift_type, imm_shift, cond)
+#define ARM_<Op>S_REG_IMMSHIFT(p, rd, rm, shift_type, imm_shift) \
+	ARM_<Op>S_REG_IMMSHIFT_COND(p, rd, rm, shift_type, imm_shift, ARMCOND_AL)
+
+/* IASM forms: no leading `p` argument; omitted when ARM_NOIASM is defined */
+#ifndef ARM_NOIASM
+#define _<Op>_REG_IMMSHIFT_COND(rd, rm, shift_type, imm_shift, cond) \
+	ARM_IASM_DPIOP_REG_IMMSHIFT_COND(ARMOP_<Op>, rd, 0, rm, shift_type, imm_shift, cond)
+#define _<Op>_REG_IMMSHIFT(rd, rm, shift_type, imm_shift) \
+	_<Op>_REG_IMMSHIFT_COND(rd, rm, shift_type, imm_shift, ARMCOND_AL)
+/* S */
+#define _<Op>S_REG_IMMSHIFT_COND(rd, rm, shift_type, imm_shift, cond) \
+	ARM_IASM_DPIOP_S_REG_IMMSHIFT_COND(ARMOP_<Op>, rd, 0, rm, shift_type, imm_shift, cond)
+#define _<Op>S_REG_IMMSHIFT(rd, rm, shift_type, imm_shift) \
+	_<Op>S_REG_IMMSHIFT_COND(rd, rm, shift_type, imm_shift, ARMCOND_AL)
+#endif
+
+
+
+/*
+ * Rd := (Rm <shift_type> Rs)
+ *
+ * Single-operand counterpart of the two-operand REGSHIFT macros: the
+ * encoder argument that follows rd (rn in the two-operand form) is
+ * fixed to 0 here.
+ */
+#define ARM_<Op>_REG_REGSHIFT_COND(p, rd, rm, shift_type, rs, cond) \
+	ARM_DPIOP_REG_REGSHIFT_COND(p, ARMOP_<Op>, rd, 0, rm, shift_type, rs, cond)
+#define ARM_<Op>_REG_REGSHIFT(p, rd, rm, shift_type, rs) \
+	ARM_<Op>_REG_REGSHIFT_COND(p, rd, rm, shift_type, rs, ARMCOND_AL)
+/* S: same operands, routed through the ARM_DPIOP_S_* encoder */
+#define ARM_<Op>S_REG_REGSHIFT_COND(p, rd, rm, shift_type, rs, cond) \
+	ARM_DPIOP_S_REG_REGSHIFT_COND(p, ARMOP_<Op>, rd, 0, rm, shift_type, rs, cond)
+#define ARM_<Op>S_REG_REGSHIFT(p, rd, rm, shift_type, rs) \
+	ARM_<Op>S_REG_REGSHIFT_COND(p, rd, rm, shift_type, rs, ARMCOND_AL)
+
+/* IASM forms: no leading `p` argument; omitted when ARM_NOIASM is defined */
+#ifndef ARM_NOIASM
+#define _<Op>_REG_REGSHIFT_COND(rd, rm, shift_type, rs, cond) \
+	ARM_IASM_DPIOP_REG_REGSHIFT_COND(ARMOP_<Op>, rd, 0, rm, shift_type, rs, cond)
+#define _<Op>_REG_REGSHIFT(rd, rm, shift_type, rs) \
+	_<Op>_REG_REGSHIFT_COND(rd, rm, shift_type, rs, ARMCOND_AL)
+/* S */
+#define _<Op>S_REG_REGSHIFT_COND(rd, rm, shift_type, rs, cond) \
+	ARM_IASM_DPIOP_S_REG_REGSHIFT_COND(ARMOP_<Op>, rd, 0, rm, shift_type, rs, cond)
+#define _<Op>S_REG_REGSHIFT(rd, rm, shift_type, rs) \
+	_<Op>S_REG_REGSHIFT_COND(rd, rm, shift_type, rs, ARMCOND_AL)
+#endif
+
+
--- a/lib/ffts/src/arch/arm/tramp.c
+++ b/lib/ffts/src/arch/arm/tramp.c
@ -0,0 +1,710 @@
+/*
+ * Create trampolines to invoke arbitrary functions.
+ * Copyright (c) 2002 Sergey Chaban <serge@wildwestsoftware.com>
+ *
+ * Contributions by Malte Hildingson
+ */
+
+#include "arm-codegen.h"
+#include "arm-dis.h"
+
+#if defined(_WIN32_WCE) || defined (UNDER_CE)
+#	include <windows.h>
+#else
+#include <unistd.h>
+#include <sys/mman.h>
+#endif
+
+#if !defined(PLATFORM_MACOSX)
+#include <errno.h>
+
+#include "mono/metadata/class.h"
+#include "mono/metadata/tabledefs.h"
+#include "mono/interpreter/interp.h"
+#include "mono/metadata/appdomain.h"
+
+
+#if 0
+#	define ARM_DUMP_DISASM 1
+#endif
+
+/* prototypes for private functions (to avoid compiler warnings) */
+void flush_icache (void);
+void* alloc_code_buff (int num_instr);
+
+
+
+/*
+ * The resulting function takes the form:
+ * void func (void (*callme)(), void *retval, void *this_obj, stackval *arguments);
+ * NOTE: all args passed in ARM registers (A1-A4),
+ *       then copied to R4-R7 (see definitions below).
+ */
+
+#define REG_FUNC_ADDR ARMREG_R4
+#define REG_RETVAL    ARMREG_R5
+#define REG_THIS      ARMREG_R6
+#define REG_ARGP      ARMREG_R7
+
+
+#define ARG_SIZE sizeof(stackval)
+
+
+
+
+/*
+ * Called after code generation to flush the instruction cache.
+ *
+ * On _WIN32 this calls FlushInstructionCache for the current process.
+ * On every other platform the body is currently empty: the cp15 sequence
+ * below is disabled with "# if 0" and the replacement is still a TODO, so
+ * callers get no cache maintenance there.
+ */
+void flush_icache ()
+{
+#if defined(_WIN32)
+	FlushInstructionCache(GetCurrentProcess(), NULL, 0);
+#else
+# if 0
+	asm ("mov r0, r0");
+	asm ("mov r0, #0");
+	asm ("mcr p15, 0, r0, c7, c7, 0");
+# else
+	/* TODO: use (movnv  pc, rx) method */
+# endif
+#endif
+}
+
+
+/*
+ * Allocate a buffer for num_instr ARM instructions and mark it
+ * readable, writable and executable.
+ *
+ * Returns the buffer, or NULL when malloc fails.  On the POSIX path the
+ * returned address is the malloc result rounded up to a page boundary, so
+ * it must not be passed to free().
+ */
+void* alloc_code_buff (int num_instr)
+{
+	void* code_buff;
+	int code_size = num_instr * sizeof(arminstr_t);
+
+#if defined(_WIN32) || defined(UNDER_CE)
+	/* VirtualProtect stores the previous protection through a PDWORD */
+	DWORD old_prot = 0;
+
+	code_buff = malloc(code_size);
+	if (code_buff == NULL)
+		return NULL;
+	VirtualProtect(code_buff, code_size, PAGE_EXECUTE_READWRITE, &old_prot);
+#else
+	long page_size = sysconf(_SC_PAGESIZE);
+	size_t page_mask = (size_t) page_size - 1;
+
+	/* over-allocate so the start can be rounded up to a page boundary */
+	code_buff = malloc(code_size + page_mask);
+	if (code_buff == NULL) {
+		g_critical (G_GNUC_PRETTY_FUNCTION
+				": malloc error: %s", g_strerror (errno));
+		return NULL;
+	}
+
+	/* align through size_t rather than int so the address is never truncated */
+	code_buff = (void *) (((size_t) code_buff + page_mask) & ~page_mask);
+
+	if (mprotect(code_buff, code_size, PROT_READ|PROT_WRITE|PROT_EXEC) != 0) {
+		g_critical (G_GNUC_PRETTY_FUNCTION
+				": mprotect error: %s", g_strerror (errno));
+	}
+#endif
+
+	return code_buff;
+}
+
+
+/*
+ * Build a trampoline that calls a native function with arguments taken
+ * from an array of stackvals (see the "resulting function" comment above
+ * for the trampoline's own signature).
+ *
+ * sig:         signature of the function to be called.
+ * string_ctor: when set, the call result is stored through the retval
+ *              pointer as a word regardless of sig->ret.
+ *
+ * Works in two passes over sig->params: the first sizes the code buffer
+ * and the outgoing stack area and records, per parameter, how many
+ * argument registers were still free (reg_alloc); the second emits the
+ * loads/stores.  The register for parameter i is always
+ *     ARMREG_A1 + hasthis + (aregs - reg_alloc [i]).
+ *
+ * Refer to ARM Procedure Call Standard (APCS) for more info.
+ */
+MonoPIFunc mono_arch_create_trampoline (MonoMethodSignature *sig, gboolean string_ctor)
+{
+	MonoType* param;
+	MonoPIFunc code_buff;
+	arminstr_t* p;
+	guint32 code_size, stack_size;
+	guint32 simple_type;
+	int i, hasthis, aregs, regc, stack_offs;
+	int this_loaded = 0;
+	guchar reg_alloc [ARM_NUM_ARG_REGS];
+
+	/* pessimistic estimation for prologue/epilogue size */
+	code_size = 16 + 16;
+	/* push/pop work regs */
+	code_size += 2;
+	/* call */
+	code_size += 2;
+	/* handle retval */
+	code_size += 2;
+
+	stack_size = 0;
+	hasthis = sig->hasthis ? 1 : 0;
+
+	aregs = ARM_NUM_ARG_REGS - hasthis;
+
+	for (i = 0, regc = aregs; i < sig->param_count; ++i) {
+		param = sig->params [i];
+
+		/* keep track of argument sizes */
+		if (i < ARM_NUM_ARG_REGS) reg_alloc [i] = 0;
+
+		if (param->byref) {
+			if (regc > 0) {
+				code_size += 1;
+				reg_alloc [i] = regc;
+				--regc;
+			} else {
+				code_size += 2;
+				stack_size += sizeof(gpointer);
+			}
+		} else {
+			simple_type = param->type;
+enum_calc_size:
+			switch (simple_type) {
+			case MONO_TYPE_BOOLEAN:
+			case MONO_TYPE_CHAR:
+			case MONO_TYPE_I1:
+			case MONO_TYPE_U1:
+			case MONO_TYPE_I2:
+			case MONO_TYPE_U2:
+			case MONO_TYPE_I4:
+			case MONO_TYPE_U4:
+			case MONO_TYPE_I:
+			case MONO_TYPE_U:
+			case MONO_TYPE_PTR:
+			case MONO_TYPE_R4:
+			case MONO_TYPE_SZARRAY:
+			case MONO_TYPE_CLASS:
+			case MONO_TYPE_OBJECT:
+			case MONO_TYPE_STRING:
+				if (regc > 0) {
+					/* register arg */
+					code_size += 1;
+					reg_alloc [i] = regc;
+					--regc;
+				} else {
+					/* stack arg */
+					code_size += 2;
+					stack_size += 4;
+				}
+				break;
+			case MONO_TYPE_I8:
+			case MONO_TYPE_U8:
+			case MONO_TYPE_R8:
+				/* keep track of argument sizes */
+				if (regc > 1) {
+					/* fits into registers, two LDRs */
+					code_size += 2;
+					reg_alloc [i] = regc;
+					regc -= 2;
+				} else if (regc > 0) {
+					/* first half fits into register, one LDR */
+					code_size += 1;
+					reg_alloc [i] = regc;
+					--regc;
+					/* the rest on the stack, LDR/STR */
+					code_size += 2;
+					stack_size += 4;
+				} else {
+					/* stack arg, 4 instrs - 2x(LDR/STR) */
+					code_size += 4;
+					stack_size += 2 * 4;
+				}
+				break;
+			case MONO_TYPE_VALUETYPE:
+				if (param->data.klass->enumtype) {
+					simple_type = param->data.klass->enum_basetype->type;
+					goto enum_calc_size;
+				}
+
+				if (mono_class_value_size(param->data.klass, NULL) != 4) {
+					g_error("can only marshal enums, not generic structures (size: %d)", mono_class_value_size(param->data.klass, NULL));
+				}
+				/*
+				 * Value types are marshalled with an extra dereferencing
+				 * LDR (see the second pass), so count one more
+				 * instruction than for a plain word argument.
+				 */
+				if (regc > 0) {
+					/* register arg: LDR + LDR */
+					code_size += 2;
+					reg_alloc [i] = regc;
+					--regc;
+				} else {
+					/* stack arg: LDR + LDR + STR */
+					code_size += 3;
+					stack_size += 4;
+				}
+				break;
+			default :
+				break;
+			}
+		}
+	}
+
+	code_buff = (MonoPIFunc)alloc_code_buff(code_size);
+	p = (arminstr_t*)code_buff;
+
+	/* prologue */
+	p = arm_emit_lean_prologue(p, stack_size,
+	        /* save workset (r4-r7) */
+	        (1 << ARMREG_R4) | (1 << ARMREG_R5) | (1 << ARMREG_R6) | (1 << ARMREG_R7));
+
+
+	/* copy args into workset */
+	/* callme - always present */
+	ARM_MOV_REG_REG(p, ARMREG_R4, ARMREG_A1);
+	/* retval */
+	if (sig->ret->byref || string_ctor || (sig->ret->type != MONO_TYPE_VOID)) {
+		ARM_MOV_REG_REG(p, ARMREG_R5, ARMREG_A2);
+	}
+	/* this_obj */
+	if (sig->hasthis) {
+		this_loaded = 0;
+		if (stack_size == 0) {
+			ARM_MOV_REG_REG(p, ARMREG_A1, ARMREG_A3);
+			this_loaded = 1;
+		} else {
+			ARM_MOV_REG_REG(p, ARMREG_R6, ARMREG_A3);
+		}
+	}
+	/* args */
+	if (sig->param_count != 0) {
+		ARM_MOV_REG_REG(p, ARMREG_R7, ARMREG_A4);
+	}
+
+	stack_offs = stack_size;
+
+	/* handle arguments */
+	/* in reverse order so we could use r0 (arg1) for memory transfers */
+	for (i = sig->param_count; --i >= 0;) {
+		param = sig->params [i];
+		if (param->byref) {
+			if (i < aregs && reg_alloc[i] > 0) {
+				/*
+				 * Same register formula as the by-value cases, so that
+				 * [this] and preceding 64-bit arguments are accounted for.
+				 */
+				ARM_LDR_IMM(p, ARMREG_A1 + hasthis + (aregs - reg_alloc [i]), REG_ARGP, i*ARG_SIZE);
+			} else {
+				stack_offs -= sizeof(armword_t);
+				ARM_LDR_IMM(p, ARMREG_R0, REG_ARGP, i*ARG_SIZE);
+				ARM_STR_IMM(p, ARMREG_R0, ARMREG_SP, stack_offs);
+			}
+		} else {
+			simple_type = param->type;
+enum_marshal:
+			switch (simple_type) {
+			case MONO_TYPE_BOOLEAN:
+			case MONO_TYPE_CHAR:
+			case MONO_TYPE_I1:
+			case MONO_TYPE_U1:
+			case MONO_TYPE_I2:
+			case MONO_TYPE_U2:
+			case MONO_TYPE_I4:
+			case MONO_TYPE_U4:
+			case MONO_TYPE_I:
+			case MONO_TYPE_U:
+			case MONO_TYPE_PTR:
+			case MONO_TYPE_R4:
+			case MONO_TYPE_SZARRAY:
+			case MONO_TYPE_CLASS:
+			case MONO_TYPE_OBJECT:
+			case MONO_TYPE_STRING:
+				if (i < aregs && reg_alloc [i] > 0) {
+					/* pass in register */
+					ARM_LDR_IMM(p, ARMREG_A1 + hasthis + (aregs - reg_alloc [i]), REG_ARGP, i*ARG_SIZE);
+				} else {
+					stack_offs -= sizeof(armword_t);
+					ARM_LDR_IMM(p, ARMREG_R0, REG_ARGP, i*ARG_SIZE);
+					ARM_STR_IMM(p, ARMREG_R0, ARMREG_SP, stack_offs);
+				}
+				break;
+			case MONO_TYPE_I8:
+			case MONO_TYPE_U8:
+			case MONO_TYPE_R8:
+				if (i < aregs && reg_alloc [i] > 0) {
+					if (reg_alloc [i] > 1) {
+						/* pass in registers */
+						ARM_LDR_IMM(p, ARMREG_A1 + hasthis + (aregs - reg_alloc [i]), REG_ARGP, i*ARG_SIZE);
+						ARM_LDR_IMM(p, ARMREG_A1 + hasthis + (aregs - reg_alloc [i]) + 1, REG_ARGP, i*ARG_SIZE + 4);
+					} else {
+						/* split: second word on the stack, first word in the last register */
+						stack_offs -= sizeof(armword_t);
+						ARM_LDR_IMM(p, ARMREG_R0, REG_ARGP, i*ARG_SIZE + 4);
+						ARM_STR_IMM(p, ARMREG_R0, ARMREG_SP, stack_offs);
+						ARM_LDR_IMM(p, ARMREG_A1 + hasthis + (aregs - reg_alloc [i]), REG_ARGP, i*ARG_SIZE);
+					}
+				} else {
+					/* two words transferred on the stack */
+					stack_offs -= 2*sizeof(armword_t);
+					ARM_LDR_IMM(p, ARMREG_R0, REG_ARGP, i*ARG_SIZE);
+					ARM_STR_IMM(p, ARMREG_R0, ARMREG_SP, stack_offs);
+					ARM_LDR_IMM(p, ARMREG_R0, REG_ARGP, i*ARG_SIZE + 4);
+					ARM_STR_IMM(p, ARMREG_R0, ARMREG_SP, stack_offs + 4);
+				}
+				break;
+			case MONO_TYPE_VALUETYPE:
+				if (param->data.klass->enumtype) {
+					/* it's an enum value, proceed based on its base type */
+					simple_type = param->data.klass->enum_basetype->type;
+					goto enum_marshal;
+				} else {
+					if (i < aregs && reg_alloc[i] > 0) {
+						/* hasthis is added exactly once, as in every other case */
+						int vtreg = ARMREG_A1 + hasthis + (aregs - reg_alloc[i]);
+						/* load the pointer stored in the stackval, then the value itself */
+						ARM_LDR_IMM(p, vtreg, REG_ARGP, i * ARG_SIZE);
+						ARM_LDR_IMM(p, vtreg, vtreg, 0);
+					} else {
+						stack_offs -= sizeof(armword_t);
+						ARM_LDR_IMM(p, ARMREG_R0, REG_ARGP, i * ARG_SIZE);
+						ARM_LDR_IMM(p, ARMREG_R0, ARMREG_R0, 0);
+						ARM_STR_IMM(p, ARMREG_R0, ARMREG_SP, stack_offs);
+					}
+				}
+				break;
+
+			default:
+				break;
+			}
+		}
+	}
+
+	if (sig->hasthis && !this_loaded) {
+		/* [this] always passed in A1, regardless of sig->call_convention */
+		ARM_MOV_REG_REG(p, ARMREG_A1, REG_THIS);
+	}
+
+	/* call [func] */
+	ARM_MOV_REG_REG(p, ARMREG_LR, ARMREG_PC);
+	ARM_MOV_REG_REG(p, ARMREG_PC, REG_FUNC_ADDR);
+
+	/* handle retval */
+	if (sig->ret->byref || string_ctor) {
+		ARM_STR_IMM(p, ARMREG_R0, REG_RETVAL, 0);
+	} else {
+		simple_type = sig->ret->type;
+enum_retvalue:
+		switch (simple_type) {
+		case MONO_TYPE_BOOLEAN:
+		case MONO_TYPE_I1:
+		case MONO_TYPE_U1:
+			ARM_STRB_IMM(p, ARMREG_R0, REG_RETVAL, 0);
+			break;
+		case MONO_TYPE_CHAR:
+		case MONO_TYPE_I2:
+		case MONO_TYPE_U2:
+			ARM_STRH_IMM(p, ARMREG_R0, REG_RETVAL, 0);
+			break;
+		/*
+		 * A 32-bit integer and integer-equivalent return value
+		 * is returned in R0.
+		 * Single-precision floating-point values are returned in R0.
+		 */
+		case MONO_TYPE_I:
+		case MONO_TYPE_U:
+		case MONO_TYPE_I4:
+		case MONO_TYPE_U4:
+		case MONO_TYPE_R4:
+		case MONO_TYPE_OBJECT:
+		case MONO_TYPE_CLASS:
+		case MONO_TYPE_ARRAY:
+		case MONO_TYPE_SZARRAY:
+		case MONO_TYPE_STRING:
+			ARM_STR_IMM(p, ARMREG_R0, REG_RETVAL, 0);
+			break;
+		/*
+		 * A 64-bit integer is returned in R0 and R1.
+		 * Double-precision floating-point values are returned in R0 and R1.
+		 */
+		case MONO_TYPE_I8:
+		case MONO_TYPE_U8:
+		case MONO_TYPE_R8:
+			ARM_STR_IMM(p, ARMREG_R0, REG_RETVAL, 0);
+			ARM_STR_IMM(p, ARMREG_R1, REG_RETVAL, 4);
+			break;
+		case MONO_TYPE_VALUETYPE:
+			if (sig->ret->data.klass->enumtype) {
+				simple_type = sig->ret->data.klass->enum_basetype->type;
+				goto enum_retvalue;
+			}
+			break;
+		case MONO_TYPE_VOID:
+			break;
+		default:
+			break;
+		}
+	}
+
+	p = arm_emit_std_epilogue(p, stack_size,
+	        /* restore R4-R7 */
+	        (1 << ARMREG_R4) | (1 << ARMREG_R5) | (1 << ARMREG_R6) | (1 << ARMREG_R7));
+
+	flush_icache();
+
+#ifdef ARM_DUMP_DISASM
+	_armdis_decode((arminstr_t*)code_buff, ((guint8*)p) - ((guint8*)code_buff));
+#endif
+
+	return code_buff;
+}
+
+
+
+#define MINV_OFFS(member) G_STRUCT_OFFSET(MonoInvocation, member)
+
+
+
+/*
+ * Returns a pointer to a native function that can be used to
+ * call the specified method.
+ * The function created will receive the arguments according
+ * to the call convention specified in the method.
+ * This function works by creating a MonoInvocation structure,
+ * filling the fields in and calling ves_exec_method on it.
+ * Still need to figure out how to handle the exception stuff
+ * across the managed/unmanaged boundary.
+ *
+ * Layout of the generated code: a branch over the embedded data, the
+ * 'Mono' magic bytes, the MonoMethod pointer, a two-entry call table
+ * (stackval_from_data, ves_exec_method), then the code proper.
+ *
+ * NOTE(review): the buffer is a fixed 128 instructions regardless of
+ * sig->param_count -- confirm this is enough for the signatures in use.
+ */
+void* mono_arch_create_method_pointer (MonoMethod* method)
+{
+	MonoMethodSignature* sig;
+	guchar* p, * p_method, * p_stackval_from_data, * p_exec;
+	void* code_buff;
+	int i, stack_size, arg_pos, arg_add, stackval_pos, offs;
+	int areg, reg_args, shift, pos;
+	MonoJitInfo *ji;
+
+	code_buff = alloc_code_buff(128);
+	p = (guchar*)code_buff;
+
+	sig = method->signature;
+
+	ARM_B(p, 3);
+
+	/* embed magic number followed by method pointer */
+	*p++ = 'M';
+	*p++ = 'o';
+	*p++ = 'n';
+	*p++ = 'o';
+	/* method ptr */
+	*(void**)p = method;
+	p_method = p;
+	p += 4;
+
+	/* call table */
+	*(void**)p = stackval_from_data;
+	p_stackval_from_data = p;
+	p += 4;
+	*(void**)p = ves_exec_method;
+	p_exec = p;
+	p += 4;
+
+	stack_size = sizeof(MonoInvocation) + ARG_SIZE*(sig->param_count + 1) + ARM_NUM_ARG_REGS*2*sizeof(armword_t);
+
+	/* prologue */
+	p = (guchar*)arm_emit_lean_prologue((arminstr_t*)p, stack_size,
+	    (1 << ARMREG_R4) |
+	    (1 << ARMREG_R5) |
+	    (1 << ARMREG_R6) |
+	    (1 << ARMREG_R7));
+
+	/* R7 - ptr to stack args */
+	ARM_MOV_REG_REG(p, ARMREG_R7, ARMREG_IP);
+
+	/*
+	 * Initialize MonoInvocation fields, first the ones known now.
+	 */
+	ARM_MOV_REG_IMM8(p, ARMREG_R4, 0);
+	ARM_STR_IMM(p, ARMREG_R4, ARMREG_SP, MINV_OFFS(ex));
+	ARM_STR_IMM(p, ARMREG_R4, ARMREG_SP, MINV_OFFS(ex_handler));
+	ARM_STR_IMM(p, ARMREG_R4, ARMREG_SP, MINV_OFFS(parent));
+
+	/* Set the method pointer. */
+	ARM_LDR_IMM(p, ARMREG_R4, ARMREG_PC, -(int)(p - p_method + sizeof(arminstr_t)*2));
+	ARM_STR_IMM(p, ARMREG_R4, ARMREG_SP, MINV_OFFS(method));
+
+	if (sig->hasthis) {
+		/* [this] in A1 */
+		ARM_STR_IMM(p, ARMREG_A1, ARMREG_SP, MINV_OFFS(obj));
+	} else {
+		/* else set minv.obj to NULL */
+		ARM_STR_IMM(p, ARMREG_R4, ARMREG_SP, MINV_OFFS(obj));
+	}
+
+	/* copy args from registers to stack */
+	areg = ARMREG_A1 + sig->hasthis;
+	arg_pos = -(int)(ARM_NUM_ARG_REGS - sig->hasthis) * 2 * sizeof(armword_t);
+	arg_add = 0;
+	for (i = 0; i < sig->param_count; ++i) {
+		if (areg >= ARM_NUM_ARG_REGS) break;
+		ARM_STR_IMM(p, areg, ARMREG_R7, arg_pos);
+		++areg;
+		if (!sig->params[i]->byref) {
+			switch (sig->params[i]->type) {
+			case MONO_TYPE_I8:
+			case MONO_TYPE_U8:
+			case MONO_TYPE_R8:
+				if (areg >= ARM_NUM_ARG_REGS) {
+					/* load second half of 64-bit arg */
+					ARM_LDR_IMM(p, ARMREG_R4, ARMREG_R7, 0);
+					ARM_STR_IMM(p, ARMREG_R4, ARMREG_R7, arg_pos + sizeof(armword_t));
+					arg_add = sizeof(armword_t);
+				} else {
+					/* second half is already the register */
+					ARM_STR_IMM(p, areg, ARMREG_R7, arg_pos + sizeof(armword_t));
+					++areg;
+				}
+				break;
+			case MONO_TYPE_VALUETYPE:
+				/* assert */
+			default:
+				break;
+			}
+		}
+		arg_pos += 2 * sizeof(armword_t);
+	}
+	/* number of args passed in registers */
+	reg_args = i;
+
+
+
+	/*
+	 * Calc and save stack args ptr,
+	 * args follow MonoInvocation struct on the stack.
+	 */
+	ARM_ADD_REG_IMM8(p, ARMREG_R1, ARMREG_SP, sizeof(MonoInvocation));
+	ARM_STR_IMM(p, ARMREG_R1, ARMREG_SP, MINV_OFFS(stack_args));
+
+	/* convert method args to stackvals */
+	arg_pos = -(int)(ARM_NUM_ARG_REGS - sig->hasthis) * 2 * sizeof(armword_t);
+	stackval_pos = sizeof(MonoInvocation);
+	for (i = 0; i < sig->param_count; ++i) {
+		/* A3 = address of the raw argument data */
+		if (i < reg_args) {
+			ARM_SUB_REG_IMM8(p, ARMREG_A3, ARMREG_R7, -arg_pos);
+			arg_pos += 2 * sizeof(armword_t);
+		} else {
+			if (arg_pos < 0) arg_pos = 0;
+			pos = arg_pos + arg_add;
+			if (pos <= 0xFF) {
+				ARM_ADD_REG_IMM8(p, ARMREG_A3, ARMREG_R7, pos);
+			} else {
+				if (is_arm_const((armword_t)pos)) {
+					shift = calc_arm_mov_const_shift((armword_t)pos);
+					ARM_ADD_REG_IMM(p, ARMREG_A3, ARMREG_R7, pos >> ((32 - shift) & 31), shift >> 1);
+				} else {
+					p = (guchar*)arm_mov_reg_imm32((arminstr_t*)p, ARMREG_R6, (armword_t)pos);
+					/* destination is A3 like the other branches; A2 is overwritten below */
+					ARM_ADD_REG_REG(p, ARMREG_A3, ARMREG_R7, ARMREG_R6);
+				}
+			}
+			arg_pos += sizeof(armword_t);
+			if (!sig->params[i]->byref) {
+				switch (sig->params[i]->type) {
+				case MONO_TYPE_I8:
+				case MONO_TYPE_U8:
+				case MONO_TYPE_R8:
+					arg_pos += sizeof(armword_t);
+					break;
+				case MONO_TYPE_VALUETYPE:
+					/* assert */
+				default:
+					break;
+				}
+			}
+		}
+
+		/* A2 = result */
+		if (stackval_pos <= 0xFF) {
+			ARM_ADD_REG_IMM8(p, ARMREG_A2, ARMREG_SP, stackval_pos);
+		} else {
+			if (is_arm_const((armword_t)stackval_pos)) {
+				shift = calc_arm_mov_const_shift((armword_t)stackval_pos);
+				ARM_ADD_REG_IMM(p, ARMREG_A2, ARMREG_SP, stackval_pos >> ((32 - shift) & 31), shift >> 1);
+			} else {
+				p = (guchar*)arm_mov_reg_imm32((arminstr_t*)p, ARMREG_R6, (armword_t)stackval_pos);
+				ARM_ADD_REG_REG(p, ARMREG_A2, ARMREG_SP, ARMREG_R6);
+			}
+		}
+
+		/* A1 = type */
+		p = (guchar*)arm_mov_reg_imm32((arminstr_t*)p, ARMREG_A1, (armword_t)sig->params [i]);
+
+		stackval_pos += ARG_SIZE;
+
+		offs = -(p + 2*sizeof(arminstr_t) - p_stackval_from_data);
+		/* load function address */
+		ARM_LDR_IMM(p, ARMREG_R4, ARMREG_PC, offs);
+		/* call stackval_from_data */
+		ARM_MOV_REG_REG(p, ARMREG_LR, ARMREG_PC);
+		ARM_MOV_REG_REG(p, ARMREG_PC, ARMREG_R4);
+	}
+
+	/* store retval ptr: R5 = SP + stackval_pos (the offset was just loaded into R5) */
+	p = (guchar*)arm_mov_reg_imm32((arminstr_t*)p, ARMREG_R5, (armword_t)stackval_pos);
+	ARM_ADD_REG_REG(p, ARMREG_R5, ARMREG_SP, ARMREG_R5);
+	ARM_STR_IMM(p, ARMREG_R5, ARMREG_SP, MINV_OFFS(retval));
+
+	/*
+	 * Call the method.
+	 */
+	/* A1 = MonoInvocation ptr */
+	ARM_MOV_REG_REG(p, ARMREG_A1, ARMREG_SP);
+	offs = -(p + 2*sizeof(arminstr_t) - p_exec);
+	/* load function address */
+	ARM_LDR_IMM(p, ARMREG_R4, ARMREG_PC, offs);
+	/* call ves_exec */
+	ARM_MOV_REG_REG(p, ARMREG_LR, ARMREG_PC);
+	ARM_MOV_REG_REG(p, ARMREG_PC, ARMREG_R4);
+
+
+	/*
+	 * Move retval into reg.
+	 */
+	if (sig->ret->byref) {
+		ARM_LDR_IMM(p, ARMREG_R0, ARMREG_R5, 0);
+	} else {
+		switch (sig->ret->type) {
+		case MONO_TYPE_BOOLEAN:
+		case MONO_TYPE_I1:
+		case MONO_TYPE_U1:
+			ARM_LDRB_IMM(p, ARMREG_R0, ARMREG_R5, 0);
+			break;
+		case MONO_TYPE_CHAR:
+		case MONO_TYPE_I2:
+		case MONO_TYPE_U2:
+			ARM_LDRH_IMM(p, ARMREG_R0, ARMREG_R5, 0);
+			break;
+		case MONO_TYPE_I:
+		case MONO_TYPE_U:
+		case MONO_TYPE_I4:
+		case MONO_TYPE_U4:
+		case MONO_TYPE_R4:
+		case MONO_TYPE_OBJECT:
+		case MONO_TYPE_CLASS:
+		case MONO_TYPE_ARRAY:
+		case MONO_TYPE_SZARRAY:
+		/* strings are word-sized references, as in mono_arch_create_trampoline */
+		case MONO_TYPE_STRING:
+			ARM_LDR_IMM(p, ARMREG_R0, ARMREG_R5, 0);
+			break;
+		case MONO_TYPE_I8:
+		case MONO_TYPE_U8:
+		case MONO_TYPE_R8:
+			ARM_LDR_IMM(p, ARMREG_R0, ARMREG_R5, 0);
+			ARM_LDR_IMM(p, ARMREG_R1, ARMREG_R5, 4);
+			break;
+		case MONO_TYPE_VOID:
+		default:
+			break;
+		}
+	}
+
+
+	p = (guchar*)arm_emit_std_epilogue((arminstr_t*)p, stack_size,
+	    (1 << ARMREG_R4) |
+	    (1 << ARMREG_R5) |
+	    (1 << ARMREG_R6) |
+	    (1 << ARMREG_R7));
+
+	flush_icache();
+
+#ifdef ARM_DUMP_DISASM
+	_armdis_decode((arminstr_t*)code_buff, ((guint8*)p) - ((guint8*)code_buff));
+#endif
+
+	/* register the generated code range with the root domain's JIT info table */
+	ji = g_new0(MonoJitInfo, 1);
+	ji->method = method;
+	ji->code_size = ((guint8 *) p) - ((guint8 *) code_buff);
+	ji->code_start = (gpointer) code_buff;
+
+	mono_jit_info_table_add(mono_get_root_domain (), ji);
+
+	return code_buff;
+}
+
+
+/*
+ * mono_arch_create_method_pointer () embeds the 'Mono' magic bytes and a
+ * MonoMethod pointer right after the leading branch instruction, so that
+ * the interp can easily get at the data.  This function reads that
+ * pointer back out of the code stream.
+ *
+ * Returns NULL when the magic bytes are not present at offset 4.
+ */
+MonoMethod* mono_method_pointer_get (void* code)
+{
+	const unsigned char* bytes = code;
+
+	/* the magic number follows the unconditional branch (first 4 bytes) */
+	if (bytes[4] != 'M' || bytes[5] != 'o' || bytes[6] != 'n' || bytes[7] != 'o')
+		return NULL;
+
+	/* the method pointer is the third pointer-sized slot of the stream */
+	return ((MonoMethod**)code)[2];
+}
+#endif
--- a/lib/ffts/src/arch/arm/vfp_macros.th
+++ b/lib/ffts/src/arch/arm/vfp_macros.th
@ -0,0 +1,15 @@
+/* -- <Op> -- */
+
+
+/*
+ * Fd := Fn <Op> Fm
+ *
+ * Template for the dyadic VFP operations; vfpops.sh replaces <Op> with
+ * each opcode of its DYADIC list.  The D macros emit with
+ * ARM_VFP_COPROC_DOUBLE, the S macros with ARM_VFP_COPROC_SINGLE; the
+ * forms without _COND use ARMCOND_AL.
+ */
+#define ARM_VFP_<Op>D_COND(p, rd, rn, rm, cond) \
+	ARM_EMIT((p), ARM_DEF_VFP_DYADIC(cond,ARM_VFP_COPROC_DOUBLE,ARM_VFP_<Op>,rd,rn,rm))
+#define ARM_VFP_<Op>D(p, rd, rn, rm) \
+	ARM_VFP_<Op>D_COND(p, rd, rn, rm, ARMCOND_AL)
+
+#define ARM_VFP_<Op>S_COND(p, rd, rn, rm, cond) \
+	ARM_EMIT((p), ARM_DEF_VFP_DYADIC(cond,ARM_VFP_COPROC_SINGLE,ARM_VFP_<Op>,rd,rn,rm))
+#define ARM_VFP_<Op>S(p, rd, rn, rm) \
+	ARM_VFP_<Op>S_COND(p, rd, rn, rm, ARMCOND_AL)
+
+
--- a/lib/ffts/src/arch/arm/vfpm_macros.th
+++ b/lib/ffts/src/arch/arm/vfpm_macros.th
@ -0,0 +1,14 @@
+/* -- <Op> -- */
+
+
+/*
+ * Fd := <Op> Fm
+ *
+ * Template for the monadic VFP operations; vfpops.sh replaces <Op> with
+ * each opcode of its MONADIC list.  The D macros emit with
+ * ARM_VFP_COPROC_DOUBLE, the S macros with ARM_VFP_COPROC_SINGLE; the
+ * forms without _COND use ARMCOND_AL.
+ */
+
+#define ARM_<Op>D_COND(p,dreg,sreg,cond) \
+        ARM_EMIT((p), ARM_DEF_VFP_MONADIC((cond),ARM_VFP_COPROC_DOUBLE,ARM_VFP_<Op>,(dreg),(sreg)))
+#define ARM_<Op>D(p,dreg,sreg)      ARM_<Op>D_COND(p,dreg,sreg,ARMCOND_AL)
+
+#define ARM_<Op>S_COND(p,dreg,sreg,cond) \
+        ARM_EMIT((p), ARM_DEF_VFP_MONADIC((cond),ARM_VFP_COPROC_SINGLE,ARM_VFP_<Op>,(dreg),(sreg)))
+#define ARM_<Op>S(p,dreg,sreg)      ARM_<Op>S_COND(p,dreg,sreg,ARMCOND_AL)
+
+
--- a/lib/ffts/src/arch/arm/vfpops.sh
+++ b/lib/ffts/src/arch/arm/vfpops.sh
@ -0,0 +1,24 @@
+#!/bin/sh
+
+DYADIC="ADD SUB MUL NMUL DIV"
+MONADIC="CPY ABS NEG SQRT CMP CMPE CMPZ CMPEZ CVT UITO SITO TOUI TOSI TOUIZ TOSIZ"
+
+# $1: opcode list
+# $2: template
+gen() {
+	for i in $1; do
+		sed "s/<Op>/$i/g" $2.th
+	done
+}
+
+echo -e "/* Macros for VFP ops, auto-generated from template */\n"
+
+echo -e "\n/* dyadic */\n"
+gen "$DYADIC" vfp_macros
+
+echo -e "\n/* monadic */\n"
+gen "$MONADIC" vfpm_macros
+
+echo -e "\n\n"
+
+echo -e "\n/* end generated */\n"
--- a/lib/ffts/src/arch/arm64/.gitignore
+++ b/lib/ffts/src/arch/arm64/.gitignore
@ -0,0 +1,6 @@
+/
+/Makefile
+/Makefile.in
+/*.o
+/*.lo
+/.deps
--- a/lib/ffts/src/arch/arm64/Makefile.am
+++ b/lib/ffts/src/arch/arm64/Makefile.am
--- a/lib/ffts/src/arch/arm64/arm64-codegen.h
+++ b/lib/ffts/src/arch/arm64/arm64-codegen.h
@ -0,0 +1,3 @@
+#include "../../../../mono-extensions/mono/arch/arm64/arm64-codegen.h"
+
+
--- a/lib/ffts/src/arch/ia64/.gitignore
+++ b/lib/ffts/src/arch/ia64/.gitignore
@ -0,0 +1,2 @@
+/Makefile
+/Makefile.in
--- a/lib/ffts/src/arch/ia64/Makefile.am
+++ b/lib/ffts/src/arch/ia64/Makefile.am
@ -0,0 +1,3 @@
+EXTRA_DIST = ia64-codegen.h
+
+
--- a/lib/ffts/src/arch/ia64/codegen.c
+++ b/lib/ffts/src/arch/ia64/codegen.c
@ -0,0 +1,861 @@
+/*
+ * codegen.c: Tests for the IA64 code generation macros
+ */
+
+#include <glib.h>
+#include <stdio.h>
+#include <ctype.h>
+
+#define IA64_SIMPLE_EMIT_BUNDLE
+
+#include <mono/arch/ia64/ia64-codegen.h>
+
+/*
+ * mono_disassemble_code:
+ * @code: bytes to disassemble
+ * @size: number of bytes in @code
+ * @id: label to put in front of the bytes; characters that are not
+ *      alphanumeric are written as '_'
+ *
+ * Writes @code as ".byte" directives to <tmpdir>/test.s, assembles that file
+ * to <tmpdir>/test.o and runs objdump on the result. Extra objdump arguments
+ * are taken from the MONO_OBJDUMP_ARGS environment variable. The tools run
+ * through system(), so their output goes wherever this process's goes.
+ */
+void
+mono_disassemble_code (guint8 *code, int size, char *id)
+{
+	int i;
+	FILE *ofd;
+	const char *tmp = g_get_tmp_dir ();
+	const char *objdump_args = g_getenv ("MONO_OBJDUMP_ARGS");
+	char *as_file;
+	char *o_file;
+	char *cmd;
+	int as_status;
+
+	as_file = g_strdup_printf ("%s/test.s", tmp);
+
+	if (!(ofd = fopen (as_file, "w")))
+		g_assert_not_reached ();
+
+	for (i = 0; id [i]; ++i) {
+		/* the <ctype.h> functions are only defined for unsigned char
+		 * values (and EOF), so a negative plain char must be converted */
+		if (!isalnum ((unsigned char) id [i]))
+			fprintf (ofd, "_");
+		else
+			fprintf (ofd, "%c", id [i]);
+	}
+	fprintf (ofd, ":\n");
+
+	for (i = 0; i < size; ++i)
+		fprintf (ofd, ".byte %u\n", (unsigned int) code [i]);
+
+	fclose (ofd);
+
+	/* native tools on ia64, the cross binutils everywhere else */
+#ifdef __ia64__
+#define DIS_CMD "objdump -d"
+#define AS_CMD "as"
+#else
+#define DIS_CMD "ia64-linux-gnu-objdump -d"
+#define AS_CMD "ia64-linux-gnu-as"
+#endif
+
+	o_file = g_strdup_printf ("%s/test.o", tmp);
+	cmd = g_strdup_printf (AS_CMD " %s -o %s", as_file, o_file);
+	as_status = system (cmd);
+	g_free (cmd);
+
+	if (as_status != 0) {
+		/* without a fresh object file objdump would fail or, worse,
+		 * show a stale test.o left over from an earlier run */
+		g_warning ("'%s' failed (status %d), skipping disassembly", AS_CMD, as_status);
+	} else {
+		if (!objdump_args)
+			objdump_args = "";
+
+		cmd = g_strdup_printf (DIS_CMD " %s %s", objdump_args, o_file);
+		system (cmd);
+		g_free (cmd);
+	}
+
+	g_free (o_file);
+	g_free (as_file);
+}
+
+/*
+ * Test driver: emits one instance of (nearly) every ia64 code generation
+ * macro into a scratch buffer and hands the result to
+ * mono_disassemble_code() so the encodings can be inspected with objdump.
+ * Always returns 0; checking the disassembly is left to the reader.
+ */
+int
+main (void)
+{
+	Ia64CodegenState code;
+
+	guint8 *buf = g_malloc0 (40960);
+
+	ia64_codegen_init (code, buf);
+
+	ia64_add (code, 1, 2, 3);
+	ia64_add1 (code, 1, 2, 3);
+	ia64_sub (code, 1, 2, 3);
+	ia64_sub1 (code, 1, 2, 3);
+	ia64_addp4 (code, 1, 2, 3);
+	ia64_and (code, 1, 2, 3);
+	ia64_andcm (code, 1, 2, 3);
+	ia64_or (code, 1, 2, 3);
+	ia64_xor (code, 1, 2, 3);
+	ia64_shladd (code, 1, 2, 3, 4);
+	ia64_shladdp4 (code, 1, 2, 3, 4);
+	ia64_sub_imm (code, 1, 0x7f, 2);
+	ia64_sub_imm (code, 1, -1, 2);
+	ia64_and_imm (code, 1, -128, 2);
+	ia64_andcm_imm (code, 1, -128, 2);
+	ia64_or_imm (code, 1, -128, 2);
+	ia64_xor_imm (code, 1, -128, 2);
+	ia64_adds_imm (code, 1, 8191, 2);
+	ia64_adds_imm (code, 1, -8192, 2);
+	ia64_adds_imm (code, 1, 1234, 2);
+	ia64_adds_imm (code, 1, -1234, 2);
+	ia64_addp4_imm (code, 1, -1234, 2);
+	ia64_addl_imm (code, 1, 1234, 2);
+	ia64_addl_imm (code, 1, -1234, 2);
+	ia64_addl_imm (code, 1, 2097151, 2);
+	ia64_addl_imm (code, 1, -2097152, 2);
+
+	ia64_cmp_lt (code, 1, 2, 1, 2);
+	ia64_cmp_ltu (code, 1, 2, 1, 2);
+	ia64_cmp_eq (code, 1, 2, 1, 2);
+	ia64_cmp_lt_unc (code, 1, 2, 1, 2);
+	ia64_cmp_ltu_unc (code, 1, 2, 1, 2);
+	ia64_cmp_eq_unc (code, 1, 2, 1, 2);
+	ia64_cmp_eq_and (code, 1, 2, 1, 2);
+	ia64_cmp_eq_or (code, 1, 2, 1, 2);
+	ia64_cmp_eq_or_andcm (code, 1, 2, 1, 2);
+	ia64_cmp_ne_and (code, 1, 2, 1, 2);
+	ia64_cmp_ne_or (code, 1, 2, 1, 2);
+	ia64_cmp_ne_or_andcm (code, 1, 2, 1, 2);
+
+	ia64_cmp4_lt (code, 1, 2, 1, 2);
+	ia64_cmp4_ltu (code, 1, 2, 1, 2);
+	ia64_cmp4_eq (code, 1, 2, 1, 2);
+	ia64_cmp4_lt_unc (code, 1, 2, 1, 2);
+	ia64_cmp4_ltu_unc (code, 1, 2, 1, 2);
+	ia64_cmp4_eq_unc (code, 1, 2, 1, 2);
+	ia64_cmp4_eq_and (code, 1, 2, 1, 2);
+	ia64_cmp4_eq_or (code, 1, 2, 1, 2);
+	ia64_cmp4_eq_or_andcm (code, 1, 2, 1, 2);
+	ia64_cmp4_ne_and (code, 1, 2, 1, 2);
+	ia64_cmp4_ne_or (code, 1, 2, 1, 2);
+	ia64_cmp4_ne_or_andcm (code, 1, 2, 1, 2);
+
+	ia64_cmp_gt_and (code, 1, 2, 0, 2);
+	ia64_cmp_gt_or (code, 1, 2, 0, 2);
+	ia64_cmp_gt_or_andcm (code, 1, 2, 0, 2);
+	ia64_cmp_le_and (code, 1, 2, 0, 2);
+	ia64_cmp_le_or (code, 1, 2, 0, 2);
+	ia64_cmp_le_or_andcm (code, 1, 2, 0, 2);
+	ia64_cmp_ge_and (code, 1, 2, 0, 2);
+	ia64_cmp_ge_or (code, 1, 2, 0, 2);
+	ia64_cmp_ge_or_andcm (code, 1, 2, 0, 2);
+	ia64_cmp_lt_and (code, 1, 2, 0, 2);
+	ia64_cmp_lt_or (code, 1, 2, 0, 2);
+	ia64_cmp_lt_or_andcm (code, 1, 2, 0, 2);
+
+	ia64_cmp4_gt_and (code, 1, 2, 0, 2);
+	ia64_cmp4_gt_or (code, 1, 2, 0, 2);
+	ia64_cmp4_gt_or_andcm (code, 1, 2, 0, 2);
+	ia64_cmp4_le_and (code, 1, 2, 0, 2);
+	ia64_cmp4_le_or (code, 1, 2, 0, 2);
+	ia64_cmp4_le_or_andcm (code, 1, 2, 0, 2);
+	ia64_cmp4_ge_and (code, 1, 2, 0, 2);
+	ia64_cmp4_ge_or (code, 1, 2, 0, 2);
+	ia64_cmp4_ge_or_andcm (code, 1, 2, 0, 2);
+	ia64_cmp4_lt_and (code, 1, 2, 0, 2);
+	ia64_cmp4_lt_or (code, 1, 2, 0, 2);
+	ia64_cmp4_lt_or_andcm (code, 1, 2, 0, 2);
+
+	ia64_cmp_lt_imm (code, 1, 2, 127, 2);
+	ia64_cmp_lt_imm (code, 1, 2, -128, 2);
+
+	ia64_cmp_lt_imm (code, 1, 2, -128, 2);
+	ia64_cmp_ltu_imm (code, 1, 2, -128, 2);
+	ia64_cmp_eq_imm (code, 1, 2, -128, 2);
+	ia64_cmp_lt_unc_imm (code, 1, 2, -128, 2);
+	ia64_cmp_ltu_unc_imm (code, 1, 2, -128, 2);
+	ia64_cmp_eq_unc_imm (code, 1, 2, -128, 2);
+	ia64_cmp_eq_and_imm (code, 1, 2, -128, 2);
+	ia64_cmp_eq_or_imm (code, 1, 2, -128, 2);
+	/* NOTE(review): eq_unc_imm is exercised a second time here; going by
+	 * the register-form list above, this slot was probably meant for the
+	 * eq_or_andcm immediate variant -- confirm against ia64-codegen.h */
+	ia64_cmp_eq_unc_imm (code, 1, 2, -128, 2);
+	ia64_cmp_ne_and_imm (code, 1, 2, -128, 2);
+	ia64_cmp_ne_or_imm (code, 1, 2, -128, 2);
+	ia64_cmp_ne_or_andcm_imm (code, 1, 2, -128, 2);
+
+	ia64_cmp4_lt_imm (code, 1, 2, -128, 2);
+	ia64_cmp4_ltu_imm (code, 1, 2, -128, 2);
+	ia64_cmp4_eq_imm (code, 1, 2, -128, 2);
+	ia64_cmp4_lt_unc_imm (code, 1, 2, -128, 2);
+	ia64_cmp4_ltu_unc_imm (code, 1, 2, -128, 2);
+	ia64_cmp4_eq_unc_imm (code, 1, 2, -128, 2);
+	ia64_cmp4_eq_and_imm (code, 1, 2, -128, 2);
+	ia64_cmp4_eq_or_imm (code, 1, 2, -128, 2);
+	/* NOTE(review): same repetition as in the cmp group above */
+	ia64_cmp4_eq_unc_imm (code, 1, 2, -128, 2);
+	ia64_cmp4_ne_and_imm (code, 1, 2, -128, 2);
+	ia64_cmp4_ne_or_imm (code, 1, 2, -128, 2);
+	ia64_cmp4_ne_or_andcm_imm (code, 1, 2, -128, 2);
+
+	ia64_padd1 (code, 1, 2, 3);
+	ia64_padd2 (code, 1, 2, 3);
+	ia64_padd4 (code, 1, 2, 3);
+	ia64_padd1_sss (code, 1, 2, 3);
+	ia64_padd2_sss (code, 1, 2, 3);
+	ia64_padd1_uuu (code, 1, 2, 3);
+	ia64_padd2_uuu (code, 1, 2, 3);
+	ia64_padd1_uus (code, 1, 2, 3);
+	ia64_padd2_uus (code, 1, 2, 3);
+
+	ia64_psub1 (code, 1, 2, 3);
+	ia64_psub2 (code, 1, 2, 3);
+	ia64_psub4 (code, 1, 2, 3);
+	ia64_psub1_sss (code, 1, 2, 3);
+	ia64_psub2_sss (code, 1, 2, 3);
+	ia64_psub1_uuu (code, 1, 2, 3);
+	ia64_psub2_uuu (code, 1, 2, 3);
+	ia64_psub1_uus (code, 1, 2, 3);
+	ia64_psub2_uus (code, 1, 2, 3);
+
+	ia64_pavg1 (code, 1, 2, 3);
+	ia64_pavg2 (code, 1, 2, 3);
+	ia64_pavg1_raz (code, 1, 2, 3);
+	ia64_pavg2_raz (code, 1, 2, 3);
+	ia64_pavgsub1 (code, 1, 2, 3);
+	ia64_pavgsub2 (code, 1, 2, 3);
+	ia64_pcmp1_eq (code, 1, 2, 3);
+	ia64_pcmp2_eq (code, 1, 2, 3);
+	ia64_pcmp4_eq (code, 1, 2, 3);
+	ia64_pcmp1_gt (code, 1, 2, 3);
+	ia64_pcmp2_gt (code, 1, 2, 3);
+	ia64_pcmp4_gt (code, 1, 2, 3);
+
+	ia64_pshladd2 (code, 1, 2, 3, 4);
+	ia64_pshradd2 (code, 1, 2, 3, 4);
+
+	ia64_pmpyshr2 (code, 1, 2, 3, 0);
+	ia64_pmpyshr2_u (code, 1, 2, 3, 0);
+	ia64_pmpyshr2 (code, 1, 2, 3, 7);
+	ia64_pmpyshr2_u (code, 1, 2, 3, 7);
+	ia64_pmpyshr2 (code, 1, 2, 3, 15);
+	ia64_pmpyshr2_u (code, 1, 2, 3, 15);
+	ia64_pmpyshr2 (code, 1, 2, 3, 16);
+	ia64_pmpyshr2_u (code, 1, 2, 3, 16);
+
+	ia64_pmpy2_r (code, 1, 2, 3);
+	ia64_pmpy2_l (code, 1, 2, 3);
+	ia64_mix1_r (code, 1, 2, 3);
+	ia64_mix2_r (code, 1, 2, 3);
+	ia64_mix4_r (code, 1, 2, 3);
+	ia64_mix1_l (code, 1, 2, 3);
+	ia64_mix2_l (code, 1, 2, 3);
+	ia64_mix4_l (code, 1, 2, 3);
+	ia64_pack2_uss (code, 1, 2, 3);
+	ia64_pack2_sss (code, 1, 2, 3);
+	ia64_pack4_sss (code, 1, 2, 3);
+	ia64_unpack1_h (code, 1, 2, 3);
+	ia64_unpack2_h (code, 1, 2, 3);
+	ia64_unpack4_h (code, 1, 2, 3);
+	ia64_unpack1_l (code, 1, 2, 3);
+	ia64_unpack2_l (code, 1, 2, 3);
+	ia64_unpack4_l (code, 1, 2, 3);
+	ia64_pmin1_u (code, 1, 2, 3);
+	ia64_pmax1_u (code, 1, 2, 3);
+	ia64_pmin2 (code, 1, 2, 3);
+	ia64_pmax2 (code, 1, 2, 3);
+	ia64_psad1 (code, 1, 2, 3);
+
+	ia64_mux1 (code, 1, 2, IA64_MUX1_BRCST);
+	ia64_mux1 (code, 1, 2, IA64_MUX1_MIX);
+	ia64_mux1 (code, 1, 2, IA64_MUX1_SHUF);
+	ia64_mux1 (code, 1, 2, IA64_MUX1_ALT);
+	ia64_mux1 (code, 1, 2, IA64_MUX1_REV);
+
+	ia64_mux2 (code, 1, 2, 0x8d);
+
+	ia64_pshr2 (code, 1, 2, 3);
+	ia64_pshr4 (code, 1, 2, 3);
+	ia64_shr (code, 1, 2, 3);
+	ia64_pshr2_u (code, 1, 2, 3);
+	ia64_pshr4_u (code, 1, 2, 3);
+	ia64_shr_u (code, 1, 2, 3);
+
+	ia64_pshr2_imm (code, 1, 2, 20);
+	ia64_pshr4_imm (code, 1, 2, 20);
+	ia64_pshr2_u_imm (code, 1, 2, 20);
+	ia64_pshr4_u_imm (code, 1, 2, 20);
+
+	ia64_pshl2 (code, 1, 2, 3);
+	ia64_pshl4 (code, 1, 2, 3);
+	ia64_shl (code, 1, 2, 3);
+
+	ia64_pshl2_imm (code, 1, 2, 20);
+	ia64_pshl4_imm (code, 1, 2, 20);
+
+	ia64_popcnt (code, 1, 2);
+
+	ia64_shrp (code, 1, 2, 3, 62);
+
+	ia64_extr_u (code, 1, 2, 62, 61);
+	ia64_extr (code, 1, 2, 62, 61);
+
+	ia64_dep_z (code, 1, 2, 62, 61);
+
+	ia64_dep_z_imm (code, 1, 127, 62, 61);
+	ia64_dep_z_imm (code, 1, -128, 62, 61);
+	ia64_dep_imm (code, 1, 0, 2, 62, 61);
+	ia64_dep_imm (code, 1, -1, 2, 62, 61);
+	ia64_dep (code, 1, 2, 3, 10, 15);
+
+	ia64_tbit_z (code, 1, 2, 3, 0);
+
+	ia64_tbit_z (code, 1, 2, 3, 63);
+	ia64_tbit_z_unc (code, 1, 2, 3, 63);
+	ia64_tbit_z_and (code, 1, 2, 3, 63);
+	ia64_tbit_nz_and (code, 1, 2, 3, 63);
+	ia64_tbit_z_or (code, 1, 2, 3, 63);
+	ia64_tbit_nz_or (code, 1, 2, 3, 63);
+	ia64_tbit_z_or_andcm (code, 1, 2, 3, 63);
+	ia64_tbit_nz_or_andcm (code, 1, 2, 3, 63);
+
+	ia64_tnat_z (code, 1, 2, 3);
+	ia64_tnat_z_unc (code, 1, 2, 3);
+	ia64_tnat_z_and (code, 1, 2, 3);
+	ia64_tnat_nz_and (code, 1, 2, 3);
+	ia64_tnat_z_or (code, 1, 2, 3);
+	ia64_tnat_nz_or (code, 1, 2, 3);
+	ia64_tnat_z_or_andcm (code, 1, 2, 3);
+	ia64_tnat_nz_or_andcm (code, 1, 2, 3);
+
+	ia64_nop_i (code, 0x1234);
+	ia64_hint_i (code, 0x1234);
+
+	ia64_break_i (code, 0x1234);
+
+	ia64_chk_s_i (code, 1, 0);
+	ia64_chk_s_i (code, 1, -1);
+	ia64_chk_s_i (code, 1, 1);
+
+	ia64_mov_to_br_hint (code, 1, 1, -1, IA64_MOV_TO_BR_WH_NONE, 0);
+	ia64_mov_to_br_hint (code, 1, 1, -1, IA64_MOV_TO_BR_WH_SPTK, 0);
+	ia64_mov_to_br_hint (code, 1, 1, -1, IA64_MOV_TO_BR_WH_DPTK, 0);
+	ia64_mov_to_br_hint (code, 1, 1, -1, IA64_MOV_TO_BR_WH_DPTK, IA64_BR_IH_IMP);
+	ia64_mov_ret_to_br_hint (code, 1, 1, -1, IA64_MOV_TO_BR_WH_NONE, 0);
+
+	ia64_mov_from_br (code, 1, 1);
+
+	ia64_mov_to_pred (code, 1, 0xfe);
+
+	ia64_mov_to_pred_rot_imm (code, 0xff0000);
+
+	ia64_mov_from_ip (code, 1);
+	ia64_mov_from_pred (code, 1);
+
+	ia64_mov_to_ar_i (code, 1, 1);
+
+	ia64_mov_to_ar_imm_i (code, 1, 127);
+
+	ia64_mov_from_ar_i (code, 1, 1);
+
+	ia64_zxt1 (code, 1, 2);
+	ia64_zxt2 (code, 1, 2);
+	ia64_zxt4 (code, 1, 2);
+	ia64_sxt1 (code, 1, 2);
+	ia64_sxt2 (code, 1, 2);
+	ia64_sxt4 (code, 1, 2);
+
+	ia64_czx1_l (code, 1, 2);
+	ia64_czx2_l (code, 1, 2);
+	ia64_czx1_r (code, 1, 2);
+	ia64_czx2_r (code, 1, 2);
+
+	ia64_ld1_hint (code, 1, 2, IA64_LD_HINT_NONE);
+	ia64_ld1_hint (code, 1, 2, IA64_LD_HINT_NT1);
+	ia64_ld1_hint (code, 1, 2, IA64_LD_HINT_NTA);
+
+	ia64_ld1_hint (code, 1, 2, 0);
+	ia64_ld2_hint (code, 1, 2, 0);
+	ia64_ld4_hint (code, 1, 2, 0);
+	ia64_ld8_hint (code, 1, 2, 0);
+
+	ia64_ld1_s_hint (code, 1, 2, 0);
+	ia64_ld2_s_hint (code, 1, 2, 0);
+	ia64_ld4_s_hint (code, 1, 2, 0);
+	ia64_ld8_s_hint (code, 1, 2, 0);
+
+	ia64_ld1_a_hint (code, 1, 2, 0);
+	ia64_ld2_a_hint (code, 1, 2, 0);
+	ia64_ld4_a_hint (code, 1, 2, 0);
+	ia64_ld8_a_hint (code, 1, 2, 0);
+
+	ia64_ld1_sa_hint (code, 1, 2, 0);
+	ia64_ld2_sa_hint (code, 1, 2, 0);
+	ia64_ld4_sa_hint (code, 1, 2, 0);
+	ia64_ld8_sa_hint (code, 1, 2, 0);
+
+	ia64_ld1_bias_hint (code, 1, 2, 0);
+	ia64_ld2_bias_hint (code, 1, 2, 0);
+	ia64_ld4_bias_hint (code, 1, 2, 0);
+	ia64_ld8_bias_hint (code, 1, 2, 0);
+
+	ia64_ld1_inc_hint (code, 1, 2, 3, IA64_LD_HINT_NONE);
+
+	ia64_ld1_inc_imm_hint (code, 1, 2, 255, IA64_LD_HINT_NONE);
+	ia64_ld1_inc_imm_hint (code, 1, 2, -256, IA64_LD_HINT_NONE);
+
+	ia64_st1_hint (code, 1, 2, IA64_ST_HINT_NTA);
+
+	ia64_st1_hint (code, 1, 2, IA64_ST_HINT_NONE);
+	ia64_st2_hint (code, 1, 2, IA64_ST_HINT_NONE);
+	ia64_st4_hint (code, 1, 2, IA64_ST_HINT_NONE);
+	ia64_st8_hint (code, 1, 2, IA64_ST_HINT_NONE);
+
+	ia64_st1_rel_hint (code, 1, 2, IA64_ST_HINT_NONE);
+	ia64_st2_rel_hint (code, 1, 2, IA64_ST_HINT_NONE);
+	ia64_st4_rel_hint (code, 1, 2, IA64_ST_HINT_NONE);
+	ia64_st8_rel_hint (code, 1, 2, IA64_ST_HINT_NONE);
+
+	ia64_st8_spill_hint (code, 1, 2, IA64_ST_HINT_NONE);
+
+	ia64_st16_hint (code, 1, 2, IA64_ST_HINT_NONE);
+	ia64_st16_rel_hint (code, 1, 2, IA64_ST_HINT_NONE);
+
+	ia64_st1_inc_imm_hint (code, 1, 2, 255, IA64_ST_HINT_NONE);
+	ia64_st2_inc_imm_hint (code, 1, 2, 255, IA64_ST_HINT_NONE);
+	ia64_st4_inc_imm_hint (code, 1, 2, 255, IA64_ST_HINT_NONE);
+	ia64_st8_inc_imm_hint (code, 1, 2, 255, IA64_ST_HINT_NONE);
+
+	ia64_st1_rel_inc_imm_hint (code, 1, 2, 255, IA64_ST_HINT_NONE);
+	ia64_st2_rel_inc_imm_hint (code, 1, 2, 255, IA64_ST_HINT_NONE);
+	ia64_st4_rel_inc_imm_hint (code, 1, 2, 255, IA64_ST_HINT_NONE);
+	ia64_st8_rel_inc_imm_hint (code, 1, 2, 255, IA64_ST_HINT_NONE);
+
+	ia64_st8_spill_inc_imm_hint (code, 1, 2, 255, IA64_ST_HINT_NONE);
+
+	ia64_ldfs_hint (code, 1, 2, 0);
+	ia64_ldfd_hint (code, 1, 2, 0);
+	ia64_ldf8_hint (code, 1, 2, 0);
+	ia64_ldfe_hint (code, 1, 2, 0);
+
+	ia64_ldfs_s_hint (code, 1, 2, 0);
+	ia64_ldfd_s_hint (code, 1, 2, 0);
+	ia64_ldf8_s_hint (code, 1, 2, 0);
+	ia64_ldfe_s_hint (code, 1, 2, 0);
+
+	ia64_ldfs_a_hint (code, 1, 2, 0);
+	ia64_ldfd_a_hint (code, 1, 2, 0);
+	ia64_ldf8_a_hint (code, 1, 2, 0);
+	ia64_ldfe_a_hint (code, 1, 2, 0);
+
+	ia64_ldfs_sa_hint (code, 1, 2, 0);
+	ia64_ldfd_sa_hint (code, 1, 2, 0);
+	ia64_ldf8_sa_hint (code, 1, 2, 0);
+	ia64_ldfe_sa_hint (code, 1, 2, 0);
+
+	ia64_ldfs_c_clr_hint (code, 1, 2, 0);
+	ia64_ldfd_c_clr_hint (code, 1, 2, 0);
+	ia64_ldf8_c_clr_hint (code, 1, 2, 0);
+	ia64_ldfe_c_clr_hint (code, 1, 2, 0);
+
+	ia64_ldfs_c_nc_hint (code, 1, 2, 0);
+	ia64_ldfd_c_nc_hint (code, 1, 2, 0);
+	ia64_ldf8_c_nc_hint (code, 1, 2, 0);
+	ia64_ldfe_c_nc_hint (code, 1, 2, 0);
+
+	ia64_ldf_fill_hint (code, 1, 2, 0);
+
+	ia64_ldfs_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfd_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldf8_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfe_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfs_s_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfd_s_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldf8_s_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfe_s_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfs_a_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfd_a_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldf8_a_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfe_a_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfs_sa_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfd_sa_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldf8_sa_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfe_sa_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfs_c_clr_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfd_c_clr_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldf8_c_clr_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfe_c_clr_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfs_c_nc_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfd_c_nc_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldf8_c_nc_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfe_c_nc_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldf_fill_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfs_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfd_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldf8_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfe_inc_imm_hint (code, 1, 2, 255, 0);
+
+	ia64_ldfs_s_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfd_s_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldf8_s_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfe_s_inc_imm_hint (code, 1, 2, 255, 0);
+
+	ia64_ldfs_a_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfd_a_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldf8_a_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfe_a_inc_imm_hint (code, 1, 2, 255, 0);
+
+	ia64_ldfs_sa_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfd_sa_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldf8_sa_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfe_sa_inc_imm_hint (code, 1, 2, 255, 0);
+
+	ia64_ldfs_c_clr_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfd_c_clr_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldf8_c_clr_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfe_c_clr_inc_imm_hint (code, 1, 2, 255, 0);
+
+	ia64_ldfs_c_nc_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfd_c_nc_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldf8_c_nc_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_ldfe_c_nc_inc_imm_hint (code, 1, 2, 255, 0);
+
+	ia64_ldf_fill_inc_imm_hint (code, 1, 2, 255, 0);
+
+	ia64_stfs_hint (code, 1, 2, 0);
+	ia64_stfd_hint (code, 1, 2, 0);
+	ia64_stf8_hint (code, 1, 2, 0);
+	ia64_stfe_hint (code, 1, 2, 0);
+
+	ia64_stf_spill_hint (code, 1, 2, 0);
+
+	ia64_stfs_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_stfd_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_stf8_inc_imm_hint (code, 1, 2, 255, 0);
+	ia64_stfe_inc_imm_hint (code, 1, 2, 255, 0);
+
+	ia64_stf_spill_inc_imm_hint (code, 1, 2, 255, 0);
+
+	ia64_ldfps_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_s_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_s_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_s_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_a_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_a_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_a_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_sa_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_sa_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_sa_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_c_clr_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_c_clr_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_c_clr_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_c_nc_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_c_nc_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_c_nc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_s_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_s_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_s_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_a_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_a_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_a_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_sa_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_sa_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_sa_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_c_clr_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_c_clr_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_c_clr_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_ldfps_c_nc_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfpd_c_nc_inc_hint (code, 1, 2, 3, 0);
+	ia64_ldfp8_c_nc_inc_hint (code, 1, 2, 3, 0);
+
+	ia64_lfetch_hint (code, 1, 0);
+	ia64_lfetch_excl_hint (code, 1, 0);
+	ia64_lfetch_fault_hint (code, 1, 0);
+	ia64_lfetch_fault_excl_hint (code, 1, 0);
+
+	ia64_lfetch_hint (code, 1, IA64_LFHINT_NT1);
+	ia64_lfetch_hint (code, 1, IA64_LFHINT_NT2);
+	ia64_lfetch_hint (code, 1, IA64_LFHINT_NTA);
+
+	ia64_lfetch_inc_hint (code, 1, 2, 0);
+	ia64_lfetch_excl_inc_hint (code, 1, 2, 0);
+	ia64_lfetch_fault_inc_hint (code, 1, 2, 0);
+	ia64_lfetch_fault_excl_inc_hint (code, 1, 2, 0);
+
+	ia64_lfetch_inc_imm_hint (code, 1, 255, 0);
+	ia64_lfetch_excl_inc_imm_hint (code, 1, 255, 0);
+	ia64_lfetch_fault_inc_imm_hint (code, 1, 255, 0);
+	ia64_lfetch_fault_excl_inc_imm_hint (code, 1, 255, 0);
+
+	ia64_cmpxchg1_acq_hint (code, 1, 2, 3, 0);
+	ia64_cmpxchg2_acq_hint (code, 1, 2, 3, 0);
+	ia64_cmpxchg4_acq_hint (code, 1, 2, 3, 0);
+	ia64_cmpxchg8_acq_hint (code, 1, 2, 3, 0);
+	ia64_cmpxchg1_rel_hint (code, 1, 2, 3, 0);
+	ia64_cmpxchg2_rel_hint (code, 1, 2, 3, 0);
+	ia64_cmpxchg4_rel_hint (code, 1, 2, 3, 0);
+	ia64_cmpxchg8_rel_hint (code, 1, 2, 3, 0);
+	ia64_cmpxchg16_acq_hint (code, 1, 2, 3, 0);
+	ia64_cmpxchg16_rel_hint (code, 1, 2, 3, 0);
+	ia64_xchg1_hint (code, 1, 2, 3, 0);
+	ia64_xchg2_hint (code, 1, 2, 3, 0);
+	ia64_xchg4_hint (code, 1, 2, 3, 0);
+	ia64_xchg8_hint (code, 1, 2, 3, 0);
+
+	ia64_fetchadd4_acq_hint (code, 1, 2, -16, 0);
+	ia64_fetchadd4_acq_hint (code, 1, 2, -8, 0);
+	ia64_fetchadd4_acq_hint (code, 1, 2, -4, 0);
+	ia64_fetchadd4_acq_hint (code, 1, 2, -1, 0);
+	ia64_fetchadd4_acq_hint (code, 1, 2, 1, 0);
+	ia64_fetchadd4_acq_hint (code, 1, 2, 4, 0);
+	ia64_fetchadd4_acq_hint (code, 1, 2, 8, 0);
+	ia64_fetchadd4_acq_hint (code, 1, 2, 16, 0);
+
+	ia64_fetchadd4_acq_hint (code, 1, 2, 16, 0);
+	ia64_fetchadd8_acq_hint (code, 1, 2, 16, 0);
+	ia64_fetchadd4_rel_hint (code, 1, 2, 16, 0);
+	ia64_fetchadd8_rel_hint (code, 1, 2, 16, 0);
+
+	ia64_setf_sig (code, 1, 2);
+	ia64_setf_exp (code, 1, 2);
+	ia64_setf_s (code, 1, 2);
+	ia64_setf_d (code, 1, 2);
+
+	ia64_getf_sig (code, 1, 2);
+	ia64_getf_exp (code, 1, 2);
+	ia64_getf_s (code, 1, 2);
+	ia64_getf_d (code, 1, 2);
+
+	ia64_chk_s_m (code, 1, 0);
+	ia64_chk_s_m (code, 1, 1);
+	ia64_chk_s_m (code, 1, -1);
+
+	ia64_chk_s_float_m (code, 1, 0);
+
+	ia64_chk_a_nc (code, 1, 0);
+	ia64_chk_a_nc (code, 1, 1);
+	ia64_chk_a_nc (code, 1, -1);
+
+	ia64_chk_a_nc (code, 1, 0);
+	ia64_chk_a_clr (code, 1, 0);
+
+	ia64_chk_a_nc_float (code, 1, 0);
+	ia64_chk_a_clr_float (code, 1, 0);
+
+	ia64_invala (code);
+	ia64_fwb (code);
+	ia64_mf (code);
+	ia64_mf_a (code);
+	ia64_srlz_d (code);
+	ia64_stlz_i (code);
+	ia64_sync_i (code);
+
+	ia64_flushrs (code);
+	ia64_loadrs (code);
+
+	ia64_invala_e (code, 1);
+	ia64_invala_e_float (code, 1);
+
+	ia64_fc (code, 1);
+	ia64_fc_i (code, 1);
+
+	ia64_mov_to_ar_m (code, 1, 1);
+
+	ia64_mov_to_ar_imm_m (code, 1, 127);
+
+	ia64_mov_from_ar_m (code, 1, 1);
+
+	ia64_mov_to_cr (code, 1, 2);
+
+	ia64_mov_from_cr (code, 1, 2);
+
+	ia64_alloc (code, 1, 3, 4, 5, 0);
+	ia64_alloc (code, 1, 3, 4, 5, 8);
+
+	ia64_mov_to_psr_l (code, 1);
+	ia64_mov_to_psr_um (code, 1);
+
+	ia64_mov_from_psr (code, 1);
+	ia64_mov_from_psr_um (code, 1);
+
+	ia64_break_m (code, 0x1234);
+	ia64_nop_m (code, 0x1234);
+	ia64_hint_m (code, 0x1234);
+
+	ia64_br_cond_hint (code, 0, 0, 0, 0);
+	ia64_br_wexit_hint (code, 0, 0, 0, 0);
+	ia64_br_wtop_hint (code, 0, 0, 0, 0);
+
+	ia64_br_cloop_hint (code, 0, 0, 0, 0);
+	ia64_br_cexit_hint (code, 0, 0, 0, 0);
+	ia64_br_ctop_hint (code, 0, 0, 0, 0);
+
+	ia64_br_call_hint (code, 1, 0, 0, 0, 0);
+
+	ia64_br_cond_reg_hint (code, 1, 0, 0, 0);
+	ia64_br_ia_reg_hint (code, 1, 0, 0, 0);
+	ia64_br_ret_reg_hint (code, 1, 0, 0, 0);
+
+	ia64_br_call_reg_hint (code, 1, 2, 0, 0, 0);
+
+	ia64_cover (code);
+	ia64_clrrrb (code);
+	ia64_clrrrb_pr (code);
+	ia64_rfi (code);
+	ia64_bsw_0 (code);
+	ia64_bsw_1 (code);
+	ia64_epc (code);
+
+	ia64_break_b (code, 0x1234);
+	ia64_nop_b (code, 0x1234);
+	ia64_hint_b (code, 0x1234);
+
+	ia64_break_x (code, 0x2123456789ABCDEFULL);
+
+	ia64_movl (code, 1, 0x123456789ABCDEF0LL);
+
+	ia64_brl_cond_hint (code, 0, 0, 0, 0);
+	ia64_brl_cond_hint (code, -1, 0, 0, 0);
+
+	ia64_brl_call_hint (code, 1, 0, 0, 0, 0);
+	ia64_brl_call_hint (code, 1, -1, 0, 0, 0);
+
+	ia64_nop_x (code, 0x2123456789ABCDEFULL);
+	ia64_hint_x (code, 0x2123456789ABCDEFULL);
+
+	ia64_movl_pred (code, 1, 1, 0x123456789ABCDEF0LL);
+
+	/* FLOATING-POINT */
+	ia64_fma_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fma_s_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fma_d_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fpma_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fms_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fms_s_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fms_d_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fpms_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fnma_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fnma_s_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fnma_d_sf_pred (code, 1, 1, 2, 3, 4, 2);
+	ia64_fpnma_sf_pred (code, 1, 1, 2, 3, 4, 2);
+
+	ia64_xma_l_pred (code, 1, 1, 2, 3, 4);
+	ia64_xma_h_pred (code, 1, 1, 2, 3, 4);
+	ia64_xma_hu_pred (code, 1, 1, 2, 3, 4);
+
+	ia64_fselect_pred (code, 1, 1, 2, 3, 4);
+
+	ia64_fcmp_eq_sf_pred (code, 1, 1, 2, 3, 4, 0);
+	ia64_fcmp_lt_sf_pred (code, 1, 1, 2, 3, 4, 0);
+	ia64_fcmp_le_sf_pred (code, 1, 1, 2, 3, 4, 0);
+	ia64_fcmp_unord_sf_pred (code, 1, 1, 2, 3, 4, 0);
+	ia64_fcmp_eq_unc_sf_pred (code, 1, 1, 2, 3, 4, 0);
+	ia64_fcmp_lt_unc_sf_pred (code, 1, 1, 2, 3, 4, 0);
+	ia64_fcmp_le_unc_sf_pred (code, 1, 1, 2, 3, 4, 0);
+	ia64_fcmp_unord_unc_sf_pred (code, 1, 1, 2, 3, 4, 0);
+
+	ia64_fclass_m_pred (code, 1, 1, 2, 3, 0x1ff);
+	ia64_fclass_m_unc_pred (code, 1, 1, 2, 3, 0x1ff);
+
+	ia64_frcpa_sf_pred (code, 1, 1, 2, 3, 4, 0);
+	ia64_fprcpa_sf_pred (code, 1, 1, 2, 3, 4, 0);
+
+	ia64_frsqrta_sf_pred (code, 1, 1, 2, 4, 0);
+	ia64_fprsqrta_sf_pred (code, 1, 1, 2, 4, 0);
+
+	ia64_fmin_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fman_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_famin_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_famax_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpmin_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpman_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpamin_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpamax_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpcmp_eq_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpcmp_lt_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpcmp_le_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpcmp_unord_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpcmp_neq_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpcmp_nlt_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpcmp_nle_sf_pred (code, 1, 2, 3, 4, 0);
+	ia64_fpcmp_ord_sf_pred (code, 1, 2, 3, 4, 0);
+
+	ia64_fmerge_s_pred (code, 1, 2, 3, 4);
+	ia64_fmerge_ns_pred (code, 1, 2, 3, 4);
+	ia64_fmerge_se_pred (code, 1, 2, 3, 4);
+	ia64_fmix_lr_pred (code, 1, 2, 3, 4);
+	ia64_fmix_r_pred (code, 1, 2, 3, 4);
+	ia64_fmix_l_pred (code, 1, 2, 3, 4);
+	ia64_fsxt_r_pred (code, 1, 2, 3, 4);
+	ia64_fsxt_l_pred (code, 1, 2, 3, 4);
+	ia64_fpack_pred (code, 1, 2, 3, 4);
+	ia64_fswap_pred (code, 1, 2, 3, 4);
+	ia64_fswap_nl_pred (code, 1, 2, 3, 4);
+	ia64_fswap_nr_pred (code, 1, 2, 3, 4);
+	ia64_fand_pred (code, 1, 2, 3, 4);
+	ia64_fandcm_pred (code, 1, 2, 3, 4);
+	ia64_for_pred (code, 1, 2, 3, 4);
+	ia64_fxor_pred (code, 1, 2, 3, 4);
+	ia64_fpmerge_s_pred (code, 1, 2, 3, 4);
+	ia64_fpmerge_ns_pred (code, 1, 2, 3, 4);
+	ia64_fpmerge_se_pred (code, 1, 2, 3, 4);
+
+	ia64_fcvt_fx_sf_pred ((code), 1, 2, 3, 0);
+	ia64_fcvt_fxu_sf_pred ((code), 1, 2, 3, 0);
+	ia64_fcvt_fx_trunc_sf_pred ((code), 1, 2, 3, 0);
+	ia64_fcvt_fxu_trunc_sf_pred ((code), 1, 2, 3, 0);
+	ia64_fpcvt_fx_sf_pred ((code), 1, 2, 3, 0);
+	ia64_fpcvt_fxu_sf_pred ((code), 1, 2, 3, 0);
+	ia64_fpcvt_fx_trunc_sf_pred ((code), 1, 2, 3, 0);
+	ia64_fpcvt_fxu_trunc_sf_pred ((code), 1, 2, 3, 0);
+
+	ia64_fcvt_xf_pred ((code), 1, 2, 3);
+
+	ia64_fsetc_sf_pred ((code), 1, 0x33, 0x33, 3);
+
+	ia64_fclrf_sf_pred ((code), 1, 3);
+
+	ia64_fchkf_sf_pred ((code), 1, -1, 3);
+
+	ia64_break_f_pred ((code), 1, 0x1234);
+
+	ia64_movl (code, 31, -123456);
+
+	ia64_codegen_close (code);
+
+#if 0
+	/* disassembly */
+	{
+		guint8 *buf = code.buf;
+		int template;
+		guint64 dw1, dw2;
+		guint64 ins1, ins2, ins3;
+
+		ia64_break_i (code, 0x1234);
+
+		ia64_codegen_close (code);
+
+		dw1 = ((guint64*)buf) [0];
+		dw2 = ((guint64*)buf) [1];
+
+		template = ia64_bundle_template (buf);
+		ins1 = ia64_bundle_ins1 (buf);
+		ins2 = ia64_bundle_ins2 (buf);
+		ins3 = ia64_bundle_ins3 (buf);
+
+		code.buf = buf;
+		ia64_emit_bundle_template (&code, template, ins1, ins2, ins3);
+
+		g_assert (dw1 == ((guint64*)buf) [0]);
+		g_assert (dw2 == ((guint64*)buf) [1]);
+	}
+#endif
+
+	/*
+	 * Disassemble only what was emitted instead of the whole 40960-byte
+	 * scratch buffer, whose unused tail is just zero padding.
+	 * NOTE(review): relies on code.buf being the write cursor that the
+	 * emit macros advance (the disabled block above treats it that way);
+	 * confirm against ia64-codegen.h.
+	 */
+	mono_disassemble_code (buf, (int) (code.buf - buf), "code");
+
+	g_free (buf);
+
+	return 0;
+}
--- a/lib/ffts/src/arch/ia64/ia64-codegen.h
+++ b/lib/ffts/src/arch/ia64/ia64-codegen.h
--- a/lib/ffts/src/arch/mips/.gitignore
+++ b/lib/ffts/src/arch/mips/.gitignore
@ -0,0 +1,6 @@
+/
+/Makefile
+/Makefile.in
+/*.o
+/*.lo
+/.deps
--- a/lib/ffts/src/arch/mips/Makefile.am
+++ b/lib/ffts/src/arch/mips/Makefile.am
@ -0,0 +1,8 @@
+
+AM_CPPFLAGS = $(GLIB_CFLAGS) -I$(top_srcdir)
+
+noinst_LTLIBRARIES = libmonoarch-mips.la
+
+libmonoarch_mips_la_SOURCES = mips-codegen.h
+
+noinst_PROGRAMS = test
--- a/lib/ffts/src/arch/mips/mips-codegen.h
+++ b/lib/ffts/src/arch/mips/mips-codegen.h
@ -0,0 +1,435 @@
+#ifndef __MIPS_CODEGEN_H__
+#define __MIPS_CODEGEN_H__
+/*
+ * Copyright (c) 2004 Novell, Inc
+ * Author: Paolo Molaro (lupus@ximian.com)
+ *
+ */
+
+/*
+ * General-purpose register numbers, named after the MIPS calling
+ * convention. Registers 8-15 carry different names under the o32 ABI and
+ * under the n32/n64 ABIs, so that part of the list is selected by _MIPS_SIM.
+ */
+enum {
+	mips_zero,
+	mips_at, /* assembler temp */
+	mips_v0, /* return values */
+	mips_v1,
+	mips_a0, /* 4 - func arguments */
+	mips_a1,
+	mips_a2,
+	mips_a3,
+#if _MIPS_SIM == _ABIO32
+	mips_t0, /* 8 temporaries */
+	mips_t1,
+	mips_t2,
+	mips_t3,
+	mips_t4,
+	mips_t5,
+	mips_t6,
+	mips_t7,
+#elif _MIPS_SIM == _ABIN32 || _MIPS_SIM == _ABI64
+	/* n64 names registers 8-15 exactly like n32; without this case the
+	 * eight entries would be missing there and every register from
+	 * mips_s0 on would be numbered 8 too low */
+	mips_a4, /* 4 more argument registers */
+	mips_a5,
+	mips_a6,
+	mips_a7,
+	mips_t0, /* 4 temporaries */
+	mips_t1,
+	mips_t2,
+	mips_t3,
+#endif
+	mips_s0, /* 16 callee saved */
+	mips_s1,
+	mips_s2,
+	mips_s3,
+	mips_s4,
+	mips_s5,
+	mips_s6,
+	mips_s7,
+	mips_t8, /* 24 temps */
+	mips_t9, /* 25 temp / pic call-through register */
+	mips_k0, /* 26 kernel-reserved */
+	mips_k1,
+	mips_gp, /* 28 */
+	mips_sp, /* stack pointer */
+	mips_fp, /* frame pointer */
+	mips_ra /* return address */
+};
+
+/*
+ * Floating-point register numbers. The register file is treated as
+ * containing just doubles; the role notes mark where each group starts.
+ */
+enum {
+	mips_f0 = 0,	/* return regs */
+	mips_f1 = 1,
+	mips_f2 = 2,
+	mips_f3 = 3,
+	mips_f4 = 4,	/* temps */
+	mips_f5 = 5,
+	mips_f6 = 6,
+	mips_f7 = 7,
+	mips_f8 = 8,
+	mips_f9 = 9,
+	mips_f10 = 10,
+	mips_f11 = 11,
+	mips_f12 = 12,	/* first arg */
+	mips_f13 = 13,
+	mips_f14 = 14,	/* second arg */
+	mips_f15 = 15,
+	mips_f16 = 16,	/* temps */
+	mips_f17 = 17,
+	mips_f18 = 18,
+	mips_f19 = 19,
+	mips_f20 = 20,	/* callee saved */
+	mips_f21 = 21,
+	mips_f22 = 22,
+	mips_f23 = 23,
+	mips_f24 = 24,
+	mips_f25 = 25,
+	mips_f26 = 26,
+	mips_f27 = 27,
+	mips_f28 = 28,
+	mips_f29 = 29,
+	mips_f30 = 30,
+	mips_f31 = 31
+};
+
+/* Prefetch hint codes; values 2 and 3 are not assigned a name here. */
+enum {
+	MIPS_FOR_LOAD = 0,
+	MIPS_FOR_STORE = 1,
+	MIPS_FOR_LOAD_STREAMED = 4,
+	MIPS_FOR_STORE_STREAMED = 5,
+	MIPS_FOR_LOAD_RETAINED = 6,
+	MIPS_FOR_STORE_RETAINED = 7
+};
+
+/* Coprocessor numbers. */
+enum {
+	MIPS_COP0 = 0,
+	MIPS_COP1 = 1,
+	MIPS_COP2 = 2,
+	MIPS_COP3 = 3
+};
+
+enum { /* NOTE(review): presumably the operand-format field encodings of
+	* the FPU instructions, with the FMT3 pair serving a narrower field;
+	* the users are outside this part of the file -- confirm there */
+	MIPS_FMT_SINGLE = 16,
+	MIPS_FMT_DOUBLE = 17,
+	MIPS_FMT_WORD = 20,
+	MIPS_FMT_LONG = 21,
+	MIPS_FMT3_SINGLE = 0,
+	MIPS_FMT3_DOUBLE = 1
+};
+
+/* FPU rounding modes, plus the mask that covers all four mode values. */
+enum {
+	MIPS_ROUND_TO_NEAREST = 0,
+	MIPS_ROUND_TO_ZERO = 1,
+	MIPS_ROUND_TO_POSINF = 2,
+	MIPS_ROUND_TO_NEGINF = 3,
+	MIPS_ROUND_MASK = 3
+};
+
+/*
+ * FPU enable/cause flags and the condition bit: the C mask, one bit per
+ * exception kind, and three *_OFFSET constants.
+ */
+enum {
+	MIPS_FPU_C_MASK = 1 << 23,
+	MIPS_INEXACT = 1 << 0,
+	MIPS_UNDERFLOW = 1 << 1,
+	MIPS_OVERFLOW = 1 << 2,
+	MIPS_DIVZERO = 1 << 3,
+	MIPS_INVALID = 1 << 4,
+	MIPS_NOTIMPL = 1 << 5,
+	MIPS_FPU_FLAGS_OFFSET = 2,
+	MIPS_FPU_ENABLES_OFFSET = 7,
+	MIPS_FPU_CAUSES_OFFSET = 12
+};
+
+/* FPU condition values - see the manual entry for the C.cond.fmt instructions */
+enum {
+	MIPS_FPU_F = 0,
+	MIPS_FPU_UN = 1,
+	MIPS_FPU_EQ = 2,
+	MIPS_FPU_UEQ = 3,
+	MIPS_FPU_OLT = 4,
+	MIPS_FPU_ULT = 5,
+	MIPS_FPU_OLE = 6,
+	MIPS_FPU_ULE = 7,
+	MIPS_FPU_SF = 8,
+	MIPS_FPU_NGLE = 9,
+	MIPS_FPU_SEQ = 10,
+	MIPS_FPU_NGL = 11,
+	MIPS_FPU_LT = 12,
+	MIPS_FPU_NGE = 13,
+	MIPS_FPU_LE = 14,
+	MIPS_FPU_NGT = 15
+};
+
+#if SIZEOF_REGISTER == 4 /* register-sized aliases: the MIPS_* names map to the word (32-bit) macros */
+
+#define MIPS_SW		mips_sw
+#define MIPS_LW		mips_lw
+#define MIPS_ADDU	mips_addu
+#define MIPS_ADDIU	mips_addiu
+#define MIPS_SWC1	mips_swc1
+#define MIPS_LWC1	mips_lwc1
+#define MIPS_MOVE	mips_move
+
+#elif SIZEOF_REGISTER == 8 /* same aliases, mapped to the doubleword (d-prefixed) macros */
+
+#define MIPS_SW		mips_sd
+#define MIPS_LW		mips_ld
+#define MIPS_ADDU	mips_daddu
+#define MIPS_ADDIU	mips_daddiu
+#define MIPS_SWC1	mips_sdc1
+#define MIPS_LWC1	mips_ldc1
+#define MIPS_MOVE	mips_dmove
+
+#else /* SIZEOF_REGISTER must be defined as 4 or 8 before this header is included */
+#error Unknown SIZEOF_REGISTER
+#endif
+
+/*
+ * Store the 32-bit word x at the location c points to, then advance c by
+ * four bytes. c must be a pointer lvalue and keeps its own type.
+ * x is parenthesized so that any expression can be passed safely, and
+ * __typeof__ is used because plain typeof is unavailable under strict ISO
+ * compiler modes.
+ */
+#define mips_emit32(c,x) do {				\
+		*((guint32 *) (void *)(c)) = (x);			\
+		(c) = (__typeof__(c))(((guint32 *)(void *)(c)) + 1);	\
+	} while (0)
+
+/*
+ * Instruction word builders: each packs its fields into one 32-bit word and
+ * emits it with mips_emit32. The opcode occupies the top six bits, so it is
+ * converted to guint32 before shifting; as a plain int, any opcode of 32 or
+ * more would be shifted into the sign bit.
+ */
+#define mips_format_i(code,op,rs,rt,imm) mips_emit32 ((code), (((guint32)(op)<<26)|((rs)<<21)|((rt)<<16)|((imm)&0xffff)))
+#define mips_format_j(code,op,imm) mips_emit32 ((code), (((guint32)(op)<<26)|((imm)&0x03ffffff)))
+#define mips_format_r(code,op,rs,rt,rd,sa,func) mips_emit32 ((code), (((guint32)(op)<<26)|((rs)<<21)|((rt)<<16)|((rd)<<11)|((sa)<<6)|(func)))
+#define mips_format_divmul(code,op,src1,src2,fun) mips_emit32 ((code), (((guint32)(op)<<26)|((src1)<<21)|((src2)<<16)|(fun)))
+
+#define mips_is_imm16(val) ((gint)(gshort)(gint)(val) == (gint)(val)) /* nonzero when val, narrowed to gint, survives a round trip through gshort */
+
+/* Load always using lui/addiu pair (for later patching).
+ *
+ * c: code pointer, D: destination register, v: value, used as guint32.
+ * The upper half passed to mips_lui is bumped by one when bit 15 of v is
+ * set -- presumably to cancel the sign extension addiu applies to its
+ * 16-bit immediate (mips_lui is defined outside this part of the file;
+ * confirm there). v is expanded several times, so it must be free of side
+ * effects.
+ */
+#define mips_load(c,D,v) do {	\
+		if (((guint32)(v)) & (1 << 15)) {								\
+			mips_lui ((c), (D), mips_zero, (((guint32)(v))>>16)+1);		\
+		}																\
+		else {															\
+			mips_lui ((c), (D), mips_zero, (((guint32)(v))>>16));		\
+		}																\
+		mips_addiu ((c), (D), (D), ((guint32)(v)) & 0xffff);			\
+	} while (0)
+
+/* load constant - no patch-up
+ *
+ * Emits the shortest sequence for v: a single addiu from mips_zero when v
+ * passes mips_is_imm16, otherwise a lui followed by an addiu that is left
+ * out when the low 16 bits of v are zero. Because the length varies, the
+ * result is not meant to be patched afterwards. v is expanded several
+ * times, so it must be free of side effects.
+ */
+#define mips_load_const(c,D,v) do {	\
+		if (!mips_is_imm16 ((v)))	{	\
+			if (((guint32)(v)) & (1 << 15)) {		\
+				mips_lui ((c), (D), mips_zero, (((guint32)(v))>>16)+1); \
+			} \
+			else {			\
+				mips_lui ((c), (D), mips_zero, (((guint32)(v))>>16)); \
+			}						\
+			if (((guint32)(v)) & 0xffff) \
+				mips_addiu ((c), (D), (D), ((guint32)(v)) & 0xffff); \
+		}							\
+		else							\
+			mips_addiu ((c), (D), mips_zero, ((guint32)(v)) & 0xffff); \
+	} while (0)
+
+/* arithmetric ops */
+#define mips_add(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,32)
+#define mips_addi(c,dest,src1,imm) mips_format_i(c,8,src1,dest,imm)
+#define mips_addu(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,33)
+#define mips_addiu(c,dest,src1,imm) mips_format_i(c,9,src1,dest,imm)
+#define mips_dadd(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,44)
+#define mips_daddi(c,dest,src1,imm) mips_format_i(c,24,src1,dest,imm)
+#define mips_daddu(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,45)
+#define mips_daddiu(c,dest,src1,imm) mips_format_i(c,25,src1,dest,imm)
+#define mips_dsub(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,46)
+#define mips_dsubu(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,47)
+#define mips_mul(c,dest,src1,src2) mips_format_r(c,28,src1,src2,dest,0,2)
+#define mips_sub(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,34)
+#define mips_subu(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,35)
+
+/* div and mul ops */
+#define mips_ddiv(c,src1,src2) mips_format_divmul(c,0,src1,src2,30)
+#define mips_ddivu(c,src1,src2) mips_format_divmul(c,0,src1,src2,31)
+#define mips_div(c,src1,src2) mips_format_divmul(c,0,src1,src2,26)
+#define mips_divu(c,src1,src2) mips_format_divmul(c,0,src1,src2,27)
+#define mips_dmult(c,src1,src2) mips_format_divmul(c,0,src1,src2,28)
+#define mips_dmultu(c,src1,src2) mips_format_divmul(c,0,src1,src2,29)
+#define mips_mult(c,src1,src2) mips_format_divmul(c,0,src1,src2,24)
+#define mips_multu(c,src1,src2) mips_format_divmul(c,0,src1,src2,25)
+
+/* shift ops */
+#define mips_dsll(c,dest,src1,imm) mips_format_r(c,0,0,src1,dest,imm,56)
+#define mips_dsll32(c,dest,src1,imm) mips_format_r(c,0,0,src1,dest,imm,60)
+#define mips_dsllv(c,dest,src1,src2) mips_format_r(c,0,src2,src1,dest,0,20)
+#define mips_dsra(c,dest,src1,imm) mips_format_r(c,0,0,src1,dest,imm,59)
+#define mips_dsra32(c,dest,src1,imm) mips_format_r(c,0,0,src1,dest,imm,63)
+#define mips_dsrav(c,dest,src1,src2) mips_format_r(c,0,src2,src1,dest,0,23)
+#define mips_dsrl(c,dest,src1,imm) mips_format_r(c,0,0,src1,dest,imm,58)
+#define mips_dsrl32(c,dest,src1,imm) mips_format_r(c,0,0,src1,dest,imm,62)
+#define mips_dsrlv(c,dest,src1,src2) mips_format_r(c,0,src2,src1,dest,0,22)
+#define mips_sll(c,dest,src1,imm) mips_format_r(c,0,0,src1,dest,imm,0)
+#define mips_sllv(c,dest,src1,src2) mips_format_r(c,0,src2,src1,dest,0,4)
+#define mips_sra(c,dest,src1,imm) mips_format_r(c,0,0,src1,dest,imm,3)
+#define mips_srav(c,dest,src1,src2) mips_format_r(c,0,src2,src1,dest,0,7)
+#define mips_srl(c,dest,src1,imm) mips_format_r(c,0,0,src1,dest,imm,2)
+#define mips_srlv(c,dest,src1,src2) mips_format_r(c,0,src2,src1,dest,0,6)
+
+/* logical ops */
+#define mips_and(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,36)
+#define mips_andi(c,dest,src1,imm) mips_format_i(c,12,src1,dest,imm)
+#define mips_nor(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,39)
+#define mips_or(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,37)
+#define mips_ori(c,dest,src1,uimm) mips_format_i(c,13,src1,dest,uimm)
+#define mips_xor(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,38)
+#define mips_xori(c,dest,src1,uimm) mips_format_i(c,14,src1,dest,uimm)
+
+/* compares */
+#define mips_slt(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,42)
+#define mips_slti(c,dest,src1,imm) mips_format_i(c,10,src1,dest,imm)
+#define mips_sltiu(c,dest,src1,imm) mips_format_i(c,11,src1,dest,imm)
+#define mips_sltu(c,dest,src1,src2) mips_format_r(c,0,src1,src2,dest,0,43)
+/* missing traps: teq, teqi, tge, tgei, tgeiu, tgeu, tlt, tlti, tltiu, tltu, tne, tnei, */
+
+/* conditional branches */
+#define mips_beq(c,src1,src2,offset) mips_format_i(c,4,src1,src2,offset)
+#define mips_beql(c,src1,src2,offset) mips_format_i(c,20,src1,src2,offset)
+#define mips_bgez(c,src1,offset) mips_format_i(c,1,src1,1,offset)
+#define mips_bgezal(c,src1,offset) mips_format_i(c,1,src1,17,offset)
+#define mips_bgezall(c,src1,offset) mips_format_i(c,1,src1,19,offset)
+#define mips_bgezl(c,src1,offset) mips_format_i(c,1,src1,3,offset)
+#define mips_bgtz(c,src1,offset) mips_format_i(c,7,src1,0,offset)
+#define mips_bgtzl(c,src1,offset) mips_format_i(c,23,src1,0,offset)
+#define mips_blez(c,src1,offset) mips_format_i(c,6,src1,0,offset)
+#define mips_blezl(c,src1,offset) mips_format_i(c,22,src1,0,offset)
+#define mips_bltz(c,src1,offset) mips_format_i(c,1,src1,0,offset)
+#define mips_bltzal(c,src1,offset) mips_format_i(c,1,src1,16,offset)
+#define mips_bltzall(c,src1,offset) mips_format_i(c,1,src1,18,offset)
+#define mips_bltzl(c,src1,offset) mips_format_i(c,1,src1,2,offset)
+#define mips_bne(c,src1,src2,offset) mips_format_i(c,5,src1,src2,offset)
+#define mips_bnel(c,src1,src2,offset) mips_format_i(c,21,src1,src2,offset)
+
+/* uncond branches and calls */
+#define mips_jump(c,target) mips_format_j(c,2,target)
+#define mips_jumpl(c,target) mips_format_j(c,3,target)
+#define mips_jalr(c,src1,retreg) mips_format_r(c,0,src1,0,retreg,0,9)
+#define mips_jr(c,src1) mips_emit32(c,((src1)<<21)|8)
+
+/* loads and stores */
+#define mips_lb(c,dest,base,offset) mips_format_i(c,32,base,dest,offset)
+#define mips_lbu(c,dest,base,offset) mips_format_i(c,36,base,dest,offset)
+#define mips_ld(c,dest,base,offset) mips_format_i(c,55,base,dest,offset)
+#define mips_ldl(c,dest,base,offset) mips_format_i(c,26,base,dest,offset)
+#define mips_ldr(c,dest,base,offset) mips_format_i(c,27,base,dest,offset)
+#define mips_lh(c,dest,base,offset) mips_format_i(c,33,base,dest,offset)
+#define mips_lhu(c,dest,base,offset) mips_format_i(c,37,base,dest,offset)
+#define mips_ll(c,dest,base,offset) mips_format_i(c,48,base,dest,offset)
+#define mips_lld(c,dest,base,offset) mips_format_i(c,52,base,dest,offset)
+#define mips_lui(c,dest,base,uimm) mips_format_i(c,15,base,dest,uimm)
+#define mips_lw(c,dest,base,offset) mips_format_i(c,35,base,dest,offset)
+#define mips_lwl(c,dest,base,offset) mips_format_i(c,34,base,dest,offset)
+#define mips_lwr(c,dest,base,offset) mips_format_i(c,38,base,dest,offset)
+#define mips_lwu(c,dest,base,offset) mips_format_i(c,39,base,dest,offset)
+
+/*
+ * Integer stores: mips_sX(c,src,base,offset) emits an I-format store of
+ * register src to base + offset. The second argument of mips_format_i is
+ * the primary opcode placed in bits 31..26.
+ * swl/swr are opcodes 42 (0x2a) and 46 (0x2e); the values 50 and 54 used
+ * previously encode the coprocessor-2 loads lwc2/ldc2 instead.
+ */
+#define mips_sb(c,src,base,offset) mips_format_i(c,40,base,src,offset)
+#define mips_sc(c,src,base,offset) mips_format_i(c,56,base,src,offset)
+#define mips_scd(c,src,base,offset) mips_format_i(c,60,base,src,offset)
+#define mips_sd(c,src,base,offset) mips_format_i(c,63,base,src,offset)
+#define mips_sdl(c,src,base,offset) mips_format_i(c,44,base,src,offset)
+#define mips_sdr(c,src,base,offset) mips_format_i(c,45,base,src,offset)
+#define mips_sh(c,src,base,offset) mips_format_i(c,41,base,src,offset)
+#define mips_sw(c,src,base,offset) mips_format_i(c,43,base,src,offset)
+#define mips_swl(c,src,base,offset) mips_format_i(c,42,base,src,offset)
+#define mips_swr(c,src,base,offset) mips_format_i(c,46,base,src,offset)
+
+/* misc and coprocessor ops */
+#define mips_move(c,dest,src) mips_addu(c,dest,src,mips_zero)
+#define mips_dmove(c,dest,src) mips_daddu(c,dest,src,mips_zero)
+#define mips_nop(c) mips_or(c,mips_at,mips_at,0)
+#define mips_break(c,code) mips_emit32(c, ((code)<<6)|13)
+#define mips_mfhi(c,dest) mips_format_r(c,0,0,0,dest,0,16)
+#define mips_mflo(c,dest) mips_format_r(c,0,0,0,dest,0,18)
+#define mips_mthi(c,src) mips_format_r(c,0,src,0,0,0,17)
+#define mips_mtlo(c,src) mips_format_r(c,0,src,0,0,0,19)
+#define mips_movn(c,dest,src,test) mips_format_r(c,0,src,test,dest,0,11)
+#define mips_movz(c,dest,src,test) mips_format_r(c,0,src,test,dest,0,10)
+#define mips_pref(c,hint,base,offset) mips_format_i(c,51,base,hint,offset)
+#define mips_prefidx(c,hint,base,idx) mips_format_r(c,19,base,idx,hint,0,15)
+#define mips_sync(c,stype) mips_emit32(c, ((stype)<<6)|15)
+#define mips_syscall(c,code) mips_emit32(c, ((code)<<6)|12)
+
+#define mips_cop(c,cop,fun) mips_emit32(c, ((16|(cop))<<26)|(fun))
+#define mips_ldc(c,cop,dest,base,offset) mips_format_i(c,(52|(cop)),base,dest,offset)
+#define mips_lwc(c,cop,dest,base,offset) mips_format_i(c,(48|(cop)),base,dest,offset)
+#define mips_sdc(c,cop,src,base,offset) mips_format_i(c,(60|(cop)),base,src,offset)
+#define mips_swc(c,cop,src,base,offset) mips_format_i(c,(56|(cop)),base,src,offset)
+#define mips_cfc1(c,dest,src) mips_format_r(c,17,2,dest,src,0,0)
+#define mips_ctc1(c,dest,src) mips_format_r(c,17,6,dest,src,0,0)
+
+/* fpu ops */
+#define mips_fabss(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,5)
+#define mips_fabsd(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,5)
+#define mips_fadds(c,dest,src1,src2) mips_format_r(c,17,MIPS_FMT_SINGLE,src2,src1,dest,0)
+#define mips_faddd(c,dest,src1,src2) mips_format_r(c,17,MIPS_FMT_DOUBLE,src2,src1,dest,0)
+#define mips_fdivs(c,dest,src1,src2) mips_format_r(c,17,MIPS_FMT_SINGLE,src2,src1,dest,3)
+#define mips_fdivd(c,dest,src1,src2) mips_format_r(c,17,MIPS_FMT_DOUBLE,src2,src1,dest,3)
+#define mips_fmuls(c,dest,src1,src2) mips_format_r(c,17,MIPS_FMT_SINGLE,src2,src1,dest,2)
+#define mips_fmuld(c,dest,src1,src2) mips_format_r(c,17,MIPS_FMT_DOUBLE,src2,src1,dest,2)
+#define mips_fnegs(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,7)
+#define mips_fnegd(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,7)
+#define mips_fsqrts(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,4)
+#define mips_fsqrtd(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,4)
+#define mips_fsubs(c,dest,src1,src2) mips_format_r(c,17,MIPS_FMT_SINGLE,src2,src1,dest,1)
+#define mips_fsubd(c,dest,src1,src2) mips_format_r(c,17,MIPS_FMT_DOUBLE,src2,src1,dest,1)
+#define mips_madds(c,dest,src1,src2,srcadd) mips_format_r(c,19,srcadd,src2,src1,dest,32|MIPS_FMT_SINGLE)
+#define mips_maddd(c,dest,src1,src2,srcadd) mips_format_r(c,19,srcadd,src2,src1,dest,32|MIPS_FMT_DOUBLE)
+#define mips_nmadds(c,dest,src1,src2,srcadd) mips_format_r(c,19,srcadd,src2,src1,dest,48|MIPS_FMT_SINGLE)
+#define mips_nmaddd(c,dest,src1,src2,srcadd) mips_format_r(c,19,srcadd,src2,src1,dest,48|MIPS_FMT_DOUBLE)
+#define mips_msubs(c,dest,src1,src2,srcsub) mips_format_r(c,19,srcsub,src2,src1,dest,40|MIPS_FMT_SINGLE)
+#define mips_msubd(c,dest,src1,src2,srcsub) mips_format_r(c,19,srcsub,src2,src1,dest,40|MIPS_FMT_DOUBLE)
+#define mips_nmsubs(c,dest,src1,src2,srcsub) mips_format_r(c,19,srcsub,src2,src1,dest,56|MIPS_FMT_SINGLE)
+#define mips_nmsubd(c,dest,src1,src2,srcsub) mips_format_r(c,19,srcsub,src2,src1,dest,56|MIPS_FMT_DOUBLE)
+
+/* fp compare and branch */
+#define mips_fcmps(c,cond,src1,src2) mips_format_r(c,17,MIPS_FMT_SINGLE,src2,src1,0,(3<<4)|(cond))
+#define mips_fcmpd(c,cond,src1,src2) mips_format_r(c,17,MIPS_FMT_DOUBLE,src2,src1,0,(3<<4)|(cond))
+#define mips_fbfalse(c,offset) mips_format_i(c,17,8,0,offset)
+#define mips_fbfalsel(c,offset) mips_format_i(c,17,8,2,offset)
+#define mips_fbtrue(c,offset) mips_format_i(c,17,8,1,offset)
+#define mips_fbtruel(c,offset) mips_format_i(c,17,8,3,offset)
+
+/* fp convert */
+#define mips_ceills(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,10)
+#define mips_ceilld(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,10)
+#define mips_ceilws(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,14)
+#define mips_ceilwd(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,14)
+#define mips_cvtds(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,33)
+#define mips_cvtdw(c,dest,src) mips_format_r(c,17,MIPS_FMT_WORD,0,src,dest,33)
+#define mips_cvtdl(c,dest,src) mips_format_r(c,17,MIPS_FMT_LONG,0,src,dest,33)
+#define mips_cvtls(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,37)
+#define mips_cvtld(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,37)
+#define mips_cvtsd(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,32)
+#define mips_cvtsw(c,dest,src) mips_format_r(c,17,MIPS_FMT_WORD,0,src,dest,32)
+#define mips_cvtsl(c,dest,src) mips_format_r(c,17,MIPS_FMT_LONG,0,src,dest,32)
+#define mips_cvtws(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,36)
+#define mips_cvtwd(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,36)
+#define mips_floorls(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,11)
+#define mips_floorld(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,11)
+#define mips_floorws(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,15)
+#define mips_floorwd(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,15)
+#define mips_roundls(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,8)
+#define mips_roundld(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,8)
+#define mips_roundws(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,12)
+#define mips_roundwd(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,12)
+#define mips_truncls(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,9)
+#define mips_truncld(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,9)
+#define mips_truncws(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,13)
+#define mips_truncwd(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,13)
+
+/*
+ * fp moves, loads
+ *
+ * Register moves between the integer and floating point files are COP1
+ * (opcode 17) instructions laid out as: rs = sub-opcode, rt = integer
+ * register, rd = fp register, sa = 0, func = 0.
+ *   mfc1/dmfc1 (c,dest,src): dest is the integer register, src the fp one.
+ *   mtc1/dmtc1 (c,dest,src): dest is the fp register, src the integer one.
+ * Sub-opcodes: mfc1 = 0, dmfc1 = 1, mtc1 = 4, dmtc1 = 5. The doubleword
+ * variants follow the same field layout as the word variants.
+ *
+ * The indexed forms (ldxc1, lwxc1, sdxc1, swxc1) are COP1X (opcode 19)
+ * instructions addressing base + idx.
+ */
+#define mips_fmovs(c,dest,src) mips_format_r(c,17,MIPS_FMT_SINGLE,0,src,dest,6)
+#define mips_fmovd(c,dest,src) mips_format_r(c,17,MIPS_FMT_DOUBLE,0,src,dest,6)
+#define mips_mfc1(c,dest,src) mips_format_r(c,17,0,dest,src,0,0)
+#define mips_mtc1(c,dest,src) mips_format_r(c,17,4,src,dest,0,0)
+#define mips_dmfc1(c,dest,src) mips_format_r(c,17,1,dest,src,0,0)
+#define mips_dmtc1(c,dest,src) mips_format_r(c,17,5,src,dest,0,0)
+#define mips_ldc1(c,dest,base,offset) mips_ldc(c,1,dest,base,offset)
+#define mips_ldxc1(c,dest,base,idx) mips_format_r(c,19,base,idx,0,dest,1)
+#define mips_lwc1(c,dest,base,offset) mips_lwc(c,1,dest,base,offset)
+#define mips_lwxc1(c,dest,base,idx) mips_format_r(c,19,base,idx,0,dest,0)
+#define mips_sdc1(c,src,base,offset) mips_sdc(c,1,src,base,offset)
+#define mips_sdxc1(c,src,base,idx) mips_format_r(c,19,base,idx,src,0,9)
+#define mips_swc1(c,src,base,offset) mips_swc(c,1,src,base,offset)
+#define mips_swxc1(c,src,base,idx) mips_format_r(c,19,base,idx,src,0,8)
+
+#endif /* __MIPS_CODEGEN_H__ */
+
--- a/lib/ffts/src/arch/mips/test.c
+++ b/lib/ffts/src/arch/mips/test.c
@ -0,0 +1,159 @@
+#include "config.h"
+#include <stdlib.h>
+#include <string.h>
+
+#define NO_MIPS_JIT_DEBUG
+
+#include "mips-codegen.h"
+#include "mono/metadata/class.h"
+
+/* don't run the resulting program, it will destroy your computer,
+ * just objdump -d it to inspect we generated the correct assembler.
+ */
+
+/*
+ * Emits one instance of each instruction macro into a heap buffer and
+ * prints how many 32-bit words were written. The buffer holds 1024
+ * words. Returns 0 on success, 1 if the buffer cannot be allocated.
+ */
+int main (int argc, char *argv[]) {
+	guint32 *code, * p;
+
+	code = p = (guint32 *) malloc (sizeof (guint32) * 1024);
+	if (code == NULL)
+		return 1;
+
+	mips_add (p, 3, 4, 5);
+	mips_addi (p, 3, 4, 5);
+	mips_addu (p, 3, 4, 5);
+	mips_addiu (p, 3, 4, 5);
+	mips_sub (p, 3, 4, 5);
+	mips_subu (p, 3, 4, 5);
+	mips_dadd (p, 3, 4, 5);
+	mips_daddi (p, 3, 4, 5);
+	mips_daddu (p, 3, 4, 5);
+	mips_daddiu (p, 3, 4, 5);
+	mips_dsub (p, 3, 4, 5);
+	mips_dsubu (p, 3, 4, 5);
+
+	mips_mult (p, 6, 7);
+	mips_multu (p, 6, 7);
+	mips_div (p, 6, 7);
+	mips_divu (p, 6, 7);
+	mips_dmult (p, 6, 7);
+	mips_dmultu (p, 6, 7);
+	mips_ddiv (p, 6, 7);
+	mips_ddivu (p, 6, 7);
+
+	mips_sll (p, 3, 4, 5);
+	mips_sllv (p, 3, 4, 5);
+	mips_sra (p, 3, 4, 5);
+	mips_srav (p, 3, 4, 5);
+	mips_srl (p, 3, 4, 5);
+	mips_srlv (p, 3, 4, 5);
+	mips_dsll (p, 3, 4, 5);
+	mips_dsll32 (p, 3, 4, 5);
+	mips_dsllv (p, 3, 4, 5);
+	mips_dsra (p, 3, 4, 5);
+	mips_dsra32 (p, 3, 4, 5);
+	mips_dsrav (p, 3, 4, 5);
+	mips_dsrl (p, 3, 4, 5);
+	mips_dsrl32 (p, 3, 4, 5);
+	mips_dsrlv (p, 3, 4, 5);
+
+	mips_and (p, 8, 9, 10);
+	mips_andi (p, 8, 9, 10);
+	mips_nor (p, 8, 9, 10);
+	mips_or (p, 8, 9, 10);
+	mips_ori (p, 8, 9, 10);
+	mips_xor (p, 8, 9, 10);
+	mips_xori (p, 8, 9, 10);
+
+	mips_slt (p, 8, 9, 10);
+	mips_slti (p, 8, 9, 10);
+	mips_sltu (p, 8, 9, 10);
+	mips_sltiu (p, 8, 9, 10);
+
+	mips_beq (p, 8, 9, 0xff1f);
+	mips_beql (p, 8, 9, 0xff1f);
+	mips_bne (p, 8, 9, 0xff1f);
+	mips_bnel (p, 8, 9, 0xff1f);
+	mips_bgez (p, 11, 0xff1f);
+	mips_bgezal (p, 11, 0xff1f);
+	mips_bgezall (p, 11, 0xff1f);
+	mips_bgezl (p, 11, 0xff1f);
+	mips_bgtz (p, 11, 0xff1f);
+	mips_bgtzl (p, 11, 0xff1f);
+	mips_blez (p, 11, 0xff1f);
+	mips_blezl (p, 11, 0xff1f);
+	mips_bltz (p, 11, 0xff1f);
+	mips_bltzal (p, 11, 0xff1f);
+	mips_bltzall (p, 11, 0xff1f);
+	mips_bltzl (p, 11, 0xff1f);
+
+	mips_jump (p, 0xff1f);
+	mips_jumpl (p, 0xff1f);
+	mips_jalr (p, 12, mips_ra);
+	mips_jr (p, 12);
+
+	mips_lb (p, 13, 14, 128);
+	mips_lbu (p, 13, 14, 128);
+	mips_ld (p, 13, 14, 128);
+	mips_ldl (p, 13, 14, 128);
+	mips_ldr (p, 13, 14, 128);
+	mips_lh (p, 13, 14, 128);
+	mips_lhu (p, 13, 14, 128);
+	mips_ll (p, 13, 14, 128);
+	mips_lld (p, 13, 14, 128);
+	mips_lui (p, 13, 14, 128);
+	mips_lw (p, 13, 14, 128);
+	mips_lwl (p, 13, 14, 128);
+	mips_lwr (p, 13, 14, 128);
+	mips_lwu (p, 13, 14, 128);
+	mips_sb (p, 13, 14, 128);
+	mips_sc (p, 13, 14, 128);
+	mips_scd (p, 13, 14, 128);
+	mips_sd (p, 13, 14, 128);
+	mips_sdl (p, 13, 14, 128);
+	mips_sdr (p, 13, 14, 128);
+	mips_sh (p, 13, 14, 128);
+	mips_sw (p, 13, 14, 128);
+	mips_swl (p, 13, 14, 128);
+	mips_swr (p, 13, 14, 128);
+
+	mips_move (p, 15, 16);
+	mips_nop (p);
+	mips_break (p, 0);
+	mips_sync (p, 0);
+	mips_mfhi (p, 17);
+	mips_mflo (p, 17);
+	mips_mthi (p, 17);
+	mips_mtlo (p, 17);
+
+	mips_fabsd (p, 16, 18);
+	mips_fnegd (p, 16, 18);
+	mips_fsqrtd (p, 16, 18);
+	mips_faddd (p, 16, 18, 20);
+	mips_fdivd (p, 16, 18, 20);
+	mips_fmuld (p, 16, 18, 20);
+	mips_fsubd (p, 16, 18, 20);
+
+	mips_fcmpd (p, MIPS_FPU_EQ, 18, 20);
+	mips_fbfalse (p, 0xff1f);
+	mips_fbfalsel (p, 0xff1f);
+	mips_fbtrue (p, 0xff1f);
+	mips_fbtruel (p, 0xff1f);
+
+	mips_ceilwd (p, 20, 22);
+	mips_ceilld (p, 20, 22);
+	mips_floorwd (p, 20, 22);
+	mips_floorld (p, 20, 22);
+	mips_roundwd (p, 20, 22);
+	mips_roundld (p, 20, 22);
+	mips_truncwd (p, 20, 22);
+	mips_truncld (p, 20, 22);
+	mips_cvtdw (p, 20, 22);
+	mips_cvtds (p, 20, 22);
+	mips_cvtdl (p, 20, 22);
+	mips_cvtld (p, 20, 22);
+	mips_cvtsd (p, 20, 22);
+	mips_cvtwd (p, 20, 22);
+
+	mips_fmovd (p, 20, 22);
+	/* p - code is a ptrdiff_t counted in 32-bit words; cast it to match %d */
+	printf ("size: %d\n", (int) (p - code));
+
+	free (code);
+	return 0;
+}
--- a/lib/ffts/src/arch/ppc/.gitignore
+++ b/lib/ffts/src/arch/ppc/.gitignore
@ -0,0 +1,7 @@
+/Makefile
+/Makefile.in
+/.libs
+/.deps
+/*.la
+/*.lo
+/test
--- a/lib/ffts/src/arch/ppc/Makefile.am
+++ b/lib/ffts/src/arch/ppc/Makefile.am
@ -0,0 +1 @@
+EXTRA_DIST = ppc-codegen.h
--- a/lib/ffts/src/arch/ppc/ppc-codegen.h
+++ b/lib/ffts/src/arch/ppc/ppc-codegen.h
@ -0,0 +1,953 @@
+/*
+   Authors:
+     Radek Doulik
+     Christopher Taylor <ct_AT_clemson_DOT_edu>
+     Andreas Faerber <andreas.faerber@web.de>
+
+   Copyright (C)  2001 Radek Doulik
+   Copyright (C)  2007-2008 Andreas Faerber
+
+   for testing do the following: ./test | as -o test.o
+*/
+
+#ifndef __MONO_PPC_CODEGEN_H__
+#define __MONO_PPC_CODEGEN_H__
+#include <glib.h>
+#include <assert.h>
+
+typedef enum {
+	ppc_r0 = 0,
+	ppc_r1,
+	ppc_sp = ppc_r1,
+	ppc_r2,
+	ppc_r3,
+	ppc_r4,
+	ppc_r5,
+	ppc_r6,
+	ppc_r7,
+	ppc_r8,
+	ppc_r9,
+	ppc_r10,
+	ppc_r11,
+	ppc_r12,
+	ppc_r13,
+	ppc_r14,
+	ppc_r15,
+	ppc_r16,
+	ppc_r17,
+	ppc_r18,
+	ppc_r19,
+	ppc_r20,
+	ppc_r21,
+	ppc_r22,
+	ppc_r23,
+	ppc_r24,
+	ppc_r25,
+	ppc_r26,
+	ppc_r27,
+	ppc_r28,
+	ppc_r29,
+	ppc_r30,
+	ppc_r31
+} PPCIntRegister;
+
+typedef enum {
+	ppc_f0 = 0,
+	ppc_f1,
+	ppc_f2,
+	ppc_f3,
+	ppc_f4,
+	ppc_f5,
+	ppc_f6,
+	ppc_f7,
+	ppc_f8,
+	ppc_f9,
+	ppc_f10,
+	ppc_f11,
+	ppc_f12,
+	ppc_f13,
+	ppc_f14,
+	ppc_f15,
+	ppc_f16,
+	ppc_f17,
+	ppc_f18,
+	ppc_f19,
+	ppc_f20,
+	ppc_f21,
+	ppc_f22,
+	ppc_f23,
+	ppc_f24,
+	ppc_f25,
+	ppc_f26,
+	ppc_f27,
+	ppc_f28,
+	ppc_f29,
+	ppc_f30,
+	ppc_f31
+} PPCFloatRegister;
+
+/*
+ * Special-purpose register operands for ppc_mfspr/ppc_mtspr, which shift
+ * the value left by 11 when building the instruction word. The values
+ * are therefore pre-encoded for that field rather than plain register
+ * numbers.
+ * NOTE(review): they look like the SPR numbers (LR = 8, CTR = 9, XER = 1)
+ * with their two 5-bit halves swapped -- confirm against the PowerPC
+ * manual before adding new entries.
+ */
+typedef enum {
+	ppc_lr = 256,
+	ppc_ctr = 256 + 32,
+	ppc_xer = 32
+} PPCSpecialRegister;
+
+enum {
+	/* B0 operand for branches */
+	PPC_BR_DEC_CTR_NONZERO_FALSE = 0,
+	PPC_BR_LIKELY = 1, /* can be or'ed with the conditional variants */
+	PPC_BR_DEC_CTR_ZERO_FALSE = 2,
+	PPC_BR_FALSE  = 4,
+	PPC_BR_DEC_CTR_NONZERO_TRUE = 8,
+	PPC_BR_DEC_CTR_ZERO_TRUE = 10,
+	PPC_BR_TRUE   = 12,
+	PPC_BR_DEC_CTR_NONZERO = 16,
+	PPC_BR_DEC_CTR_ZERO = 18,
+	PPC_BR_ALWAYS = 20,
+	/* B1 operand for branches */
+	PPC_BR_LT     = 0,
+	PPC_BR_GT     = 1,
+	PPC_BR_EQ     = 2,
+	PPC_BR_SO     = 3
+};
+
+enum {
+	PPC_TRAP_LT = 1,
+	PPC_TRAP_GT = 2,
+	PPC_TRAP_EQ = 4,
+	PPC_TRAP_LT_UN = 8,
+	PPC_TRAP_GT_UN = 16,
+	PPC_TRAP_LE = 1 + PPC_TRAP_EQ,
+	PPC_TRAP_GE = 2 + PPC_TRAP_EQ,
+	PPC_TRAP_LE_UN = 8 + PPC_TRAP_EQ,
+	PPC_TRAP_GE_UN = 16 + PPC_TRAP_EQ
+};
+
+/*
+ * ppc_emit32: store the 32-bit instruction word x at c in big-endian
+ * byte order (GUINT32_TO_BE) and advance c by sizeof (guint32) bytes.
+ * c is expanded more than once and must be an assignable pointer.
+ */
+#define ppc_emit32(c,x) do { *((guint32 *) (c)) = GUINT32_TO_BE (x); (c) = (gpointer)((guint8 *)(c) + sizeof (guint32));} while (0)
+
+/*
+ * Immediate helpers.
+ *
+ * ppc_is_imm16:  true when val >> 15 is 0 or -1.
+ *                NOTE(review): with an unsigned val the comparison with
+ *                -1 cannot match, so negative 16-bit values are only
+ *                recognised when val has a signed type (presumably
+ *                relying on an arithmetic right shift).
+ * ppc_is_uimm16: true when val, converted to glong, lies in 0..65535.
+ * ppc_ha:        upper 16 bits of val, plus one when bit 15 of val is
+ *                set, truncated to 16 bits. val is parenthesized so that
+ *                expressions such as `a + b` are shifted and masked as a
+ *                whole.
+ */
+#define ppc_is_imm16(val) ((((val)>> 15) == 0) || (((val)>> 15) == -1))
+#define ppc_is_uimm16(val) ((glong)(val) >= 0L && (glong)(val) <= 65535L)
+#define ppc_ha(val) ((((val) >> 16) + (((val) & 0x8000) ? 1 : 0)) & 0xffff)
+
+#define ppc_load32(c,D,v) G_STMT_START {	\
+		ppc_lis ((c), (D),      (guint32)(v) >> 16);	\
+		ppc_ori ((c), (D), (D), (guint32)(v) & 0xffff);	\
+	} G_STMT_END
+
+/* Macros to load/store pointer sized quantities */
+
+#if defined(__mono_ppc64__) && !defined(__mono_ilp32__)
+
+#define ppc_ldptr(c,D,d,A)         ppc_ld   ((c), (D), (d), (A))
+#define ppc_ldptr_update(c,D,d,A)  ppc_ldu  ((c), (D), (d), (A))
+#define ppc_ldptr_indexed(c,D,A,B)        ppc_ldx  ((c), (D), (A), (B))
+#define ppc_ldptr_update_indexed(c,D,A,B) ppc_ldux ((c), (D), (A), (B))
+
+#define ppc_stptr(c,S,d,A)        ppc_std  ((c), (S), (d), (A))
+#define ppc_stptr_update(c,S,d,A) ppc_stdu ((c), (S), (d), (A))
+#define ppc_stptr_indexed(c,S,A,B)        ppc_stdx  ((c), (S), (A), (B))
+#define ppc_stptr_update_indexed(c,S,A,B) ppc_stdux ((c), (S), (A), (B))
+
+#else
+
+/* Same as ppc32 */
+#define ppc_ldptr(c,D,d,A)         ppc_lwz  ((c), (D), (d), (A))
+#define ppc_ldptr_update(c,D,d,A)  ppc_lwzu ((c), (D), (d), (A))
+#define ppc_ldptr_indexed(c,D,A,B)        ppc_lwzx ((c), (D), (A), (B))
+#define ppc_ldptr_update_indexed(c,D,A,B) ppc_lwzux ((c), (D), (A), (B))
+
+#define ppc_stptr(c,S,d,A)        ppc_stw  ((c), (S), (d), (A))
+#define ppc_stptr_update(c,S,d,A) ppc_stwu ((c), (S), (d), (A))
+#define ppc_stptr_indexed(c,S,A,B)        ppc_stwx  ((c), (S), (A), (B))
+#define ppc_stptr_update_indexed(c,S,A,B) ppc_stwux ((c), (S), (A), (B))
+
+#endif
+
+/* Macros to load pointer sized immediates */
+#define ppc_load_ptr(c,D,v) ppc_load ((c),(D),(gsize)(v))
+#define ppc_load_ptr_sequence(c,D,v) ppc_load_sequence ((c),(D),(gsize)(v))
+
+/* Macros to load/store regsize quantities */
+
+#ifdef __mono_ppc64__
+#define ppc_ldr(c,D,d,A)         ppc_ld  ((c), (D), (d), (A))
+#define ppc_ldr_indexed(c,D,A,B) ppc_ldx  ((c), (D), (A), (B))
+#define ppc_str(c,S,d,A)         ppc_std ((c), (S), (d), (A))
+#define ppc_str_update(c,S,d,A)  ppc_stdu ((c), (S), (d), (A))
+#define ppc_str_indexed(c,S,A,B) ppc_stdx ((c), (S), (A), (B))
+#define ppc_str_update_indexed(c,S,A,B) ppc_stdux ((c), (S), (A), (B))
+#else
+#define ppc_ldr(c,D,d,A)         ppc_lwz  ((c), (D), (d), (A))
+#define ppc_ldr_indexed(c,D,A,B) ppc_lwzx ((c), (D), (A), (B))
+#define ppc_str(c,S,d,A)         ppc_stw ((c), (S), (d), (A))
+#define ppc_str_update(c,S,d,A)  ppc_stwu ((c), (S), (d), (A))
+#define ppc_str_indexed(c,S,A,B) ppc_stwx ((c), (S), (A), (B))
+#define ppc_str_update_indexed(c,S,A,B) ppc_stwux ((c), (S), (A), (B))
+#endif
+
+#define ppc_str_multiple(c,S,d,A) ppc_store_multiple_regs((c),(S),(d),(A))
+#define ppc_ldr_multiple(c,D,d,A) ppc_load_multiple_regs((c),(D),(d),(A))
+
+/* PPC32 macros */
+
+#ifndef __mono_ppc64__
+
+#define ppc_load_sequence(c,D,v) ppc_load32 ((c), (D), (guint32)(v))
+
+#define PPC_LOAD_SEQUENCE_LENGTH	8
+
+#define ppc_load(c,D,v) G_STMT_START {	\
+		if (ppc_is_imm16 ((guint32)(v)))	{	\
+			ppc_li ((c), (D), (guint16)(guint32)(v));	\
+		} else {	\
+			ppc_load32 ((c), (D), (guint32)(v));	\
+		}	\
+	} G_STMT_END
+
+#define ppc_load_func(c,D,V)	      ppc_load_sequence ((c), (D), (V))
+
+#define ppc_load_multiple_regs(c,D,d,A)      ppc_lmw   ((c), (D), (d), (A))
+
+#define ppc_store_multiple_regs(c,S,d,A)      ppc_stmw  ((c), (S), (d), (A))
+
+#define ppc_compare(c,cfrD,A,B)		      ppc_cmp((c), (cfrD), 0, (A), (B))
+#define ppc_compare_reg_imm(c,cfrD,A,B)	      ppc_cmpi((c), (cfrD), 0, (A), (B))
+#define ppc_compare_log(c,cfrD,A,B)	      ppc_cmpl((c), (cfrD), 0, (A), (B))
+
+#define ppc_shift_left(c,A,S,B)		      ppc_slw((c), (S), (A), (B))
+#define ppc_shift_left_imm(c,A,S,n)	      ppc_slwi((c), (A), (S), (n))
+
+#define ppc_shift_right_imm(c,A,S,B)	      ppc_srwi((c), (A), (S), (B))
+#define ppc_shift_right_arith_imm(c,A,S,B)    ppc_srawi((c), (A), (S), (B))
+
+#define ppc_multiply(c,D,A,B)		      ppc_mullw((c), (D), (A), (B))
+
+#define ppc_clear_right_imm(c,A,S,n)	      ppc_clrrwi((c), (A), (S), (n))
+
+#endif
+
+#define ppc_opcode(c) ((c) >> 26)
+#define ppc_split_5_1_1(x) (((x) >> 5) & 0x1)
+#define ppc_split_5_1_5(x) ((x) & 0x1F)
+#define ppc_split_5_1(x) ((ppc_split_5_1_5(x) << 1) | ppc_split_5_1_1(x))
+
+#define ppc_break(c) ppc_tw((c),31,0,0)
+#define  ppc_addi(c,D,A,i) ppc_emit32 (c, (14 << 26) | ((D) << 21) | ((A) << 16) | (guint16)(i))
+#define ppc_addis(c,D,A,i) ppc_emit32 (c, (15 << 26) | ((D) << 21) | ((A) << 16) | (guint16)(i))
+#define    ppc_li(c,D,v)   ppc_addi   (c, D, 0, (guint16)(v))
+#define   ppc_lis(c,D,v)   ppc_addis  (c, D, 0, (guint16)(v))
+#define   ppc_lwz(c,D,d,A) ppc_emit32 (c, (32 << 26) | ((D) << 21) | ((A) << 16) | (guint16)(d))
+#define   ppc_lhz(c,D,d,A) ppc_emit32 (c, (40 << 26) | ((D) << 21) | ((A) << 16) | (guint16)(d))
+#define   ppc_lbz(c,D,d,A) ppc_emit32 (c, (34 << 26) | ((D) << 21) | ((A) << 16) | (guint16)(d))
+#define   ppc_stw(c,S,d,A) ppc_emit32 (c, (36 << 26) | ((S) << 21) | ((A) << 16) | (guint16)(d))
+#define   ppc_sth(c,S,d,A) ppc_emit32 (c, (44 << 26) | ((S) << 21) | ((A) << 16) | (guint16)(d))
+#define   ppc_stb(c,S,d,A) ppc_emit32 (c, (38 << 26) | ((S) << 21) | ((A) << 16) | (guint16)(d))
+#define  ppc_stwu(c,s,d,A) ppc_emit32 (c, (37 << 26) | ((s) << 21) | ((A) << 16) | (guint16)(d))
+#define    ppc_or(c,a,s,b) ppc_emit32 (c, (31 << 26) | ((s) << 21) | ((a) << 16) | ((b) << 11) | 888)
+#define    ppc_mr(c,a,s)   ppc_or     (c, a, s, s)
+#define   ppc_ori(c,S,A,ui) ppc_emit32 (c, (24 << 26) | ((S) << 21) | ((A) << 16) | (guint16)(ui))
+#define	  ppc_nop(c)       ppc_ori    (c, 0, 0, 0)
+#define ppc_mfspr(c,D,spr) ppc_emit32 (c, (31 << 26) | ((D) << 21) | ((spr) << 11) | (339 << 1))
+#define  ppc_mflr(c,D)     ppc_mfspr  (c, D, ppc_lr)
+#define ppc_mtspr(c,spr,S) ppc_emit32 (c, (31 << 26) | ((S) << 21) | ((spr) << 11) | (467 << 1))
+#define  ppc_mtlr(c,S)     ppc_mtspr  (c, ppc_lr, S)
+#define  ppc_mtctr(c,S)     ppc_mtspr  (c, ppc_ctr, S)
+#define  ppc_mtxer(c,S)     ppc_mtspr  (c, ppc_xer, S)
+
+#define  ppc_b(c,li)       ppc_emit32 (c, (18 << 26) | ((li) << 2))
+#define  ppc_bl(c,li)       ppc_emit32 (c, (18 << 26) | ((li) << 2) | 1)
+#define  ppc_ba(c,li)       ppc_emit32 (c, (18 << 26) | ((li) << 2) | 2)
+#define  ppc_bla(c,li)       ppc_emit32 (c, (18 << 26) | ((li) << 2) | 3)
+#define  ppc_blrl(c)       ppc_emit32 (c, 0x4e800021)
+#define   ppc_blr(c)       ppc_emit32 (c, 0x4e800020)
+
+#define   ppc_lfs(c,D,d,A) ppc_emit32 (c, (48 << 26) | ((D) << 21) | ((A) << 16) | (guint16)(d))
+#define   ppc_lfd(c,D,d,A) ppc_emit32 (c, (50 << 26) | ((D) << 21) | ((A) << 16) | (guint16)(d))
+#define  ppc_stfs(c,S,d,a) ppc_emit32 (c, (52 << 26) | ((S) << 21) | ((a) << 16) | (guint16)(d))
+#define  ppc_stfd(c,S,d,a) ppc_emit32 (c, (54 << 26) | ((S) << 21) | ((a) << 16) | (guint16)(d))
+
+/***********************************************************************
+The macros below were tapped out by Christopher Taylor <ct_AT_clemson_DOT_edu>
+from 18 November 2002 to 19 December 2002.
+
+Special thanks to rodo, lupus, dietmar, miguel, and duncan for patience,
+and motivation.
+
+The macros found in this file are based on the assembler instructions found 
+in Motorola and Digital DNA's:
+
+"Programming Enviornments Manual For 32-bit Implementations of the PowerPC Architecture"
+
+MPCFPE32B/AD
+12/2001
+REV2
+
+see pages 326 - 524 for detailed information regarding each instruction
+
+Also see the "Ximian Copyright Agreement, 2002" for more information regarding
+my and Ximian's copyright to this code. ;)
+*************************************************************************/
+
+/*
+ * add family.
+ *
+ * The *x macros build primary-opcode-31 instructions with OE placed in
+ * bit 10 and Rc in bit 0 of the word. The wrappers fix those two bits:
+ * no suffix = (OE 0, Rc 0), "d" = Rc set, "o" = OE set, "od" = both.
+ * ppc_addic/ppc_addicd are immediate forms (primary opcodes 12 and 13)
+ * whose immediate i is truncated to 16 bits.
+ *
+ * Every macro argument is parenthesized so that compound expressions
+ * (for example `flag | 1`) are shifted as a whole.
+ */
+#define ppc_addx(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | ((OE) << 10) | (266 << 1) | (Rc))
+#define ppc_add(c,D,A,B) ppc_addx(c,D,A,B,0,0)
+#define ppc_addd(c,D,A,B) ppc_addx(c,D,A,B,0,1)
+#define ppc_addo(c,D,A,B) ppc_addx(c,D,A,B,1,0)
+#define ppc_addod(c,D,A,B) ppc_addx(c,D,A,B,1,1)
+
+#define ppc_addcx(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | ((OE) << 10) | (10 << 1) | (Rc))
+#define ppc_addc(c,D,A,B) ppc_addcx(c,D,A,B,0,0)
+#define ppc_addcd(c,D,A,B) ppc_addcx(c,D,A,B,0,1)
+#define ppc_addco(c,D,A,B) ppc_addcx(c,D,A,B,1,0)
+#define ppc_addcod(c,D,A,B) ppc_addcx(c,D,A,B,1,1)
+
+#define ppc_addex(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | ((OE) << 10) | (138 << 1) | (Rc))
+#define ppc_adde(c,D,A,B) ppc_addex(c,D,A,B,0,0)
+#define ppc_added(c,D,A,B) ppc_addex(c,D,A,B,0,1)
+#define ppc_addeo(c,D,A,B) ppc_addex(c,D,A,B,1,0)
+#define ppc_addeod(c,D,A,B) ppc_addex(c,D,A,B,1,1)
+
+#define ppc_addic(c,D,A,i) ppc_emit32(c, (12 << 26) | ((D) << 21) | ((A) << 16) | (guint16)(i))
+#define ppc_addicd(c,D,A,i) ppc_emit32(c, (13 << 26) | ((D) << 21) | ((A) << 16) | (guint16)(i))
+
+#define ppc_addmex(c,D,A,OE,RC) ppc_emit32(c, (31 << 26) | ((D) << 21 ) | ((A) << 16) | (0 << 11) | ((OE) << 10) | (234 << 1) | (RC))
+#define ppc_addme(c,D,A) ppc_addmex(c,D,A,0,0)
+#define ppc_addmed(c,D,A) ppc_addmex(c,D,A,0,1)
+#define ppc_addmeo(c,D,A) ppc_addmex(c,D,A,1,0)
+#define ppc_addmeod(c,D,A) ppc_addmex(c,D,A,1,1)
+
+#define ppc_addzex(c,D,A,OE,RC) ppc_emit32(c, (31 << 26) | ((D) << 21 ) | ((A) << 16) | (0 << 11) | ((OE) << 10) | (202 << 1) | (RC))
+#define ppc_addze(c,D,A) ppc_addzex(c,D,A,0,0)
+#define ppc_addzed(c,D,A) ppc_addzex(c,D,A,0,1)
+#define ppc_addzeo(c,D,A) ppc_addzex(c,D,A,1,0)
+#define ppc_addzeod(c,D,A) ppc_addzex(c,D,A,1,1)
+
+/*
+ * and family.
+ *
+ * ppc_andx/ppc_andcx build primary-opcode-31 instructions with RC in
+ * bit 0; the wrappers without suffix pass RC = 0 and the "d" wrappers
+ * pass RC = 1. ppc_andid/ppc_andisd are the immediate forms (primary
+ * opcodes 28 and 29) taking a 16-bit unsigned immediate ui.
+ * RC is parenthesized so a compound expression is or'ed in as a whole.
+ */
+#define ppc_andx(c,S,A,B,RC) ppc_emit32(c, (31 << 26) | ((S) << 21 ) | ((A) << 16) | ((B) << 11) | (28 << 1) | (RC))
+#define ppc_and(c,S,A,B) ppc_andx(c,S,A,B,0)
+#define ppc_andd(c,S,A,B) ppc_andx(c,S,A,B,1)
+
+#define ppc_andcx(c,S,A,B,RC) ppc_emit32(c, (31 << 26) | ((S) << 21 ) | ((A) << 16) | ((B) << 11) | (60 << 1) | (RC))
+#define ppc_andc(c,S,A,B) ppc_andcx(c,S,A,B,0)
+#define ppc_andcd(c,S,A,B) ppc_andcx(c,S,A,B,1)
+
+#define ppc_andid(c,S,A,ui) ppc_emit32(c, (28 << 26) | ((S) << 21 ) | ((A) << 16) | ((guint16)(ui)))
+#define ppc_andisd(c,S,A,ui) ppc_emit32(c, (29 << 26) | ((S) << 21 ) | ((A) << 16) | ((guint16)(ui)))
+
+/*
+ * Conditional branch (primary opcode 16).
+ *   BO: branch-option operand (the PPC_BR_* "B0" values), bits 21..25
+ *   BI: condition bit operand, bits 16..20
+ *   BD: branch displacement, shifted left by 2; it is not masked here,
+ *       so the caller must pass a value that fits the field
+ *   AA: bit 1, LK: bit 0
+ * Wrappers: ppc_bc = (AA 0, LK 0), ppc_bca = AA set, ppc_bcl = LK set,
+ * ppc_bcla = both.
+ * The operands are parenthesized so that or'ed values such as
+ * `PPC_BR_TRUE | PPC_BR_LIKELY` are shifted as a whole.
+ */
+#define ppc_bcx(c,BO,BI,BD,AA,LK) ppc_emit32(c, (16 << 26) | ((BO) << 21 )| ((BI) << 16) | ((BD) << 2) | ((AA) << 1) | (LK))
+#define ppc_bc(c,BO,BI,BD) ppc_bcx(c,BO,BI,BD,0,0)
+#define ppc_bca(c,BO,BI,BD) ppc_bcx(c,BO,BI,BD,1,0)
+#define ppc_bcl(c,BO,BI,BD) ppc_bcx(c,BO,BI,BD,0,1)
+#define ppc_bcla(c,BO,BI,BD) ppc_bcx(c,BO,BI,BD,1,1)
+
+/*
+ * Conditional branch to the count register (primary opcode 19,
+ * extended opcode 528). LK is bit 0: ppc_bcctr passes 0, ppc_bcctrl
+ * passes 1.
+ * ppc_bnectrp and ppc_bnectrlp are aliases that still take explicit
+ * BO/BI operands; the "l" alias maps to the LK = 1 form so that it
+ * differs from ppc_bnectrp.
+ * BO, BI and LK are parenthesized so or'ed operands are shifted as a
+ * whole.
+ */
+#define ppc_bcctrx(c,BO,BI,LK) ppc_emit32(c, (19 << 26) | ((BO) << 21 )| ((BI) << 16) | (0 << 11) | (528 << 1) | (LK))
+#define ppc_bcctr(c,BO,BI) ppc_bcctrx(c,BO,BI,0)
+#define ppc_bcctrl(c,BO,BI) ppc_bcctrx(c,BO,BI,1)
+
+#define ppc_bnectrp(c,BO,BI) ppc_bcctr(c,BO,BI)
+#define ppc_bnectrlp(c,BO,BI) ppc_bcctrl(c,BO,BI)
+
+/*
+ * Conditional branch to the link register (primary opcode 19, extended
+ * opcode 16). BH occupies bits 11..12 and LK is bit 0: ppc_bclr passes
+ * LK = 0, ppc_bclrl passes LK = 1.
+ * ppc_bnelrp and ppc_bnelrlp are aliases with BH = 0 that still take
+ * explicit BO/BI operands; the "l" alias maps to the LK = 1 form so
+ * that it differs from ppc_bnelrp.
+ */
+#define ppc_bclrx(c,BO,BI,BH,LK) ppc_emit32(c, (19 << 26) | ((BO) << 21 )| ((BI) << 16) | (0 << 13) | ((BH) << 11) | (16 << 1) | (LK))
+#define ppc_bclr(c,BO,BI,BH) ppc_bclrx(c,BO,BI,BH,0)
+#define ppc_bclrl(c,BO,BI,BH) ppc_bclrx(c,BO,BI,BH,1)
+
+#define ppc_bnelrp(c,BO,BI) ppc_bclr(c,BO,BI,0)
+#define ppc_bnelrlp(c,BO,BI) ppc_bclrl(c,BO,BI,0)
+
+/*
+ * Compare instructions.
+ *   cfrD: destination condition-register field, placed at bit 23
+ *   L:    placed at bit 21
+ *   A:    first operand register, placed at bit 16
+ *   B:    second operand register (ppc_cmp/ppc_cmpl, bit 11) or 16-bit
+ *         immediate (ppc_cmpi/ppc_cmpli, truncated with guint16)
+ * ppc_cmp/ppc_cmpl use primary opcode 31 (extended opcodes 0 and 32);
+ * ppc_cmpi/ppc_cmpli use primary opcodes 11 and 10. ppc_cmpw is ppc_cmp
+ * with L = 0.
+ * All operands are parenthesized so compound expressions are shifted as
+ * a whole.
+ */
+#define ppc_cmp(c,cfrD,L,A,B) ppc_emit32(c, (31 << 26) | ((cfrD) << 23) | (0 << 22) | ((L) << 21) | ((A) << 16) | ((B) << 11) | (0 << 1) | 0)
+#define ppc_cmpi(c,cfrD,L,A,B) ppc_emit32(c, (11 << 26) | ((cfrD) << 23) | (0 << 22) | ((L) << 21) | ((A) << 16) | (guint16)(B))
+#define ppc_cmpl(c,cfrD,L,A,B) ppc_emit32(c, (31 << 26) | ((cfrD) << 23) | (0 << 22) | ((L) << 21) | ((A) << 16) | ((B) << 11) | (32 << 1) | 0)
+#define ppc_cmpli(c,cfrD,L,A,B) ppc_emit32(c, (10 << 26) | ((cfrD) << 23) | (0 << 22) | ((L) << 21) | ((A) << 16) | (guint16)(B))
+#define ppc_cmpw(c,cfrD,A,B) ppc_cmp(c, (cfrD), 0, (A), (B))
+
+#define ppc_cntlzwx(c,S,A,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (0 << 11) | (26 << 1) | Rc)
+#define ppc_cntlzw(c,S,A) ppc_cntlzwx(c,S,A,0)
+#define ppc_cntlzwd(c,S,A) ppc_cntlzwx(c,S,A,1)
+
+#define ppc_crand(c,D,A,B) ppc_emit32(c, (19 << 26) | (D << 21) | (A << 16) | (B << 11) | (257 << 1) | 0)
+#define ppc_crandc(c,D,A,B) ppc_emit32(c, (19 << 26) | (D << 21) | (A << 16) | (B << 11) | (129 << 1) | 0)
+#define ppc_creqv(c,D,A,B) ppc_emit32(c, (19 << 26) | (D << 21) | (A << 16) | (B << 11) | (289 << 1) | 0)
+#define ppc_crnand(c,D,A,B) ppc_emit32(c, (19 << 26) | (D << 21) | (A << 16) | (B << 11) | (225 << 1) | 0)
+#define ppc_crnor(c,D,A,B) ppc_emit32(c, (19 << 26) | (D << 21) | (A << 16) | (B << 11) | (33 << 1) | 0)
+#define ppc_cror(c,D,A,B) ppc_emit32(c, (19 << 26) | (D << 21) | (A << 16) | (B << 11) | (449 << 1) | 0)
+#define ppc_crorc(c,D,A,B) ppc_emit32(c, (19 << 26) | (D << 21) | (A << 16) | (B << 11) | (417 << 1) | 0)
+#define ppc_crxor(c,D,A,B) ppc_emit32(c, (19 << 26) | (D << 21) | (A << 16) | (B << 11) | (193 << 1) | 0)
+
+#define ppc_dcba(c,A,B) ppc_emit32(c, (31 << 26) | (0 << 21) | (A << 16) | (B << 11) | (758 << 1) | 0)
+#define ppc_dcbf(c,A,B) ppc_emit32(c, (31 << 26) | (0 << 21) | (A << 16) | (B << 11) | (86 << 1) | 0)
+#define ppc_dcbi(c,A,B) ppc_emit32(c, (31 << 26) | (0 << 21) | (A << 16) | (B << 11) | (470 << 1) | 0)
+#define ppc_dcbst(c,A,B) ppc_emit32(c, (31 << 26) | (0 << 21) | (A << 16) | (B << 11) | (54 << 1) | 0)
+#define ppc_dcbt(c,A,B) ppc_emit32(c, (31 << 26) | (0 << 21) | (A << 16) | (B << 11) | (278 << 1) | 0)
+#define ppc_dcbtst(c,A,B) ppc_emit32(c, (31 << 26) | (0 << 21) | (A << 16) | (B << 11) | (246 << 1) | 0)
+#define ppc_dcbz(c,A,B) ppc_emit32(c, (31 << 26) | (0 << 21) | (A << 16) | (B << 11) | (1014 << 1) | 0)
+
+#define ppc_divwx(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (OE << 10) | (491 << 1) | Rc)
+#define ppc_divw(c,D,A,B) ppc_divwx(c,D,A,B,0,0)
+#define ppc_divwd(c,D,A,B) ppc_divwx(c,D,A,B,0,1)
+#define ppc_divwo(c,D,A,B) ppc_divwx(c,D,A,B,1,0)
+#define ppc_divwod(c,D,A,B) ppc_divwx(c,D,A,B,1,1)
+
+#define ppc_divwux(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (OE << 10) | (459 << 1) | Rc)
+#define ppc_divwu(c,D,A,B) ppc_divwux(c,D,A,B,0,0)
+#define ppc_divwud(c,D,A,B) ppc_divwux(c,D,A,B,0,1)
+#define ppc_divwuo(c,D,A,B) ppc_divwux(c,D,A,B,1,0)
+#define ppc_divwuod(c,D,A,B) ppc_divwux(c,D,A,B,1,1)
+
+#define ppc_eciwx(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (310 << 1) | 0)
+#define ppc_ecowx(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (438 << 1) | 0)
+#define ppc_eieio(c) ppc_emit32(c, (31 << 26) | (0 << 21) | (0 << 16) | (0 << 11) | (854 << 1) | 0)
+
+#define ppc_eqvx(c,A,S,B,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (284 << 1) | Rc)
+#define ppc_eqv(c,A,S,B) ppc_eqvx(c,A,S,B,0)
+#define ppc_eqvd(c,A,S,B) ppc_eqvx(c,A,S,B,1)
+
+#define ppc_extsbx(c,A,S,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (0 << 11) | (954 << 1) | Rc) 
+#define ppc_extsb(c,A,S) ppc_extsbx(c,A,S,0)
+#define ppc_extsbd(c,A,S) ppc_extsbx(c,A,S,1)
+
+#define ppc_extshx(c,A,S,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (0 << 11) | (922 << 1) | Rc) 
+#define ppc_extsh(c,A,S) ppc_extshx(c,A,S,0)
+#define ppc_extshd(c,A,S) ppc_extshx(c,A,S,1)
+
+#define ppc_fabsx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (264 << 1) | Rc) 
+#define ppc_fabs(c,D,B) ppc_fabsx(c,D,B,0)
+#define ppc_fabsd(c,D,B) ppc_fabsx(c,D,B,1)
+
+#define ppc_faddx(c,D,A,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (A << 16) | (B << 11) | (0 << 6) | (21 << 1) | Rc)
+#define ppc_fadd(c,D,A,B) ppc_faddx(c,D,A,B,0)
+#define ppc_faddd(c,D,A,B) ppc_faddx(c,D,A,B,1)
+
+#define ppc_faddsx(c,D,A,B,Rc) ppc_emit32(c, (59 << 26) | (D << 21) | (A << 16) | (B << 11) | (0 << 6) | (21 << 1) | Rc)
+#define ppc_fadds(c,D,A,B) ppc_faddsx(c,D,A,B,0)
+#define ppc_faddsd(c,D,A,B) ppc_faddsx(c,D,A,B,1)
+
+#define ppc_fcmpo(c,crfD,A,B) ppc_emit32(c, (63 << 26) | (crfD << 23) | (0 << 21) | (A << 16) | (B << 11) | (32 << 1) | 0)
+#define ppc_fcmpu(c,crfD,A,B) ppc_emit32(c, (63 << 26) | (crfD << 23) | (0 << 21) | (A << 16) | (B << 11) | (0 << 1) | 0)
+
+#define ppc_fctiwx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (14 << 1) | Rc)
+#define ppc_fctiw(c,D,B) ppc_fctiwx(c,D,B,0)
+#define ppc_fctiwd(c,D,B) ppc_fctiwx(c,D,B,1)
+
+#define ppc_fctiwzx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (15 << 1) | Rc)
+#define ppc_fctiwz(c,D,B) ppc_fctiwzx(c,D,B,0)
+#define ppc_fctiwzd(c,D,B) ppc_fctiwzx(c,D,B,1)
+
+#define ppc_fdivx(c,D,A,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (A << 16) | (B << 11) | (0 << 6) | (18 << 1) | Rc)
+#define ppc_fdiv(c,D,A,B) ppc_fdivx(c,D,A,B,0)
+#define ppc_fdivd(c,D,A,B) ppc_fdivx(c,D,A,B,1)
+
+#define ppc_fdivsx(c,D,A,B,Rc) ppc_emit32(c, (59 << 26) | (D << 21) | (A << 16) | (B << 11) | (0 << 6) | (18 << 1) | Rc)
+#define ppc_fdivs(c,D,A,B) ppc_fdivsx(c,D,A,B,0)
+#define ppc_fdivsd(c,D,A,B) ppc_fdivsx(c,D,A,B,1)
+
+#define ppc_fmaddx(c,D,A,B,C,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (A << 16) | (B << 11) | (C << 6) | (29 << 1) | Rc)
+#define ppc_fmadd(c,D,A,B,C) ppc_fmaddx(c,D,A,B,C,0)
+#define ppc_fmaddd(c,D,A,B,C) ppc_fmaddx(c,D,A,B,C,1) 
+
+#define ppc_fmaddsx(c,D,A,B,C,Rc) ppc_emit32(c, (59 << 26) | (D << 21) | (A << 16) | (B << 11) | (C << 6) | (29 << 1) | Rc)
+#define ppc_fmadds(c,D,A,B,C) ppc_fmaddsx(c,D,A,B,C,0)
+#define ppc_fmaddsd(c,D,A,B,C) ppc_fmaddsx(c,D,A,B,C,1) 
+
+#define ppc_fmrx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (72 << 1) | Rc)
+#define ppc_fmr(c,D,B) ppc_fmrx(c,D,B,0)
+#define ppc_fmrd(c,D,B) ppc_fmrx(c,D,B,1)
+
+#define ppc_fmsubx(c,D,A,C,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (A << 16) | (B << 11) | (C << 6) | (28 << 1) | Rc)
+#define ppc_fmsub(c,D,A,C,B) ppc_fmsubx(c,D,A,C,B,0)
+#define ppc_fmsubd(c,D,A,C,B) ppc_fmsubx(c,D,A,C,B,1)
+
+#define ppc_fmsubsx(c,D,A,C,B,Rc) ppc_emit32(c, (59 << 26) | (D << 21) | (A << 16) | (B << 11) | (C << 6) | (28 << 1) | Rc)
+#define ppc_fmsubs(c,D,A,C,B) ppc_fmsubsx(c,D,A,C,B,0)
+#define ppc_fmsubsd(c,D,A,C,B) ppc_fmsubsx(c,D,A,C,B,1)
+
+#define ppc_fmulx(c,D,A,C,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (A << 16) | (0 << 11) | (C << 6) | (25 << 1) | Rc) 
+#define ppc_fmul(c,D,A,C) ppc_fmulx(c,D,A,C,0)
+#define ppc_fmuld(c,D,A,C) ppc_fmulx(c,D,A,C,1)
+
+#define ppc_fmulsx(c,D,A,C,Rc) ppc_emit32(c, (59 << 26) | (D << 21) | (A << 16) | (0 << 11) | (C << 6) | (25 << 1) | Rc) 
+#define ppc_fmuls(c,D,A,C) ppc_fmulsx(c,D,A,C,0)
+#define ppc_fmulsd(c,D,A,C) ppc_fmulsx(c,D,A,C,1)
+
+#define ppc_fnabsx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (136 << 1) | Rc)
+#define ppc_fnabs(c,D,B) ppc_fnabsx(c,D,B,0)
+#define ppc_fnabsd(c,D,B) ppc_fnabsx(c,D,B,1)
+
+#define ppc_fnegx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (40 << 1) | Rc)
+#define ppc_fneg(c,D,B) ppc_fnegx(c,D,B,0)
+#define ppc_fnegd(c,D,B) ppc_fnegx(c,D,B,1)
+
+#define ppc_fnmaddx(c,D,A,C,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (A << 16) | (B << 11) | (C << 6) | (31 << 1) | Rc)
+#define ppc_fnmadd(c,D,A,C,B) ppc_fnmaddx(c,D,A,C,B,0)
+#define ppc_fnmaddd(c,D,A,C,B) ppc_fnmaddx(c,D,A,C,B,1)
+
+#define ppc_fnmaddsx(c,D,A,C,B,Rc) ppc_emit32(c, (59 << 26) | (D << 21) | (A << 16) | (B << 11) | (C << 6) | (31 << 1) | Rc)
+#define ppc_fnmadds(c,D,A,C,B) ppc_fnmaddsx(c,D,A,C,B,0)
+#define ppc_fnmaddsd(c,D,A,C,B) ppc_fnmaddsx(c,D,A,C,B,1)
+
+#define ppc_fnmsubx(c,D,A,C,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (A << 16) | (B << 11) | (C << 6) | (30 << 1) | Rc)
+#define ppc_fnmsub(c,D,A,C,B) ppc_fnmsubx(c,D,A,C,B,0)
+#define ppc_fnmsubd(c,D,A,C,B) ppc_fnmsubx(c,D,A,C,B,1)
+
+#define ppc_fnmsubsx(c,D,A,C,B,Rc) ppc_emit32(c, (59 << 26) | (D << 21) | (A << 16) | (B << 11) | (C << 6) | (30 << 1) | Rc)
+#define ppc_fnmsubs(c,D,A,C,B) ppc_fnmsubsx(c,D,A,C,B,0)
+#define ppc_fnmsubsd(c,D,A,C,B) ppc_fnmsubsx(c,D,A,C,B,1)
+
+#define ppc_fresx(c,D,B,Rc) ppc_emit32(c, (59 << 26) | (D << 21) | (0 << 16) | (B << 11) | (0 << 6) | (24 << 1) | Rc)
+#define ppc_fres(c,D,B) ppc_fresx(c,D,B,0)
+#define ppc_fresd(c,D,B) ppc_fresx(c,D,B,1)
+
+#define ppc_frspx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (12 << 1) | Rc)
+#define ppc_frsp(c,D,B) ppc_frspx(c,D,B,0)
+#define ppc_frspd(c,D,B) ppc_frspx(c,D,B,1)
+
+#define ppc_frsqrtex(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (0 << 6) | (26 << 1) | Rc)
+#define ppc_frsqrte(c,D,B) ppc_frsqrtex(c,D,B,0)
+#define ppc_frsqrted(c,D,B) ppc_frsqrtex(c,D,B,1)
+
+#define ppc_fselx(c,D,A,C,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (A << 16) | (B << 11) | (C << 6) | (23 << 1) | Rc)
+#define ppc_fsel(c,D,A,C,B) ppc_fselx(c,D,A,C,B,0)
+#define ppc_fseld(c,D,A,C,B) ppc_fselx(c,D,A,C,B,1)
+
+#define ppc_fsqrtx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (B << 11) | (0 << 6) | (22 << 1) | Rc)
+#define ppc_fsqrt(c,D,B) ppc_fsqrtx(c,D,B,0)
+#define ppc_fsqrtd(c,D,B) ppc_fsqrtx(c,D,B,1)
+
+#define ppc_fsqrtsx(c,D,B,Rc) ppc_emit32(c, (59 << 26) | (D << 21) | (0 << 16) | (B << 11) | (0 << 6) | (22 << 1) | Rc)
+#define ppc_fsqrts(c,D,B) ppc_fsqrtsx(c,D,B,0)
+#define ppc_fsqrtsd(c,D,B) ppc_fsqrtsx(c,D,B,1)
+
+#define ppc_fsubx(c,D,A,B,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (A << 16) | (B << 11) | (0 << 6) | (20 << 1) | Rc)
+#define ppc_fsub(c,D,A,B) ppc_fsubx(c,D,A,B,0)
+#define ppc_fsubd(c,D,A,B) ppc_fsubx(c,D,A,B,1)
+
+#define ppc_fsubsx(c,D,A,B,Rc) ppc_emit32(c, (59 << 26) | (D << 21) | (A << 16) | (B << 11) | (0 << 6) | (20 << 1) | Rc)
+#define ppc_fsubs(c,D,A,B) ppc_fsubsx(c,D,A,B,0)
+#define ppc_fsubsd(c,D,A,B) ppc_fsubsx(c,D,A,B,1)
+
+#define ppc_icbi(c,A,B) ppc_emit32(c, (31 << 26) | (0 << 21) | (A << 16) | (B << 11) | (982 << 1) | 0)
+
+#define ppc_isync(c) ppc_emit32(c, (19 << 26) | (0 << 11) | (150 << 1) | 0)
+
+#define ppc_lbzu(c,D,d,A) ppc_emit32(c, (35 << 26) | (D << 21) | (A << 16) | (guint16)d)
+#define ppc_lbzux(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (119 << 1) | 0)
+#define ppc_lbzx(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (87 << 1) | 0)
+
+#define ppc_lfdu(c,D,d,A) ppc_emit32(c, (51 << 26) | (D << 21) | (A << 16) | (guint16)d)
+#define ppc_lfdux(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (631 << 1) | 0)
+#define ppc_lfdx(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (599 << 1) | 0)
+
+#define ppc_lfsu(c,D,d,A) ppc_emit32(c, (49 << 26) | (D << 21) | (A << 16) | (guint16)d)
+#define ppc_lfsux(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (567 << 1) | 0)
+#define ppc_lfsx(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (535 << 1) | 0)
+
+#define ppc_lha(c,D,d,A) ppc_emit32(c, (42 << 26) | (D << 21) | (A << 16) | (guint16)d)
+#define ppc_lhau(c,D,d,A) ppc_emit32(c, (43 << 26) | (D << 21) | (A << 16) | (guint16)d)
+#define ppc_lhaux(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (375 << 1) | 0)
+#define ppc_lhax(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (343 << 1) | 0)
+#define ppc_lhbrx(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (790 << 1) | 0)
+#define ppc_lhzu(c,D,d,A) ppc_emit32(c, (41 << 26) | (D << 21) | (A << 16) | (guint16)d)
+
+#define ppc_lhzux(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (311 << 1) | 0)
+#define ppc_lhzx(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (279 << 1) | 0)
+
+#define ppc_lmw(c,D,d,A) ppc_emit32(c, (46 << 26) | (D << 21) | (A << 16) | (guint16)d)
+
+#define ppc_lswi(c,D,A,NB) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (NB << 11) | (597 << 1) | 0)
+#define ppc_lswx(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (533 << 1) | 0)
+#define ppc_lwarx(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (20 << 1) | 0)
+#define ppc_lwbrx(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (534 << 1) | 0)
+
+#define ppc_lwzu(c,D,d,A) ppc_emit32(c, (33 << 26) | (D << 21) | (A << 16) | (guint16)d)
+#define ppc_lwzux(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (55 << 1) | 0)
+#define ppc_lwzx(c,D,A,B) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (23 << 1) | 0)
+
+#define ppc_mcrf(c,crfD,crfS) ppc_emit32(c, (19 << 26) | (crfD << 23) | (0 << 21) | (crfS << 18) | 0)
+#define ppc_mcrfs(c,crfD,crfS) ppc_emit32(c, (63 << 26) | (crfD << 23) | (0 << 21) | (crfS << 18) | (0 << 16) | (64 << 1) | 0)
+#define ppc_mcrxr(c,crfD) ppc_emit32(c, (31 << 26) | (crfD << 23) | (0 << 16) | (512 << 1) | 0)
+
+#define ppc_mfcr(c,D) ppc_emit32(c, (31 << 26) | (D << 21) | (0 << 16) | (19 << 1) | 0)
+#define ppc_mffsx(c,D,Rc) ppc_emit32(c, (63 << 26) | (D << 21) | (0 << 16) | (583 << 1) | Rc)
+#define ppc_mffs(c,D) ppc_mffsx(c,D,0)
+#define ppc_mffsd(c,D) ppc_mffsx(c,D,1)
+#define ppc_mfmsr(c,D) ppc_emit32(c, (31 << 26) | (D << 21) | (0 << 16) | (83 << 1) | 0)
+#define ppc_mfsr(c,D,SR) ppc_emit32(c, (31 << 26) | (D << 21) | (0 << 20) | (SR << 16) | (0 << 11) | (595 << 1) | 0)
+#define ppc_mfsrin(c,D,B) ppc_emit32(c, (31 << 26) | (D << 21) | (0 << 16) | (B << 11) | (659 << 1) | 0)
+#define ppc_mftb(c,D,TBR) ppc_emit32(c, (31 << 26) | (D << 21) | (TBR << 11) | (371 << 1) | 0)
+
+#define ppc_mtcrf(c,CRM,S) ppc_emit32(c, (31 << 26) | (S << 21) | (0 << 20) | (CRM << 12) | (0 << 11) | (144 << 1) | 0)
+
+#define ppc_mtfsb0x(c,CRB,Rc) ppc_emit32(c, (63 << 26) | (CRB << 21) | (0 << 11) | (70 << 1) | Rc)
+#define ppc_mtfsb0(c,CRB) ppc_mtfsb0x(c,CRB,0)
+#define ppc_mtfsb0d(c,CRB) ppc_mtfsb0x(c,CRB,1)
+
+#define ppc_mtfsb1x(c,CRB,Rc) ppc_emit32(c, (63 << 26) | (CRB << 21) | (0 << 11) | (38 << 1) | Rc)
+#define ppc_mtfsb1(c,CRB) ppc_mtfsb1x(c,CRB,0)
+#define ppc_mtfsb1d(c,CRB) ppc_mtfsb1x(c,CRB,1)
+
+#define ppc_mtfsfx(c,FM,B,Rc) ppc_emit32(c, (63 << 26) | (0 << 25) | (FM << 22) | (0 << 21) | (B << 11) | (711 << 1) | Rc)
+#define ppc_mtfsf(c,FM,B) ppc_mtfsfx(c,FM,B,0)
+#define ppc_mtfsfd(c,FM,B) ppc_mtfsfx(c,FM,B,1)
+
+#define ppc_mtfsfix(c,crfD,IMM,Rc) ppc_emit32(c, (63 << 26) | (crfD << 23) | (0 << 16) | (IMM << 12) | (0 << 11) | (134 << 1) | Rc)
+#define ppc_mtfsfi(c,crfD,IMM) ppc_mtfsfix(c,crfD,IMM,0)
+#define ppc_mtfsfid(c,crfD,IMM) ppc_mtfsfix(c,crfD,IMM,1)
+
+#define ppc_mtmsr(c, S) ppc_emit32(c, (31 << 26) | (S << 21) | (0 << 11) | (146 << 1) | 0)
+
+#define ppc_mtsr(c,SR,S) ppc_emit32(c, (31 << 26) | (S << 21) | (0 << 20) | (SR << 16) | (0 << 11) | (210 << 1) | 0)
+#define ppc_mtsrin(c,S,B) ppc_emit32(c, (31 << 26) | (S << 21) | (0 << 16) | (B << 11) | (242 << 1) | 0)
+
+#define ppc_mulhwx(c,D,A,B,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (0 << 10) | (75 << 1) | Rc)
+#define ppc_mulhw(c,D,A,B) ppc_mulhwx(c,D,A,B,0)
+#define ppc_mulhwd(c,D,A,B) ppc_mulhwx(c,D,A,B,1)
+
+#define ppc_mulhwux(c,D,A,B,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (0 << 10) | (11 << 1) | Rc)
+#define ppc_mulhwu(c,D,A,B) ppc_mulhwux(c,D,A,B,0)
+#define ppc_mulhwud(c,D,A,B) ppc_mulhwux(c,D,A,B,1)
+
+/* mulli: opcode 7.  D goes to bit 21, A to bit 16 and SIMM, truncated to
+   16 bits, fills the low half-word.  The opcode is written in decimal
+   (it was the octal literal 07) and the operands are parenthesized. */
+#define ppc_mulli(c,D,A,SIMM) ppc_emit32(c, (7 << 26) | ((D) << 21) | ((A) << 16) | (guint16)(SIMM))
+
+#define ppc_mullwx(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (OE << 10) | (235 << 1) | Rc)
+#define ppc_mullw(c,D,A,B) ppc_mullwx(c,D,A,B,0,0)
+#define ppc_mullwd(c,D,A,B) ppc_mullwx(c,D,A,B,0,1)
+#define ppc_mullwo(c,D,A,B) ppc_mullwx(c,D,A,B,1,0)
+#define ppc_mullwod(c,D,A,B) ppc_mullwx(c,D,A,B,1,1)
+
+#define ppc_nandx(c,A,S,B,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (476 << 1) | Rc)
+#define ppc_nand(c,A,S,B) ppc_nandx(c,A,S,B,0)
+#define ppc_nandd(c,A,S,B) ppc_nandx(c,A,S,B,1)
+
+#define ppc_negx(c,D,A,OE,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (0 << 11) | (OE << 10) | (104 << 1) | Rc)
+#define ppc_neg(c,D,A) ppc_negx(c,D,A,0,0)
+#define ppc_negd(c,D,A) ppc_negx(c,D,A,0,1)
+#define ppc_nego(c,D,A) ppc_negx(c,D,A,1,0)
+#define ppc_negod(c,D,A) ppc_negx(c,D,A,1,1)
+
+#define ppc_norx(c,A,S,B,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (124 << 1) | Rc)
+#define ppc_nor(c,A,S,B) ppc_norx(c,A,S,B,0)
+#define ppc_nord(c,A,S,B) ppc_norx(c,A,S,B,1)
+
+#define ppc_not(c,A,S) ppc_norx(c,A,S,S,0)
+
+#define ppc_orx(c,A,S,B,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (444 << 1) | Rc)
+#define ppc_ord(c,A,S,B) ppc_orx(c,A,S,B,1)
+
+#define ppc_orcx(c,A,S,B,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (412 << 1) | Rc)
+#define ppc_orc(c,A,S,B) ppc_orcx(c,A,S,B,0)
+#define ppc_orcd(c,A,S,B) ppc_orcx(c,A,S,B,1)
+
+#define ppc_oris(c,A,S,UIMM) ppc_emit32(c, (25 << 26) | (S << 21) | (A << 16) | (guint16)(UIMM))
+
+#define ppc_rfi(c) ppc_emit32(c, (19 << 26) | (0 << 11) | (50 << 1) | 0)
+
+/* rlwimi: opcode 20.  S goes to bit 21, A to bit 16, SH to bit 11, MB to
+   bit 6, ME to bit 1 and Rc to bit 0.  Operands are parenthesized to match
+   ppc_rlwinmx below. */
+#define ppc_rlwimix(c,A,S,SH,MB,ME,Rc) ppc_emit32(c, (20 << 26) | ((S) << 21) | ((A) << 16) | ((SH) << 11) | ((MB) << 6) | ((ME) << 1) | (Rc))
+#define ppc_rlwimi(c,A,S,SH,MB,ME) ppc_rlwimix(c,A,S,SH,MB,ME,0)
+#define ppc_rlwimid(c,A,S,SH,MB,ME) ppc_rlwimix(c,A,S,SH,MB,ME,1)
+
+/* rlwinm: opcode 21, same field layout as rlwimi.
+   SH is reduced modulo 32 before it is placed in its 5-bit field.  The
+   derived macros below can produce SH == 32 (ppc_srwi and ppc_rotrwi with
+   n == 0, ppc_extrwi with b + n == 32); unmasked, that value would set a
+   bit in the A field.  Shift amounts 0..31 encode exactly as before. */
+#define ppc_rlwinmx(c,A,S,SH,MB,ME,Rc) ppc_emit32(c, (21 << 26) | ((S) << 21) | ((A) << 16) | (((SH) & 31) << 11) | ((MB) << 6) | ((ME) << 1) | (Rc))
+#define ppc_rlwinm(c,A,S,SH,MB,ME) ppc_rlwinmx(c,A,S,SH,MB,ME,0)
+#define ppc_rlwinmd(c,A,S,SH,MB,ME) ppc_rlwinmx(c,A,S,SH,MB,ME,1)
+/* Shift/rotate/extract/clear shorthands, each a single rlwinm. */
+#define ppc_extlwi(c,A,S,n,b) ppc_rlwinm(c,A,S, b, 0, (n) - 1)
+#define ppc_extrwi(c,A,S,n,b) ppc_rlwinm(c,A,S, (b) + (n), 32 - (n), 31)
+#define ppc_rotlwi(c,A,S,n) ppc_rlwinm(c,A,S, n, 0, 31)
+#define ppc_rotrwi(c,A,S,n) ppc_rlwinm(c,A,S, 32 - (n), 0, 31)
+#define ppc_slwi(c,A,S,n) ppc_rlwinm(c,A,S, n, 0, 31 - (n))
+#define ppc_srwi(c,A,S,n) ppc_rlwinm(c,A,S, 32 - (n), n, 31)
+#define ppc_clrlwi(c,A,S,n) ppc_rlwinm(c,A,S, 0, n, 31)
+#define ppc_clrrwi(c,A,S,n) ppc_rlwinm(c,A,S, 0, 0, 31 - (n))
+#define ppc_clrlslwi(c,A,S,b,n) ppc_rlwinm(c,A,S, n, (b) - (n), 31 - (n))
+
+/* rlwnm: opcode 23, same field layout; the macro names the bit-11 field SH.
+   Operands are parenthesized to match ppc_rlwinmx. */
+#define ppc_rlwnmx(c,A,S,SH,MB,ME,Rc) ppc_emit32(c, (23 << 26) | ((S) << 21) | ((A) << 16) | ((SH) << 11) | ((MB) << 6) | ((ME) << 1) | (Rc))
+#define ppc_rlwnm(c,A,S,SH,MB,ME) ppc_rlwnmx(c,A,S,SH,MB,ME,0)
+#define ppc_rlwnmd(c,A,S,SH,MB,ME) ppc_rlwnmx(c,A,S,SH,MB,ME,1)
+
+#define ppc_sc(c) ppc_emit32(c, (17 << 26) | (0 << 2) | (1 << 1) | 0)
+
+#define ppc_slwx(c,S,A,B,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (24 << 1) | Rc)
+#define ppc_slw(c,S,A,B) ppc_slwx(c,S,A,B,0)
+#define ppc_slwd(c,S,A,B) ppc_slwx(c,S,A,B,1)
+
+#define ppc_srawx(c,A,S,B,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (792 << 1) | Rc)
+#define ppc_sraw(c,A,S,B) ppc_srawx(c,A,S,B,0)
+#define ppc_srawd(c,A,S,B) ppc_srawx(c,A,S,B,1)
+
+#define ppc_srawix(c,A,S,SH,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (SH << 11) | (824 << 1) | Rc)
+#define ppc_srawi(c,A,S,B) ppc_srawix(c,A,S,B,0)
+#define ppc_srawid(c,A,S,B) ppc_srawix(c,A,S,B,1)
+
+#define ppc_srwx(c,A,S,SH,Rc) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (SH << 11) | (536 << 1) | Rc)
+#define ppc_srw(c,A,S,B) ppc_srwx(c,A,S,B,0)
+#define ppc_srwd(c,A,S,B) ppc_srwx(c,A,S,B,1)
+
+#define ppc_stbu(c,S,d,A) ppc_emit32(c, (39 << 26) | (S << 21) | (A << 16) | (guint16)(d))
+
+#define ppc_stbux(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (247 << 1) | 0)
+#define ppc_stbx(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (215 << 1) | 0)
+
+#define ppc_stfdu(c,S,d,A) ppc_emit32(c, (55 << 26) | (S << 21) | (A << 16) | (guint16)(d))
+
+#define ppc_stfdx(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (727 << 1) | 0)
+#define ppc_stfiwx(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (983 << 1) | 0)
+
+#define ppc_stfsu(c,S,d,A) ppc_emit32(c, (53 << 26) | (S << 21) | (A << 16) | (guint16)(d))
+#define ppc_stfsux(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (695 << 1) | 0)  
+#define ppc_stfsx(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (663 << 1) | 0)  
+#define ppc_sthbrx(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (918 << 1) | 0)  
+#define ppc_sthu(c,S,d,A) ppc_emit32(c, (45 << 26) | (S << 21) | (A << 16) | (guint16)(d))
+#define ppc_sthux(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (439 << 1) | 0)
+#define ppc_sthx(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (407 << 1) | 0)
+#define ppc_stmw(c,S,d,A) ppc_emit32(c, (47 << 26) | (S << 21) | (A << 16) | (guint16)d)
+#define ppc_stswi(c,S,A,NB) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (NB << 11) | (725 << 1) | 0)
+#define ppc_stswx(c,S,A,NB) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (NB << 11) | (661 << 1) | 0)
+#define ppc_stwbrx(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (662 << 1) | 0)
+#define ppc_stwcxd(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (150 << 1) | 1)
+#define ppc_stwux(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (183 << 1) | 0)
+#define ppc_stwx(c,S,A,B) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (151 << 1) | 0)
+
+#define ppc_subfx(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (OE << 10) | (40 << 1) | Rc)
+#define ppc_subf(c,D,A,B) ppc_subfx(c,D,A,B,0,0)
+#define ppc_subfd(c,D,A,B) ppc_subfx(c,D,A,B,0,1)
+#define ppc_subfo(c,D,A,B) ppc_subfx(c,D,A,B,1,0)
+#define ppc_subfod(c,D,A,B) ppc_subfx(c,D,A,B,1,1)
+
+#define ppc_sub(c,D,A,B) ppc_subf(c,D,B,A)
+
+#define ppc_subfcx(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (OE << 10) | (8 << 1) | Rc)
+#define ppc_subfc(c,D,A,B) ppc_subfcx(c,D,A,B,0,0)
+#define ppc_subfcd(c,D,A,B) ppc_subfcx(c,D,A,B,0,1)
+#define ppc_subfco(c,D,A,B) ppc_subfcx(c,D,A,B,1,0)
+#define ppc_subfcod(c,D,A,B) ppc_subfcx(c,D,A,B,1,1)
+
+#define ppc_subfex(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (B << 11) | (OE << 10) | (136 << 1) | Rc)
+#define ppc_subfe(c,D,A,B) ppc_subfex(c,D,A,B,0,0)
+#define ppc_subfed(c,D,A,B) ppc_subfex(c,D,A,B,0,1)
+#define ppc_subfeo(c,D,A,B) ppc_subfex(c,D,A,B,1,0)
+#define ppc_subfeod(c,D,A,B) ppc_subfex(c,D,A,B,1,1)
+
+#define ppc_subfic(c,D,A,SIMM) ppc_emit32(c, (8 << 26) | (D << 21) | (A << 16) | (guint16)(SIMM))
+
+#define ppc_subfmex(c,D,A,OE,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (0 << 11) | (OE << 10) | (232 << 1) | Rc)
+#define ppc_subfme(c,D,A) ppc_subfmex(c,D,A,0,0)
+#define ppc_subfmed(c,D,A) ppc_subfmex(c,D,A,0,1)
+#define ppc_subfmeo(c,D,A) ppc_subfmex(c,D,A,1,0)
+#define ppc_subfmeod(c,D,A) ppc_subfmex(c,D,A,1,1)
+
+#define ppc_subfzex(c,D,A,OE,Rc) ppc_emit32(c, (31 << 26) | (D << 21) | (A << 16) | (0 << 11) | (OE << 10) | (200 << 1) | Rc)
+#define ppc_subfze(c,D,A) ppc_subfzex(c,D,A,0,0)
+#define ppc_subfzed(c,D,A) ppc_subfzex(c,D,A,0,1)
+#define ppc_subfzeo(c,D,A) ppc_subfzex(c,D,A,1,0)
+#define ppc_subfzeod(c,D,A) ppc_subfzex(c,D,A,1,1)
+
+#define ppc_sync(c) ppc_emit32(c, (31 << 26) | (0 << 11) | (598 << 1) | 0)
+#define ppc_tlbia(c) ppc_emit32(c, (31 << 26) | (0 << 11) | (370 << 1) | 0)
+#define ppc_tlbie(c,B) ppc_emit32(c, (31 << 26) | (0 << 16) | (B << 11) | (306 << 1) | 0)
+#define ppc_tlbsync(c) ppc_emit32(c, (31 << 26) | (0 << 11) | (566 << 1) | 0)
+
+#define ppc_tw(c,TO,A,B) ppc_emit32(c, (31 << 26) | (TO << 21) | (A << 16) | (B << 11) | (4 << 1) | 0)
+#define ppc_twi(c,TO,A,SIMM) ppc_emit32(c, (3 << 26) | (TO << 21) | (A << 16) | (guint16)(SIMM))
+
+#define ppc_xorx(c,A,S,B,RC) ppc_emit32(c, (31 << 26) | (S << 21) | (A << 16) | (B << 11) | (316 << 1) | RC)
+#define ppc_xor(c,A,S,B) ppc_xorx(c,A,S,B,0)
+#define ppc_xord(c,A,S,B) ppc_xorx(c,A,S,B,1)
+
+#define ppc_xori(c,S,A,UIMM) ppc_emit32(c, (26 << 26) | (S << 21) | (A << 16) | (guint16)(UIMM))
+#define ppc_xoris(c,S,A,UIMM) ppc_emit32(c, (27 << 26) | (S << 21) | (A << 16) | (guint16)(UIMM))
+
+/* this marks the end of my work, ct */
+
+/* PPC64 */
+
+/* The following FP instructions are not available to 32-bit
+   implementations (prior to PowerISA-V2.01) but are available to
+   32-bit mode programs on 64-bit PowerPC implementations and all
+   processors compliant with PowerISA-2.01 or later.  */
+
+#define ppc_fcfidx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | ((D) << 21) | (0 << 16) | ((B) << 11) | (846 << 1) | (Rc))
+#define ppc_fcfid(c,D,B)  ppc_fcfidx(c,D,B,0)
+#define ppc_fcfidd(c,D,B) ppc_fcfidx(c,D,B,1)
+
+#define ppc_fctidx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | ((D) << 21) | (0 << 16) | ((B) << 11) | (814 << 1) | (Rc))
+#define ppc_fctid(c,D,B)  ppc_fctidx(c,D,B,0)
+#define ppc_fctidd(c,D,B) ppc_fctidx(c,D,B,1)
+
+#define ppc_fctidzx(c,D,B,Rc) ppc_emit32(c, (63 << 26) | ((D) << 21) | (0 << 16) | ((B) << 11) | (815 << 1) | (Rc))
+#define ppc_fctidz(c,D,B)  ppc_fctidzx(c,D,B,0)
+#define ppc_fctidzd(c,D,B) ppc_fctidzx(c,D,B,1)
+
+#ifdef __mono_ppc64__
+
+/* Load the full 64-bit constant v into register D using a fixed five-step
+   sequence, 16 bits at a time: bits 63..48 via ppc_lis, bits 47..32 via
+   ppc_ori, a 32-bit left shift via ppc_sldi, then bits 31..16 via ppc_oris
+   and bits 15..0 via ppc_ori.  The same steps are emitted whatever the
+   value of v.  v is expanded four times, so it should be free of side
+   effects. */
+#define ppc_load_sequence(c,D,v) G_STMT_START {	\
+		ppc_lis  ((c), (D),      ((guint64)(v) >> 48) & 0xffff);	\
+		ppc_ori  ((c), (D), (D), ((guint64)(v) >> 32) & 0xffff);	\
+		ppc_sldi ((c), (D), (D), 32); \
+		ppc_oris ((c), (D), (D), ((guint64)(v) >> 16) & 0xffff);	\
+		ppc_ori  ((c), (D), (D),  (guint64)(v)        & 0xffff);	\
+	} G_STMT_END
+
+/* Presumably the byte length of the sequence above (5 steps x 4 bytes,
+   assuming each step emits one 32-bit instruction) -- TODO confirm. */
+#define PPC_LOAD_SEQUENCE_LENGTH	20
+
+/* True when val, taken as a signed 64-bit integer, fits in 32 (resp. 48)
+   bits: an arithmetic right shift by 31 (resp. 47) must leave either all
+   zero bits or all one bits.  The argument is parenthesized before the
+   cast so that an expression argument is converted as a whole rather than
+   only its first operand. */
+#define ppc_is_imm32(val) (((((gint64)(val))>> 31) == 0) || ((((gint64)(val))>> 31) == -1))
+#define ppc_is_imm48(val) (((((gint64)(val))>> 47) == 0) || ((((gint64)(val))>> 47) == -1))
+
+/* Load a constant into register D in four steps: ppc_li with bits 47..32
+   of v, a 32-bit left shift via ppc_sldi, then bits 31..16 via ppc_oris
+   and bits 15..0 via ppc_ori.  ppc_load uses this form only when
+   ppc_is_imm48 accepts the value.  v is expanded three times. */
+#define ppc_load48(c,D,v) G_STMT_START {	\
+		ppc_li   ((c), (D), ((gint64)(v) >> 32) & 0xffff);	\
+		ppc_sldi ((c), (D), (D), 32); \
+		ppc_oris ((c), (D), (D), ((guint64)(v) >> 16) & 0xffff);	\
+		ppc_ori  ((c), (D), (D),  (guint64)(v)        & 0xffff);	\
+	} G_STMT_END
+
+/* Load constant v into register D, picking an emitter by the width v
+   needs.  The tests run in order -- ppc_is_imm16, ppc_is_imm32,
+   ppc_is_imm48 -- and select ppc_li, ppc_load32 or ppc_load48
+   respectively; anything wider falls back to ppc_load_sequence.
+   The emitted code therefore varies in length with the value of v.
+   v is expanded several times, so it should be free of side effects. */
+#define ppc_load(c,D,v) G_STMT_START {	\
+		if (ppc_is_imm16 ((guint64)(v)))	{	\
+			ppc_li ((c), (D), (guint16)(guint64)(v));	\
+		} else if (ppc_is_imm32 ((guint64)(v))) {	\
+			ppc_load32 ((c), (D), (guint32)(guint64)(v)); \
+		} else if (ppc_is_imm48 ((guint64)(v))) {	\
+			ppc_load48 ((c), (D), (guint64)(v)); \
+		} else {	\
+			ppc_load_sequence ((c), (D), (guint64)(v)); \
+		}	\
+	} G_STMT_END
+
+/* Materialize the address v in ppc_r11 with the fixed-length
+   ppc_load_sequence, then load two pointer-sized words through it: the
+   word at offset sizeof (gpointer) into ppc_r2 and the word at offset 0
+   into D.  Overwrites ppc_r11 and ppc_r2.
+   NOTE(review): presumably v is the address of a function descriptor
+   (entry point followed by TOC pointer) -- confirm against the target ABI. */
+#define ppc_load_func(c,D,v) G_STMT_START { \
+		ppc_load_sequence ((c), ppc_r11, (guint64)(gsize)(v));	\
+		ppc_ldptr ((c), ppc_r2, sizeof (gpointer), ppc_r11);	\
+		ppc_ldptr ((c), (D), 0, ppc_r11);	\
+	} G_STMT_END
+
+/* Emit one ppc_ldr per register from D up to and including register 31.
+   The first load uses displacement d from base register A; each later
+   load advances the displacement by sizeof (guint64). */
+#define ppc_load_multiple_regs(c,D,d,A) G_STMT_START { \
+		int __off = (d); \
+		int __reg = (D); \
+		while (__reg <= 31) { \
+			ppc_ldr ((c), __reg, __off, (A)); \
+			__off += sizeof (guint64); \
+			++__reg; \
+		} \
+	} G_STMT_END
+
+/* Emit one ppc_str per register from S up to and including register 31.
+   The first store uses displacement d from base register A; each later
+   store advances the displacement by sizeof (guint64). */
+#define ppc_store_multiple_regs(c,S,d,A) G_STMT_START { \
+		int __off = (d); \
+		int __reg = (S); \
+		while (__reg <= 31) { \
+			ppc_str ((c), __reg, __off, (A)); \
+			__off += sizeof (guint64); \
+			++__reg; \
+		} \
+	} G_STMT_END
+
+#define ppc_compare(c,cfrD,A,B)		      ppc_cmp((c), (cfrD), 1, (A), (B))
+#define ppc_compare_reg_imm(c,cfrD,A,B)	      ppc_cmpi((c), (cfrD), 1, (A), (B))
+#define ppc_compare_log(c,cfrD,A,B)	      ppc_cmpl((c), (cfrD), 1, (A), (B))
+
+#define ppc_shift_left(c,A,S,B)		      ppc_sld((c), (A), (S), (B))
+#define ppc_shift_left_imm(c,A,S,n)	      ppc_sldi((c), (A), (S), (n))
+
+#define ppc_shift_right_imm(c,A,S,B)	      ppc_srdi((c), (A), (S), (B))
+#define ppc_shift_right_arith_imm(c,A,S,B)    ppc_sradi((c), (A), (S), (B))
+
+#define ppc_multiply(c,D,A,B)		      ppc_mulld((c), (D), (A), (B))
+
+#define ppc_clear_right_imm(c,A,S,n)	      ppc_clrrdi((c), (A), (S), (n))
+
+#define ppc_divdx(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | ((OE) << 10) | (489 << 1) | (Rc))
+#define ppc_divd(c,D,A,B)   ppc_divdx(c,D,A,B,0,0)
+#define ppc_divdd(c,D,A,B)  ppc_divdx(c,D,A,B,0,1)
+#define ppc_divdo(c,D,A,B)  ppc_divdx(c,D,A,B,1,0)
+#define ppc_divdod(c,D,A,B) ppc_divdx(c,D,A,B,1,1)
+
+#define ppc_divdux(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | ((OE) << 10) | (457 << 1) | (Rc))
+#define ppc_divdu(c,D,A,B)   ppc_divdux(c,D,A,B,0,0)
+#define ppc_divdud(c,D,A,B)  ppc_divdux(c,D,A,B,0,1)
+#define ppc_divduo(c,D,A,B)  ppc_divdux(c,D,A,B,1,0)
+#define ppc_divduod(c,D,A,B) ppc_divdux(c,D,A,B,1,1)
+
+#define ppc_extswx(c,S,A,Rc) ppc_emit32(c, (31 << 26) | ((S) << 21) | ((A) << 16) | (0 << 11) | (986 << 1) | (Rc))
+#define ppc_extsw(c,A,S)  ppc_extswx(c,S,A,0)
+#define ppc_extswd(c,A,S) ppc_extswx(c,S,A,1)
+
+/* These move float to/from instructions are only available on POWER6 in
+   native mode.  These instructions are faster than the equivalent
+   store/load because they avoid the store queue and associated delays.
+   These instructions should only be used in 64-bit mode unless the
+   kernel preserves the 64-bit GPR on signals and dispatch in 32-bit
+   mode.  The Linux kernel does not.  */
+#define ppc_mftgpr(c,T,B) ppc_emit32(c, (31 << 26) | ((T) << 21) | (0 << 16) | ((B) << 11) | (735 << 1) | 0)
+#define ppc_mffgpr(c,T,B) ppc_emit32(c, (31 << 26) | ((T) << 21) | (0 << 16) | ((B) << 11) | (607 << 1) | 0)
+
+#define ppc_ld(c,D,ds,A) ppc_emit32(c, (58 << 26) | ((D) << 21) | ((A) << 16) | ((guint32)(ds) & 0xfffc) | 0)
+#define ppc_lwa(c,D,ds,A) ppc_emit32(c, (58 << 26) | ((D) << 21) | ((A) << 16) | ((ds) & 0xfffc) | 2)
+#define ppc_ldarx(c,D,A,B) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | (84 << 1) | 0)
+#define ppc_ldu(c,D,ds,A) ppc_emit32(c, (58 <<	26) | ((D) << 21) | ((A) << 16) | ((guint32)(ds) & 0xfffc) | 1)
+#define ppc_ldux(c,D,A,B) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | (53 << 1) | 0)
+#define ppc_lwaux(c,D,A,B) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | (373 << 1) | 0)
+#define ppc_ldx(c,D,A,B) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | (21 << 1) | 0)
+#define ppc_lwax(c,D,A,B) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | (341 << 1) | 0)
+
+#define ppc_mulhdx(c,D,A,B,Rc) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | (0 << 10) | (73 << 1) | (Rc))
+#define ppc_mulhd(c,D,A,B)  ppc_mulhdx(c,D,A,B,0)
+#define ppc_mulhdd(c,D,A,B) ppc_mulhdx(c,D,A,B,1)
+#define ppc_mulhdux(c,D,A,B,Rc) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | (0 << 10) | (9 << 1) | (Rc))
+#define ppc_mulhdu(c,D,A,B)  ppc_mulhdux(c,D,A,B,0)
+#define ppc_mulhdud(c,D,A,B) ppc_mulhdux(c,D,A,B,1)
+
+#define ppc_mulldx(c,D,A,B,OE,Rc) ppc_emit32(c, (31 << 26) | ((D) << 21) | ((A) << 16) | ((B) << 11) | ((OE) << 10) | (233 << 1) | (Rc))
+#define ppc_mulld(c,D,A,B)   ppc_mulldx(c,D,A,B,0,0)
+#define ppc_mulldd(c,D,A,B)  ppc_mulldx(c,D,A,B,0,1)
+#define ppc_mulldo(c,D,A,B)  ppc_mulldx(c,D,A,B,1,0)
+#define ppc_mulldod(c,D,A,B) ppc_mulldx(c,D,A,B,1,1)
+
+#define ppc_rldclx(c,A,S,B,MB,Rc) ppc_emit32(c, (30 << 26) | ((S) << 21) | ((A) << 16) | ((B) << 11) | (ppc_split_5_1(MB) << 5) | (8 << 1) | (Rc))
+#define ppc_rldcl(c,A,S,B,MB)  ppc_rldclx(c,A,S,B,MB,0)
+#define ppc_rldcld(c,A,S,B,MB) ppc_rldclx(c,A,S,B,MB,1)
+#define ppc_rotld(c,A,S,B) ppc_rldcl(c, A, S, B, 0)
+
+#define ppc_rldcrx(c,A,S,B,ME,Rc) ppc_emit32(c, (30 << 26) | ((S) << 21) | ((A) << 16) | ((B) << 11) | (ppc_split_5_1(ME) << 5) | (9 << 1) | (Rc))
+#define ppc_rldcr(c,A,S,B,ME)  ppc_rldcrx(c,A,S,B,ME,0)
+#define ppc_rldcrd(c,A,S,B,ME) ppc_rldcrx(c,A,S,B,ME,1)
+
+#define ppc_rldicx(c,S,A,SH,MB,Rc) ppc_emit32(c, (30 << 26) | ((S) << 21) | ((A) << 16) | (ppc_split_5_1_5(SH) << 11) | (ppc_split_5_1(MB) << 5) | (2 << 2) | (ppc_split_5_1_1(SH) << 1) | (Rc))
+#define ppc_rldic(c,A,S,SH,MB)  ppc_rldicx(c,S,A,SH,MB,0)
+#define ppc_rldicd(c,A,S,SH,MB) ppc_rldicx(c,S,A,SH,MB,1)
+
+#define ppc_rldiclx(c,S,A,SH,MB,Rc) ppc_emit32(c, (30 << 26) | ((S) << 21) | ((A) << 16) | (ppc_split_5_1_5(SH) << 11) | (ppc_split_5_1(MB) << 5) | (0 << 2) | (ppc_split_5_1_1(SH) << 1) | (Rc))
+#define ppc_rldicl(c,A,S,SH,MB)  ppc_rldiclx(c,S,A,SH,MB,0)
+#define ppc_rldicld(c,A,S,SH,MB) ppc_rldiclx(c,S,A,SH,MB,1)
+#define ppc_extrdi(c,A,S,n,b) ppc_rldicl(c,A,S, (b) + (n), 64 - (n))
+#define ppc_rotldi(c,A,S,n)   ppc_rldicl(c,A,S, n, 0)
+#define ppc_rotrdi(c,A,S,n)   ppc_rldicl(c,A,S, 64 - (n), 0)
+#define ppc_srdi(c,A,S,n)     ppc_rldicl(c,A,S, 64 - (n), n)
+#define ppc_clrldi(c,A,S,n)   ppc_rldicl(c,A,S, 0, n)
+
+#define ppc_rldicrx(c,A,S,SH,ME,Rc) ppc_emit32(c, (30 << 26) | ((S) << 21) | ((A) << 16) | (ppc_split_5_1_5(SH) << 11) | (ppc_split_5_1(ME) << 5) | (1 << 2) | (ppc_split_5_1_1(SH) << 1) | (Rc))
+#define ppc_rldicr(c,A,S,SH,ME)  ppc_rldicrx(c,A,S,SH,ME,0)
+#define ppc_rldicrd(c,A,S,SH,ME) ppc_rldicrx(c,A,S,SH,ME,1)
+#define ppc_extldi(c,A,S,n,b) ppc_rldicr(c, A, S, b, (n) - 1)
+#define ppc_sldi(c,A,S,n)     ppc_rldicr(c, A, S, n, 63 - (n))
+#define ppc_clrrdi(c,A,S,n)   ppc_rldicr(c, A, S, 0, 63 - (n))
+
+#define ppc_rldimix(c,S,A,SH,MB,Rc) ppc_emit32(c, (30 << 26) | ((S) << 21) | ((A) << 16) | (ppc_split_5_1_5(SH) << 11) | (ppc_split_5_1(MB) << 5) | (3 << 2) | (ppc_split_5_1_1(SH) << 1) | (Rc))
+#define ppc_rldimi(c,A,S,SH,MB)  ppc_rldimix(c,S,A,SH,MB,0)
+#define ppc_rldimid(c,A,S,SH,MB) ppc_rldimix(c,S,A,SH,MB,1)
+
+#define ppc_slbia(c)  ppc_emit32(c, (31 << 26) | (0 << 21) | (0 << 16) | (0 << 11) | (498 << 1) | 0)
+#define ppc_slbie(c,B) ppc_emit32(c, (31 << 26) | (0 << 21) | (0 << 16) | ((B) << 11) | (434 << 1) | 0)
+#define ppc_sldx(c,S,A,B,Rc) ppc_emit32(c, (31 << 26) | ((S) << 21) | ((A) << 16) | ((B) << 11) | (27 << 1) | (Rc))
+#define ppc_sld(c,A,S,B)  ppc_sldx(c,S,A,B,0)
+#define ppc_sldd(c,A,S,B) ppc_sldx(c,S,A,B,1)
+
+#define ppc_sradx(c,S,A,B,Rc) ppc_emit32(c, (31 << 26) | ((S) << 21) | ((A) << 16) | ((B) << 11) | (794 << 1) | (Rc))
+#define ppc_srad(c,A,S,B)  ppc_sradx(c,S,A,B,0)
+#define ppc_sradd(c,A,S,B) ppc_sradx(c,S,A,B,1)
+#define ppc_sradix(c,S,A,SH,Rc) ppc_emit32(c, (31 << 26) | ((S) << 21) | ((A) << 16) | (((SH) & 31) << 11) | (413 << 2) | (((SH) >> 5) << 1) | (Rc))
+#define ppc_sradi(c,A,S,SH)  ppc_sradix(c,S,A,SH,0)
+#define ppc_sradid(c,A,S,SH) ppc_sradix(c,S,A,SH,1)
+
+#define ppc_srdx(c,S,A,B,Rc) ppc_emit32(c, (31 << 26) | ((S) << 21) | ((A) << 16) | ((B) << 11) | (539 << 1) | (Rc))
+#define ppc_srd(c,A,S,B)  ppc_srdx(c,S,A,B,0)
+#define ppc_srdd(c,A,S,B) ppc_srdx(c,S,A,B,1)
+
+#define ppc_std(c,S,ds,A)   ppc_emit32(c, (62 << 26) | ((S) << 21) | ((A) << 16) | ((guint32)(ds) & 0xfffc) | 0)
+#define ppc_stdcxd(c,S,A,B) ppc_emit32(c, (31 << 26) | ((S) << 21) | ((A) << 16) | ((B) << 11) | (214 << 1) | 1)
+#define ppc_stdu(c,S,ds,A)  ppc_emit32(c, (62 << 26) | ((S) << 21) | ((A) << 16) | ((guint32)(ds) & 0xfffc) | 1)
+#define ppc_stdux(c,S,A,B)  ppc_emit32(c, (31 << 26) | ((S) << 21) | ((A) << 16) | ((B) << 11) | (181 << 1) | 0)
+#define ppc_stdx(c,S,A,B)   ppc_emit32(c, (31 << 26) | ((S) << 21) | ((A) << 16) | ((B) << 11) | (149 << 1) | 0)
+
+#else
+/* Always true for 32-bit */
+#define ppc_is_imm32(val) (1)
+#endif
+
+#endif
--- a/lib/ffts/src/arch/s390x/.gitignore
+++ b/lib/ffts/src/arch/s390x/.gitignore
@ -0,0 +1,6 @@
+/Makefile
+/Makefile.in
+/.libs
+/.deps
+/*.la
+/*.lo
--- a/lib/ffts/src/arch/s390x/ChangeLog
+++ b/lib/ffts/src/arch/s390x/ChangeLog
@ -0,0 +1,35 @@
+2010-03-23	Neale Ferguson <neale@sinenomine.net>
+	
+	* s390x-codegen.h: Remove duplicate
+
+2009-06-24	Neale Ferguson <neale@sinenomine.net>
+	
+	* s390x-codegen.h: Add some new instructions.
+
+2007-04-12	Neale Ferguson <neale@sinenomine.net>
+	
+	* tramp.c: Add MONO_TYPE_PTR case.
+
+2007-01-23	Neale Ferguson <neale@sinenomine.net>
+	
+	* s390x-codegen.h: Add packed attribute to several instruction structures.
+
+2006-03-13	Neale Ferguson <neale@sinenomine.net>
+
+	* s390x-codegen.h: Fix immediate checks.
+
+2006-01-06	Neale Ferguson <neale@sinenomine.net>
+
+	* s390x-codegen.h: Add lpdbr instruction (OP_ABS).
+
+2006-01-03	Neale Ferguson <neale@sinenomine.net>
+
+	* s390x-codegen.h: Add some new instructions.
+
+2004-12-15	Neale Ferguson <Neale.Ferguson@SoftwareAG-usa.com>
+
+	* s390x-codegen.h: Add some new instructions (CS, CSG, CSY, CDS, CDSG, CDSY)
+
+2004-08-03	Neale Ferguson <Neale.Ferguson@SoftwareAG-usa.com>
+
+	* s390x-codegen.h Makefile.am tramp.c: S/390 64-bit interpreter
--- a/lib/ffts/src/arch/s390x/Makefile.am
+++ b/lib/ffts/src/arch/s390x/Makefile.am
@ -0,0 +1,7 @@
+
+AM_CPPFLAGS = $(GLIB_CFLAGS) -I$(top_srcdir)
+
+noinst_LTLIBRARIES = libmonoarch-s390x.la
+
+libmonoarch_s390x_la_SOURCES = tramp.c s390x-codegen.h
+
--- a/lib/ffts/src/arch/s390x/s390x-codegen.h
+++ b/lib/ffts/src/arch/s390x/s390x-codegen.h
@ -0,0 +1,997 @@
+/*
+   Copyright (C)  2001 Radek Doulik
+*/
+
+#ifndef S390X_H
+#define S390X_H
+#include <glib.h>
+#include <assert.h>
+#include <limits.h>
+
+/* Limits that the ADD_*STACK_PARM macros in this header compare against
+ * fpr_param / reg_param to choose between their two branches. */
+#define FLOAT_REGS 	2	/* No. float registers for parms    */
+#define GENERAL_REGS 	5	/* No. general registers for parms  */
+
+#define ARG_BASE s390_r10	/* Register for addressing arguments*/
+/* NOTE: STKARG expands to an expression that uses a variable named i,
+ * which must be in scope at the point of use. */
+#define STKARG \
+	(i*(sizeof(stackval)))	/* Displacement of ith argument     */
+
+#define MINV_POS  	160	/* MonoInvocation stack offset      */
+/* NOTE: STACK_POS reads sig->param_count from the enclosing scope. */
+#define STACK_POS 	(MINV_POS - sizeof (stackval) * sig->param_count)
+#define OBJ_POS   	8
+/* Byte offset of the 'type' member inside stackval (via glib's
+ * G_STRUCT_OFFSET). */
+#define TYPE_OFFSET 	(G_STRUCT_OFFSET (stackval, type))
+
+#define MIN_CACHE_LINE 256
+
+/*------------------------------------------------------------------*/
+/* Sequence to add an int/long long to parameters to stack_from_data*/
+/*                                                                  */
+/* r - subtracted from GENERAL_REGS to form the register limit      */
+/* i - amount added to reg_param or stack_param after the emit      */
+/*                                                                  */
+/* Emits one s390_lay into s390_r4 relative to STK_BASE.  Expands   */
+/* to a bare if/else (no do/while wrapper) and uses p, reg_param,   */
+/* this_flag, local_start, stack_param and sz from the caller.      */
+/*------------------------------------------------------------------*/
+#define ADD_ISTACK_PARM(r, i) \
+	if (reg_param < GENERAL_REGS-(r)) { \
+		s390_lay (p, s390_r4, 0, STK_BASE, \
+		          local_start + (reg_param - this_flag) * sizeof(long)); \
+		reg_param += (i); \
+	} else { \
+		s390_lay (p, s390_r4, 0, STK_BASE, \
+			  sz.stack_size + MINV_POS + stack_param * sizeof(long)); \
+		stack_param += (i); \
+	}
+
+/*------------------------------------------------------------------*/
+/* Sequence to add a float/double to parameters to stack_from_data  */
+/*                                                                  */
+/* i - multiplier applied to sizeof(float) for the slot size; also  */
+/*     the amount added to stack_param in the stack branch          */
+/*                                                                  */
+/* While fpr_param < FLOAT_REGS the address is float_pos-relative   */
+/* and fpr_param is incremented; otherwise stack_param is first     */
+/* bumped by (stack_param % i) and a stack-relative address is      */
+/* used.  Expands to a bare if/else and uses p, fpr_param,          */
+/* float_pos, stack_param and sz from the caller.                   */
+/*------------------------------------------------------------------*/
+#define ADD_RSTACK_PARM(i) \
+	if (fpr_param < FLOAT_REGS) { \
+		s390_lay (p, s390_r4, 0, STK_BASE, \
+		          float_pos + (fpr_param * sizeof(float) * (i))); \
+		fpr_param++; \
+	} else { \
+		stack_param += (stack_param % (i)); \
+		s390_lay (p, s390_r4, 0, STK_BASE, \
+		          sz.stack_size + MINV_POS + stack_param * sizeof(float) * (i)); \
+		stack_param += (i); \
+	}
+
+/*------------------------------------------------------------------*/
+/* Sequence to add a structure ptr to parameters to stack_from_data */
+/*                                                                  */
+/* Unlike the int variant this emits s390_ly (not s390_lay) into    */
+/* s390_r4 in both branches and always advances the chosen counter  */
+/* by one.  Expands to a bare if/else and uses p, reg_param,        */
+/* this_flag, local_start, stack_param and sz from the caller.      */
+/*------------------------------------------------------------------*/
+#define ADD_TSTACK_PARM \
+	if (reg_param < GENERAL_REGS) { \
+		s390_ly (p, s390_r4, 0, STK_BASE, \
+			local_start + (reg_param - this_flag) * sizeof(long)); \
+		reg_param++; \
+	} else { \
+		s390_ly (p, s390_r4, 0, STK_BASE, \
+		 	 sz.stack_size + MINV_POS + stack_param * sizeof(long)); \
+		stack_param++; \
+	}
+
+/*------------------------------------------------------------------*/
+/* Mixed variant: the register branch matches ADD_ISTACK_PARM       */
+/* (s390_lay, reg_param += i) while the stack branch matches        */
+/* ADD_TSTACK_PARM (s390_ly, stack_param++).                        */
+/*                                                                  */
+/* r - subtracted from GENERAL_REGS to form the register limit      */
+/* i - amount added to reg_param in the register branch             */
+/*                                                                  */
+/* Expands to a bare if/else and uses p, reg_param, this_flag,      */
+/* local_start, stack_param and sz from the caller.                 */
+/*------------------------------------------------------------------*/
+#define ADD_PSTACK_PARM(r, i) \
+	if (reg_param < GENERAL_REGS-(r)) { \
+		s390_lay (p, s390_r4, 0, STK_BASE, \
+			  local_start + (reg_param - this_flag) * sizeof(long)); \
+		reg_param += (i); \
+	} else { \
+		s390_ly (p, s390_r4, 0, STK_BASE, \
+		 	 sz.stack_size + MINV_POS + stack_param * sizeof(long)); \
+		stack_param++; \
+	}
+
+/* Integer register numbers; s390_rN has the value N for N = 0..15. */
+typedef enum {
+	s390_r0  = 0,  s390_r1  = 1,  s390_r2  = 2,  s390_r3  = 3,
+	s390_r4  = 4,  s390_r5  = 5,  s390_r6  = 6,  s390_r7  = 7,
+	s390_r8  = 8,  s390_r9  = 9,  s390_r10 = 10, s390_r11 = 11,
+	s390_r12 = 12, s390_r13 = 13, s390_r14 = 14, s390_r15 = 15
+} S390IntRegister;
+
+/* Floating-point register numbers; s390_fN has the value N for N = 0..15. */
+typedef enum {
+	s390_f0  = 0,  s390_f1  = 1,  s390_f2  = 2,  s390_f3  = 3,
+	s390_f4  = 4,  s390_f5  = 5,  s390_f6  = 6,  s390_f7  = 7,
+	s390_f8  = 8,  s390_f9  = 9,  s390_f10 = 10, s390_f11 = 11,
+	s390_f12 = 12, s390_f13 = 13, s390_f14 = 14, s390_f15 = 15
+} S390FloatRegister;
+
+/* Register numbers for the s390_a* set; s390_aN has the value N for
+ * N = 0..15. */
+typedef enum {
+	s390_a0  = 0,  s390_a1  = 1,  s390_a2  = 2,  s390_a3  = 3,
+	s390_a4  = 4,  s390_a5  = 5,  s390_a6  = 6,  s390_a7  = 7,
+	s390_a8  = 8,  s390_a9  = 9,  s390_a10 = 10, s390_a11 = 11,
+	s390_a12 = 12, s390_a13 = 13, s390_a14 = 14, s390_a15 = 15
+} S390AccRegister;
+
+/* Special register id; 256 lies outside the 0..15 range used by the
+ * other register enums in this header. */
+typedef enum { s390_fpc = 256 } S390SpecialRegister;
+
+/* Range predicates for immediates and displacements.  Each macro converts
+ * its argument to glong and yields non-zero when the value lies inside the
+ * inclusive range shown.  The argument is parenthesised so an expression
+ * such as (a & b) or (x + y) is converted as a whole, not just its first
+ * operand.  The argument is evaluated twice.
+ *
+ *   s390_is_imm16   SHRT_MIN .. SHRT_MAX
+ *   s390_is_imm32   INT_MIN  .. INT_MAX
+ *   s390_is_uimm16  0 .. USHRT_MAX
+ *   s390_is_uimm32  0 .. UINT_MAX
+ *   s390_is_uimm20  0 .. 1048575
+ *   s390_is_imm20   -524288 .. 524287
+ *   s390_is_imm12   -4096 .. 4095
+ *   s390_is_uimm12  0 .. 4095
+ */
+#define s390_is_imm16(val) 		((glong)(val) >= (glong) SHRT_MIN && \
+					 (glong)(val) <= (glong) SHRT_MAX)
+#define s390_is_imm32(val) 		((glong)(val) >= (glong) INT_MIN && \
+					 (glong)(val) <= (glong) INT_MAX)
+#define s390_is_uimm16(val) 		((glong)(val) >= 0 && (glong)(val) <= (glong) USHRT_MAX)
+#define s390_is_uimm32(val) 		((glong)(val) >= 0 && (glong)(val) <= (glong) UINT_MAX)
+#define s390_is_uimm20(val) 		((glong)(val) >= 0 && (glong)(val) <= 1048575)
+#define s390_is_imm20(val) 		((glong)(val) >= -524288 && (glong)(val) <= 524287)
+#define s390_is_imm12(val)		((glong)(val) >= (glong)-4096 && \
+					 (glong)(val) <= (glong)4095)
+#define s390_is_uimm12(val)		((glong)(val) >= 0 && (glong)(val) <= 4095)
+
+/* Register aliases and fixed stack-frame numbers.  STK_BASE and S390_SP
+ * both name s390_r15. */
+#define STK_BASE			s390_r15
+#define S390_SP				s390_r15
+#define S390_FP				s390_r11
+#define S390_MINIMAL_STACK_SIZE		160
+#define S390_REG_SAVE_OFFSET 		48
+#define S390_PARM_SAVE_OFFSET 		16
+#define S390_RET_ADDR_OFFSET		112
+#define S390_FLOAT_SAVE_OFFSET 		128
+
+/* 4-bit masks passed as the mask operand of s390_brc / s390_brcl by the
+ * s390_j* helpers in this header.  Several values are 4-bit complements of
+ * another: NE/NZ (7) of ZR (8), GE/NM (11) of LT (4), LE (13) of GT (2),
+ * NO (14) of OV (1), NC (12) of CY (3).  UN (15) has all four bits set.
+ * The helpers also OR masks together, e.g. S390_CC_ZR|S390_CC_OV. */
+#define S390_CC_ZR			8
+#define S390_CC_NE			7
+#define S390_CC_NZ			7
+#define S390_CC_LT			4
+#define S390_CC_GT			2
+#define S390_CC_GE			11
+#define S390_CC_NM			11
+#define S390_CC_LE			13
+#define S390_CC_OV			1
+#define S390_CC_NO			14
+#define S390_CC_CY			3
+#define S390_CC_NC			12
+#define S390_CC_UN			15
+
+/* Data emitters: store `value`, converted to the named type, at `addr` and
+ * then advance `addr` by the size of that type.  `addr` must be a
+ * modifiable lvalue (it is the target of +=); it is expected to be a byte
+ * pointer so the advance equals the number of bytes written.  Both macro
+ * arguments are parenthesised so that expression arguments are cast and
+ * dereferenced as a whole. */
+#define s390_word(addr, value) do	 	\
+{						\
+	* (guint32 *) (addr) = (guint32) (value);	\
+	(addr) += sizeof(guint32);		\
+} while (0)
+
+#define s390_float(addr, value)	do 		\
+{						\
+	* (gfloat *) (addr) = (gfloat) (value);	\
+	(addr) += sizeof(gfloat);		\
+} while (0)
+
+#define s390_llong(addr, value)	do 		\
+{						\
+	* (guint64 *) (addr) = (guint64) (value);	\
+	(addr) += sizeof(guint64);		\
+} while (0)
+
+#define s390_double(addr, value) do 		\
+{						\
+	* (gdouble *) (addr) = (gdouble) (value);	\
+	(addr) += sizeof(gdouble);		\
+} while (0)
+
+/* Bit-field descriptions of instruction formats, one struct per format.
+ * The S390_* emitter macros visible in this header do not reference these
+ * types; they assemble encodings with shifts and masks instead.  Field
+ * names follow the operand names (op/op1/op2 opcode parts, rN registers,
+ * xN index, bN base, dN displacement, mN mask, iN immediate); xx/yy are
+ * filler fields.  How bit-fields are packed is up to the compiler. */
+typedef struct {
+	short 	op;
+} E_Format;
+
+typedef struct {
+	char	op;
+	int	im;
+} I_Format;
+
+typedef struct {
+	char 	op;
+	char	r1 : 4;
+	char 	r2 : 4;
+} RR_Format;
+
+typedef struct {
+	short	op;
+	char	xx;
+	char	r1 : 4;
+	char	r2 : 4;
+} RRE_Format;
+
+typedef struct {
+	short	op;
+	char	r1 : 4;
+	char	xx : 4;
+	char	r3 : 4;
+	char	r2 : 4;
+} RRF_Format_1;
+
+typedef struct {
+	short	op;
+	char	m3 : 4;
+	char	xx : 4;
+	char	r1 : 4;
+	char	r2 : 4;
+} RRF_Format_2;
+
+typedef struct {
+	short	op;
+	char	r3 : 4;
+	char	m4 : 4;
+	char	r1 : 4;
+	char	r2 : 4;
+} RRF_Format_3;
+
+typedef struct {
+	char	op;
+	char	r1 : 4;
+	char	x2 : 4;
+	char	b2 : 4;
+	short	d2 : 12;
+} RX_Format;
+
+typedef struct {
+	char 	op1;
+	char	r1 : 4;
+	char	x2 : 4;
+	char	b2 : 4;
+	int	d2 : 12;
+	char	xx;
+	char	op2;
+} RXE_Format;
+
+typedef struct {
+	char 	op1;
+	char	r3 : 4;
+	char	x2 : 4;
+	char	b2 : 4;
+	int	d2 : 12;
+	char	r1 : 4;
+	char	xx : 4;
+	char	op2;
+} RXF_Format;
+
+/* 20-bit displacement variant; declared with GCC's packed attribute. */
+typedef struct {
+	char 	op1;
+	char	r1 : 4;
+	char	x2 : 4;
+	char	b2 : 4;
+	int	d2 : 20;
+	char	op2;
+} __attribute__ ((packed)) RXY_Format;
+
+/* RS / RSY / RSL / RSI / RI / RIE / RIL format descriptions.  The numbered
+ * variants of one format differ only in which operand occupies a given
+ * 4-bit slot (register, mask or filler).  The RSY and RIL structs carry
+ * GCC's packed attribute. */
+typedef struct {
+	char 	op;
+	char	r1 : 4;
+	char	r3 : 4;
+	char	b2 : 4;
+	int	d2 : 12;
+} RS_Format_1;
+
+typedef struct {
+	char 	op;
+	char	r1 : 4;
+	char	m3 : 4;
+	char	b2 : 4;
+	int	d2 : 12;
+} RS_Format_2;
+
+typedef struct {
+	char 	op;
+	char	r1 : 4;
+	char	xx : 4;
+	char	b2 : 4;
+	int	d2 : 12;
+} RS_Format_3;
+
+typedef struct {
+	char 	op1;
+	char	r1 : 4;
+	char	r3 : 4;
+	char	b2 : 4;
+	int	d2 : 20;
+	char 	op2;
+} __attribute__ ((packed)) RSY_Format_1;
+
+typedef struct {
+	char 	op1;
+	char	r1 : 4;
+	char	m3 : 4;
+	char	b2 : 4;
+	int	d2 : 20;
+	char 	op2;
+} __attribute__ ((packed)) RSY_Format_2;
+
+typedef struct {
+	char 	op1;
+	char	l1 : 4;
+	char	xx : 4;
+	char	b1 : 4;
+	int 	d1 : 12;
+	char	yy;
+	char 	op2;
+} RSL_Format;
+
+typedef struct {
+	char 	op;
+	char	r1 : 4;
+	char	r3 : 4;
+	short	i2;
+} RSI_Format;
+
+typedef struct {
+	char 	op1;
+	char	m1 : 4;
+	char	op2 : 4;
+	short	i2;
+} RI_Format;
+
+typedef struct {
+	char 	op1;
+	char	r1 : 4;
+	char	r3 : 4;
+	short	i2;
+	char	xx;
+	char	op2;
+} RIE_Format_1;
+
+typedef struct {
+	char 	op1;
+	char	r1 : 4;
+	char	r3 : 4;
+	short	i2;
+	char	m2 : 4;
+	char    xx : 4;
+	char	op2;
+} RIE_Format_2;
+
+typedef struct {
+	char 	op1;
+	char	r1 : 4;
+	char	r3 : 4;
+	short	d;
+	char	i;
+	char	op2;
+} RIE_Format_3;
+
+typedef struct {
+	char 	op1;
+	char	r1 : 4;
+	char	yy : 4;
+	short	i2;
+	char	m3 : 4;
+	char	xx : 4;
+	char	op2;
+} RIE_Format_4;
+
+typedef struct {
+	char 	op1;
+	char	r1 : 4;
+	char	op2 : 4;
+	int	i2;
+} __attribute__ ((packed)) RIL_Format_1;
+
+typedef struct {
+	char 	op1;
+	char	m1 : 4;
+	char	op2 : 4;
+	int	i2;
+} __attribute__ ((packed)) RIL_Format_2;
+
+/* SI / SIY / S / SS / SSE / SSF format descriptions (storage-operand
+ * formats: each bN/dN pair is a base field plus a displacement field).
+ * SIY, SSE and SSF carry GCC's packed attribute. */
+typedef struct {
+	char	op;
+	char	i2;
+	char	b1 : 4;
+	short	d1 : 12;
+} SI_Format;
+
+typedef struct {
+	char	op1;
+	char	i2;
+	char	b1 : 4;
+	int	d1 : 20;
+	char	op2;
+} __attribute__ ((packed)) SIY_Format;
+
+typedef struct {
+	short	op;
+	char	b2 : 4;
+	short	d2 : 12;
+} S_Format;
+
+typedef struct {
+	char	op;
+	char	ll;
+	char	b1 : 4;
+	short	d1 : 12;
+	char	b2 : 4;
+	short	d2 : 12;
+} SS_Format_1;
+
+typedef struct {
+	char	op;
+	char	l1 : 4;
+	char	l2 : 4;
+	char	b1 : 4;	
+	short	d1 : 12;
+	char	b2 : 4;
+	short	d2 : 12;
+} SS_Format_2;
+
+typedef struct {
+	char	op;
+	char	r1 : 4;
+	char	r3 : 4;
+	char	b1 : 4;	
+	short	d1 : 12;
+	char	b2 : 4;
+	short	d2 : 12;
+} SS_Format_3;	
+
+typedef struct {
+	char	op;
+	char	r1 : 4;
+	char	r3 : 4;
+	char	b2 : 4;	
+	short	d2 : 12;
+	char	b4 : 4;
+	short	d4 : 12;
+} SS_Format_4;	
+
+typedef struct {
+	short	op;
+	short	tb1 : 4;
+	short	d1 : 12;
+	short	b2 : 4;
+	short	d2 : 12;
+} __attribute__ ((packed)) SSE_Format;
+
+typedef struct {
+	short	op;
+	char	r3 : 4;
+	char	o2 : 4;
+	short	b1 : 4;
+	short	d1 : 12;
+	short	b2 : 4;
+	short	d2 : 12;
+} __attribute__ ((packed)) SSF_Format;
+
+/* Raw code emitters: store `x`, truncated to 16 or 32 bits, at the code
+ * pointer `c` and advance `c` by the size of the stored unit.  `c` must be
+ * a modifiable lvalue (it is the target of +=) and is expected to be a
+ * byte pointer.  Both arguments are parenthesised so that an unbracketed
+ * expression argument (several S390_* format macros pass `a | b`) is cast
+ * as a whole rather than only its first operand. */
+#define s390_emit16(c, x) do 			\
+{						\
+	*((guint16 *) (c)) = (guint16) (x);	\
+	(c) += sizeof(guint16);			\
+} while(0)
+
+#define s390_emit32(c, x) do 			\
+{						\
+	*((guint32 *) (c)) = (guint32) (x);	\
+	(c) += sizeof(guint32);			\
+} while(0)
+
+/* Instruction-format encoders.  `c` is the code pointer advanced by
+ * s390_emit16/s390_emit32 and `opc` the opcode constant.  Judging from
+ * the s390_* instruction macros that call them, g* are register operands,
+ * k* masks, n2 an index register, s* a base register, p* a displacement
+ * and m2/imm an immediate.  Two-byte and four-byte formats are emitted in
+ * one store; six-byte formats as a halfword followed by a word. */
+#define S390_E(c,opc) 			s390_emit16(c,opc)
+
+#define S390_I(c,opc,imm) 		s390_emit16(c, (opc << 8 | imm))
+
+#define S390_RR(c,opc,g1,g2)		s390_emit16(c, (opc << 8 | (g1) << 4 | g2))
+
+#define S390_RRE(c,opc,g1,g2)		s390_emit32(c, (opc << 16 | (g1) << 4 | g2)) 
+
+/* The three RRF variants differ in what sits in bits 12..15 and 8..11:
+ * _1 puts g1 at bit 12 and g3 at bit 4; _2 puts mask k3 at bit 12 and g1
+ * at bit 4; _3 puts g3 at bit 12, mask k4 at bit 8 and g1 at bit 4. */
+#define S390_RRF_1(c,opc,g1,g2,g3)	s390_emit32(c, (opc << 16 | (g1) << 12 | (g3) << 4 | g2))
+
+#define S390_RRF_2(c,opc,g1,k3,g2)	s390_emit32(c, (opc << 16 | (k3) << 12 | (g1) << 4 | g2))
+
+#define S390_RRF_3(c,opc,g1,g2,k4,g3)	s390_emit32(c, (opc << 16 | (g3) << 12 | (k4) << 8 | (g1) << 4 | g2))
+
+/* The displacement is masked to 12 bits. */
+#define S390_RX(c,opc,g1,n2,s2,p2)	s390_emit32(c, (opc << 24 | (g1) << 20 | (n2) << 16 | (s2) << 12 | ((p2) & 0xfff)))
+
+/* 16-bit opcode split across the instruction: high byte in the first
+ * halfword, low byte in the last byte. */
+#define S390_RXE(c,opc,g1,n2,s2,p2) do  			\
+{								\
+	s390_emit16(c, ((opc & 0xff00) | (g1) << 4 | n2));	\
+	s390_emit32(c, ((s2) << 28 | (((p2) & 0xfff) << 16) | 	\
+			(opc & 0xff)));				\
+} while (0)
+
+/* Like RXE but with a 20-bit displacement: the low 12 bits go to bit 16
+ * and bits 12..19 of p2 go to bit 8. */
+#define S390_RXY(c,opc,g1,n2,s2,p2) do				\
+{								\
+	s390_emit16(c, ((opc & 0xff00) | (g1) << 4 | n2));	\
+	s390_emit32(c, ((s2) << 28 | (((p2) & 0xfff) << 16) | 	\
+			((((p2) & 0xff000) >> 12) << 8) |	\
+			(opc & 0xff)));				\
+} while (0)
+
+#define S390_RS_1(c,opc,g1,g3,s2,p2) 	s390_emit32(c, (opc << 24 | (g1) << 20 | (g3) << 16 | (s2) << 12 | ((p2) & 0xfff))) 
+
+#define S390_RS_2(c,opc,g1,k3,s2,p2)	s390_emit32(c, (opc << 24 | (g1) << 20 | (k3) << 16 | (s2) << 12 | ((p2) & 0xfff)))
+
+#define S390_RS_3(c,opc,g1,s2,p2)	s390_emit32(c, (opc << 24 | (g1) << 20 | (s2) << 12 | ((p2) & 0xfff)))
+
+/* RSY_1 and RSY_2 emit the same bit layout as RXY; only the name of the
+ * second 4-bit operand differs (register g3 vs mask k3). */
+#define S390_RSY_1(c,opc,g1,g3,s2,p2) do			\
+{								\
+	s390_emit16(c, ((opc & 0xff00) | (g1) << 4 | g3));	\
+	s390_emit32(c, ((s2) << 28 | (((p2) & 0xfff) << 16) | 	\
+			((((p2) & 0xff000) >> 12) << 8) |	\
+			(opc & 0xff)));				\
+} while (0)
+
+#define S390_RSY_2(c,opc,g1,k3,s2,p2) do			\
+{								\
+	s390_emit16(c, ((opc & 0xff00) | (g1) << 4 | k3));	\
+	s390_emit32(c, ((s2) << 28 | (((p2) & 0xfff) << 16) | 	\
+			((((p2) & 0xff000) >> 12) << 8) |	\
+			(opc & 0xff)));				\
+} while (0)
+
+/* RSL format: opcode high byte and length `ln` in the first halfword, then
+ * base `s1` at bit 28, the 12-bit displacement `p1` at bit 16 and the
+ * opcode low byte.  The displacement field is taken from p1; the previous
+ * text masked s1 a second time and never used p1. */
+#define S390_RSL(c,opc,ln,s1,p1) do 				\
+{								\
+	s390_emit16(c, ((opc & 0xff00) | (ln) << 4));		\
+	s390_emit32(c, ((s1) << 28 | (((p1) & 0xfff) << 16) | 	\
+			(opc & 0xff)));				\
+} while (0)
+
+/* RSI: 8-bit opcode, two 4-bit operands and a 16-bit immediate in one
+ * word; m2 is masked to 16 bits. */
+#define S390_RSI(c,opc,g1,g3,m2) 	s390_emit32(c, (opc << 24 | (g1) << 20 | (g3) << 16 | (m2 & 0xffff)))
+
+/* RI: opc is a 12-bit constant; its upper 8 bits go to bit 24 and its low
+ * nibble to bit 16, with g1 in between. */
+#define S390_RI(c,opc,g1,m2)		s390_emit32(c, ((opc >> 4) << 24 | (g1) << 20 | (opc & 0x0f) << 16 | (m2 & 0xffff)))
+
+/* RIE_1: six bytes; the immediate m2 is shifted to bit 16 of the trailing
+ * word without masking. */
+#define S390_RIE_1(c,opc,g1,g3,m2) do				\
+{								\
+	s390_emit16(c, ((opc & 0xff00) | (g1) << 4 | g3));	\
+	s390_emit32(c, ((m2) << 16 | (opc & 0xff)));		\
+} while (0)
+
+/* RIE_2: two registers (g1, g2), a 16-bit value v and mask m3.
+ * Emits [opcode-hi | g1 | g2] [v] [m3 << 12 | opcode-lo].  The body now
+ * uses the macro's own parameters g2 and m3; it previously named g3 and
+ * m2, which are not parameters of this macro. */
+#define S390_RIE_2(c,opc,g1,g2,m3,v) do				\
+{								\
+	s390_emit16(c, ((opc & 0xff00) | (g1) << 4 | (g2)));	\
+	s390_emit16(c, (v));					\
+	s390_emit16(c, ((m3) << 12 | (opc & 0xff)));		\
+} while (0)
+
+/* RIE_3: register g1, mask m3, a 16-bit value d and an immediate i placed
+ * in the upper byte of the last halfword. */
+#define S390_RIE_3(c,opc,g1,i,m3,d) do				\
+{								\
+	s390_emit16(c, ((opc & 0xff00) | (g1) << 4 | m3));	\
+	s390_emit16(c, (d));					\
+	s390_emit16(c, ((i) << 8 | (opc & 0xff)));		\
+} while (0)
+
+/* RIE_4: register g1, 16-bit immediate i2 and mask m3.  The first
+ * s390_emit16 call was missing a closing parenthesis, so the macro could
+ * not be expanded; it is balanced now. */
+#define S390_RIE_4(c,opc,g1,i2,m3) do				\
+{								\
+	s390_emit16(c, ((opc & 0xff00) | (g1) << 4));		\
+	s390_emit16(c, (i2));					\
+	s390_emit16(c, ((m3) << 12 | (opc & 0xff)));		\
+} while (0)
+
+/* RIL formats: opc is a 12-bit constant.  Its upper 8 bits form the first
+ * byte, the 4-bit operand (register g1 or mask k1) follows, then the low
+ * opcode nibble; the 32-bit immediate m2 is emitted as the next word.
+ * _1 and _2 emit the same layout and differ only in the operand name. */
+#define S390_RIL_1(c,opc,g1,m2) do					\
+{									\
+	s390_emit16(c, ((opc >> 4) << 8 | (g1) << 4 | (opc & 0xf)));	\
+	s390_emit32(c, m2);						\
+} while (0)
+
+#define S390_RIL_2(c,opc,k1,m2) do					\
+{									\
+	s390_emit16(c, ((opc >> 4) << 8 | (k1) << 4 | (opc & 0xf)));	\
+	s390_emit32(c, m2);						\
+} while (0)
+
+#define S390_RIS(c,opc,r,i,m3,b,d) do				\
+{								\
+	s390_emit16(c, ((opc, & 0xff00) | (r1) << 4) | (r2));	\
+	s390_emit16(c, ((b) << 12) | (d));			\
+	s390_emit16(c, ((i) << 4) | ((opc) & 0xff));		\
+}
+
+#define S390_RRS(c,opc,r1,r2,m3,b,d) do				\
+{								\
+	s390_emit16(c, ((opc, & 0xff00) | (r1) << 4) | (r2));	\
+	s390_emit16(c, ((b) << 12) | (d));			\
+	s390_emit16(c, ((m3) << 12) | ((opc) & 0xff));		\
+}
+
+/* SI: 8-bit opcode, 8-bit immediate m2, base s1 and 12-bit displacement
+ * p1 in one word.  The immediate is masked to 8 bits so a negative value
+ * cannot spill into the opcode byte, and the stray ';' that used to end
+ * the macro body is gone so the macro can be used before an "else". */
+#define S390_SI(c,opc,s1,p1,m2)		s390_emit32(c, (opc << 24 | ((m2) & 0xff) << 16 | (s1) << 12 | ((p1) & 0xfff)))
+
+/* SIY: like SI but with a 16-bit split opcode and a 20-bit displacement.
+ * Layout follows S390_RXY: base at bit 28, low 12 displacement bits at
+ * bit 16, displacement bits 12..19 at bit 8, opcode low byte last.  The
+ * previous text referenced p2 (not a parameter), placed the base at bit
+ * 24 and did not split the displacement. */
+#define S390_SIY(c,opc,s1,p1,m2) do				\
+{								\
+	s390_emit16(c, ((opc & 0xff00) | ((m2) & 0xff)));	\
+	s390_emit32(c, ((s1) << 28 | (((p1) & 0xfff) << 16) | 	\
+			((((p1) & 0xff000) >> 12) << 8) |	\
+			(opc & 0xff)));				\
+} while (0)
+
+/* S: 16-bit opcode, base s2 and 12-bit displacement p2 in one word. */
+#define S390_S(c,opc,s2,p2)	s390_emit32(c, (opc << 16 | (s2) << 12 | ((p2) & 0xfff)))
+
+/* SS_1: one 8-bit length field.  The macro stores ln-1, masked to 8 bits,
+ * followed by the first base/displacement pair; the second pair is
+ * emitted as the trailing halfword. */
+#define S390_SS_1(c,opc,ln,s1,p1,s2,p2) do			\
+{								\
+	s390_emit32(c, (opc << 24 | ((ln-1) & 0xff) << 16 |	\
+			(s1) << 12 | ((p1) & 0xfff)));		\
+	s390_emit16(c, ((s2) << 12 | ((p2) & 0xfff)));		\
+} while (0)
+
+/* SS_2 / SS_3 / SS_4: six-byte formats whose second byte holds two 4-bit
+ * fields (two lengths, or two registers), followed by two
+ * base/displacement pairs.  In the leading word the opcode sits at bit 24,
+ * the two 4-bit fields at bits 20 and 16, the first base at bit 12 and its
+ * displacement in the low 12 bits.  The previous text shifted the 4-bit
+ * fields by 16 and 12, which ORed the second one into the base field and
+ * left bits 20..23 empty. */
+#define S390_SS_2(c,opc,n1,n2,s1,p1,s2,p2) do			\
+{								\
+	s390_emit32(c, (opc << 24 | (n1) << 20 | (n2) << 16 |	\
+			(s1) << 12 | ((p1) & 0xfff)));		\
+	s390_emit16(c, ((s2) << 12 | ((p2) & 0xfff)));		\
+} while (0)
+
+#define S390_SS_3(c,opc,g1,g3,s1,p1,s2,p2) do			\
+{								\
+	s390_emit32(c, (opc << 24 | (g1) << 20 | (g3) << 16 |	\
+			(s1) << 12 | ((p1) & 0xfff)));		\
+	s390_emit16(c, ((s2) << 12 | ((p2) & 0xfff)));		\
+} while (0)
+
+#define S390_SS_4(c,opc,g1,g3,s2,p2,s4,p4) do			\
+{								\
+	s390_emit32(c, (opc << 24 | (g1) << 20 | (g3) << 16 |	\
+			(s2) << 12 | ((p2) & 0xfff)));		\
+	s390_emit16(c, ((s4) << 12 | ((p4) & 0xfff)));		\
+} while (0)
+
+/* SSE: a 16-bit opcode halfword followed by two halfwords, each holding a
+ * base in the top nibble and a displacement masked to 12 bits. */
+#define S390_SSE(c,opc,s1,p1,s2,p2) do			\
+{							\
+	s390_emit16(c, opc);				\
+	s390_emit16(c, ((s1) << 12 | ((p1) & 0xfff)));	\
+	s390_emit16(c, ((s2) << 12 | ((p2) & 0xfff)));	\
+} while (0)
+
+/* SSF: opc is a 12-bit constant (e.g. 0xc82), split the same way as in
+ * S390_RIL_1: upper 8 bits in the first byte, register r3 next, low opcode
+ * nibble last.  Two base/displacement halfwords follow.  The previous
+ * text computed ((opc) & 0xff00) << 8, which is zero for any 12-bit
+ * opcode once truncated to 16 bits, so the opcode byte was lost. */
+#define S390_SSF(c,opc,r3,s1,p1,s2,p2) do				\
+{									\
+	s390_emit16(c, ((((opc) >> 4) << 8) | ((r3) << 4) | 		\
+			((opc) & 0xf)));				\
+	s390_emit16(c, ((s1) << 12 | ((p1) & 0xfff)));			\
+	s390_emit16(c, ((s2) << 12 | ((p2) & 0xfff)));			\
+} while (0)
+
+/* Add-family instruction macros.  Each one forwards its operands and a
+ * fixed opcode to the format encoder named on the right.
+ *
+ * Corrections in this group (the previous forms could not be expanded):
+ *   - s390_afi: dropped the trailing ';'.
+ *   - s390_aghik, s390_ahik, s390_alghsik, s390_alhsik: S390_RIE_1 needs
+ *     two register operands, so the macros now take (c, r1, r3, v).
+ *   - s390_alghsik: opcode corrected to 0xecdb; 0xecd8 is already
+ *     s390_ahik's opcode.
+ *   - s390_agsi, s390_algsi, s390_alsi, s390_asi: S390_SIY needs a base,
+ *     a displacement and an immediate, so the macros now take
+ *     (c, b, d, v) like s390_niy; s390_agsi also lacked a comma.
+ *   - s390_ahy: the index operand x is now forwarded.
+ *   - s390_alrk: uses S390_RRF_1 with three registers, like s390_ark
+ *     (S390_RRF is not defined in this header).
+ *
+ * NOTE(review): s390_afgr looks like a transposition of "agfr"; the name
+ * is kept as is.
+ * NOTE(review): S390_RRF_1 places its first register operand at bit 12
+ * and its third at bit 4; confirm this matches the operand order intended
+ * for the three-register forms (agrk, ark, alrk, ahhhr, ...). */
+#define s390_a(c, r, x, b, d)		S390_RX(c, 0x5a, r, x, b, d)
+#define s390_adb(c, r, x, b, d)		S390_RXE(c, 0xed1a, r, x, b, d)
+#define s390_adbr(c, r1, r2)		S390_RRE(c, 0xb31a, r1, r2)
+#define s390_aebr(c, r1, r2)		S390_RRE(c, 0xb30a, r1, r2)
+#define s390_afi(c, r, v)		S390_RIL_1(c, 0xc29, r, v)
+#define s390_ag(c, r, x, b, d)		S390_RXY(c, 0xe308, r, x, b, d)
+#define s390_agf(c, r, x, b, d)		S390_RXY(c, 0xe318, r, x, b, d)
+#define s390_agfi(c, r, v)		S390_RIL_1(c, 0xc28, r, v)
+#define s390_afgr(c, r1, r2)		S390_RRE(c, 0xb918, r1, r2)
+#define s390_aghi(c, r, v)		S390_RI(c, 0xa7b, r, v)
+#define s390_aghik(c, r1, r3, v)	S390_RIE_1(c, 0xecd9, r1, r3, v)
+#define s390_agr(c, r1, r2)		S390_RRE(c, 0xb908, r1, r2)
+#define s390_agrk(c, r1, r2, r3)	S390_RRF_1(c, 0xb9e8, r1, r2, r3)
+#define s390_agsi(c, b, d, v)		S390_SIY(c, 0xeb7a, b, d, v)
+#define s390_ahhhr(c, r1, r2, r3)	S390_RRF_1(c, 0xb9c8, r1, r2, r3)
+#define s390_ahhlr(c, r1, r2, r3)	S390_RRF_1(c, 0xb9d8, r1, r2, r3)
+#define s390_ahi(c, r, v)		S390_RI(c, 0xa7a, r, v)
+#define s390_ahik(c, r1, r3, v)		S390_RIE_1(c, 0xecd8, r1, r3, v)
+#define s390_ahy(c, r, x, b, d)		S390_RXY(c, 0xe37a, r, x, b, d)
+#define s390_aih(c, r, v)		S390_RIL_1(c, 0xcc8, r, v)
+#define s390_al(c, r, x, b, d)		S390_RX(c, 0x5e, r, x, b, d)
+#define s390_alc(c, r, x, b, d)		S390_RXY(c, 0xe398, r, x, b, d)
+#define s390_alcg(c, r, x, b, d)	S390_RXY(c, 0xe388, r, x, b, d)
+#define s390_alcgr(c, r1, r2)		S390_RRE(c, 0xb988, r1, r2)
+#define s390_alcr(c, r1, r2)		S390_RRE(c, 0xb998, r1, r2)
+#define s390_alfi(c, r, v)		S390_RIL_1(c, 0xc2b, r, v)
+#define s390_alg(c, r, x, b, d)		S390_RXY(c, 0xe30a, r, x, b, d)
+#define s390_algf(c, r, x, b, d)	S390_RXY(c, 0xe31a, r, x, b, d)
+#define s390_algfi(c, r, v)		S390_RIL_1(c, 0xc2a, r, v)
+#define s390_algfr(c, r1, r2)		S390_RRE(c, 0xb91a, r1, r2)
+#define s390_alghsik(c, r1, r3, v)	S390_RIE_1(c, 0xecdb, r1, r3, v)
+#define s390_algr(c, r1, r2)		S390_RRE(c, 0xb90a, r1, r2)
+#define s390_algsi(c, b, d, v)		S390_SIY(c, 0xeb7e, b, d, v)
+#define s390_alhhhr(c, r1, r2, r3)	S390_RRF_1(c, 0xb9ca, r1, r2, r3)
+#define s390_alhhlr(c, r1, r2, r3)	S390_RRF_1(c, 0xb9da, r1, r2, r3)
+#define s390_alhsik(c, r1, r3, v)	S390_RIE_1(c, 0xecda, r1, r3, v)
+#define s390_alr(c, r1, r2)		S390_RR(c, 0x1e, r1, r2)
+#define s390_alrk(c, r1, r2, r3)	S390_RRF_1(c, 0xb9fa, r1, r2, r3)
+#define s390_alsi(c, b, d, v)		S390_SIY(c, 0xeb6e, b, d, v)
+#define s390_alsih(c, r, v)		S390_RIL_1(c, 0xcca, r, v)
+#define s390_alsihn(c, r, v)		S390_RIL_1(c, 0xccb, r, v)
+#define s390_aly(c, r, x, b, d)		S390_RXY(c, 0xe35e, r, x, b, d)
+#define s390_ar(c, r1, r2)		S390_RR(c, 0x1a, r1, r2)
+#define s390_ark(c, r1, r2, r3)		S390_RRF_1(c, 0xb9f8, r1, r2, r3)
+#define s390_asi(c, b, d, v)		S390_SIY(c, 0xeb6a, b, d, v)
+#define s390_ay(c, r, x, b, d)		S390_RXY(c, 0xe35a, r, x, b, d)
+/* Branch instruction macros.  s390_bnzr, s390_br, s390_bzr and (further
+ * down) s390_nop all use opcode 0x07 with a literal 4-bit mask as the
+ * first operand: 0x07, 0xf, 0x08 and 0x0 respectively.  s390_break emits
+ * an all-zero halfword. */
+#define s390_basr(c, r1, r2)		S390_RR(c, 0x0d, r1, r2)
+#define s390_bctr(c, r1, r2)		S390_RR(c, 0x06, r1, r2)
+#define s390_bctrg(c, r1, r2)		S390_RRE(c, 0xb946, r1, r2)
+#define s390_bnzr(c, r)			S390_RR(c, 0x07, 0x07, r)
+#define s390_bras(c, r, o)		S390_RI(c, 0xa75, r, o)
+#define s390_brasl(c, r, o)		S390_RIL_1(c, 0xc05, r, o)
+#define s390_brc(c, m, d)		S390_RI(c, 0xa74, m, d)
+#define s390_brcl(c, m, d)		S390_RIL_2(c, 0xc04, m, d)
+#define s390_br(c, r)			S390_RR(c, 0x07, 0xf, r)
+#define s390_break(c)			S390_RR(c, 0, 0, 0)
+#define s390_bzr(c, r)			S390_RR(c, 0x07, 0x08, r)
+/* Compare / convert / compare-and-swap instruction macros.
+ *
+ * Corrections in this group:
+ *   - s390_cgit, s390_cit: added the missing comma between i and m and
+ *     dropped the trailing ';'.
+ *   - s390_clgij, s390_clij: the bodies used m and d, which were not
+ *     parameters; the parameter lists now match s390_cgij / s390_cij.
+ *   - s390_crt, s390_cgrt: S390_RRF_2 expects (register, mask, register);
+ *     the mask is now passed in the middle position, as s390_cfdbr does,
+ *     and the trailing ';' is gone.  The signature (c, r1, r2, m3) is
+ *     unchanged.
+ *   - s390_csst: S390_SSF expects (r3, base1, disp1, base2, disp2); the
+ *     register is now passed first.  The signature is unchanged.
+ */
+#define s390_c(c, r, x, b, d)		S390_RX(c, 0x59, r, x, b, d)
+#define s390_cdb(c, r, x, b, d)		S390_RXE(c, 0xed19, r, x, b, d)
+#define s390_cdbr(c, r1, r2)		S390_RRE(c, 0xb319, r1, r2)
+#define s390_cdfbr(c, r1, r2)		S390_RRE(c, 0xb395, r1, r2)
+#define s390_cdgbr(c, r1, r2)		S390_RRE(c, 0xb3a5, r1, r2)
+#define s390_cds(c, r1, r2, b, d)	S390_RX(c, 0xbb, r1, r2, b, d)
+#define s390_cdsg(c, r1, r2, b, d)	S390_RSY_1(c, 0xeb3e, r1, r2, b, d)
+#define s390_cdsy(c, r1, r2, b, d)	S390_RSY_1(c, 0xeb31, r1, r2, b, d)
+#define s390_cebr(c, r1, r2)		S390_RRE(c, 0xb309, r1, r2)
+#define s390_cegbr(c, r1, r2)		S390_RRE(c, 0xb3a4, r1, r2)
+#define s390_cfdbr(c, r1, m, r2)	S390_RRF_2(c, 0xb399, r1, m, r2)
+#define s390_cfi(c, r, v)		S390_RIL_1(c, 0xc2d, r, v)
+#define s390_cgdbr(c, r1, m, r2)	S390_RRF_2(c, 0xb3a9, r1, m, r2)
+#define s390_cg(c, r, x, b, d)		S390_RXY(c, 0xe320, r, x, b, d)
+#define s390_cgfi(c, r, v)		S390_RIL_1(c, 0xc2c, r, v)
+#define s390_cgfrl(c, r, v)		S390_RIL_1(c, 0xc6c, r, v)
+#define s390_cghi(c, r, i)		S390_RI(c, 0xa7f, r, i)
+#define s390_cgib(c, r, i, m, b, d)	S390_RIS(c, 0xecfc, r, i, m, b, d)
+#define s390_cgij(c, r, i, m, d)	S390_RIE_3(c, 0xec7c, r, i, m, d)
+#define s390_cgit(c, r, i, m)		S390_RIE_4(c, 0xec70, r, i, m)
+#define s390_cgr(c, r1, r2)		S390_RRE(c, 0xb920, r1, r2)
+#define s390_cgrb(c, r1, r2, m3, b, d)	S390_RRS(c, 0xece4, r1, r2, m3, b, d)
+#define s390_cgrj(c, r1, r2, m3, v)	S390_RIE_2(c, 0xec64, r1, r2, m3, v)
+#define s390_cgrl(c, r, v)		S390_RIL_1(c, 0xc68, r, v)
+#define s390_chi(c, r, i)		S390_RI(c, 0xa7e, r, i)
+#define s390_cib(c, r, i, m, b, d)	S390_RIS(c, 0xecfe, r, i, m, b, d)
+#define s390_cij(c, r, i, m, d)		S390_RIE_3(c, 0xec7e, r, i, m, d)
+#define s390_cit(c, r, i, m)		S390_RIE_4(c, 0xec72, r, i, m)
+#define s390_cl(c, r, x, b, d)		S390_RX(c, 0x55, r, x, b, d)
+#define s390_clg(c, r, x, b, d)		S390_RXY(c, 0xe321, r, x, b, d)
+#define s390_clgib(c, r, i, m, b, d)	S390_RIS(c, 0xecfd, r, i, m, b, d)
+#define s390_clgij(c, r, i, m, d)	S390_RIE_3(c, 0xec7d, r, i, m, d)
+#define s390_clgr(c, r1, r2)		S390_RRE(c, 0xb921, r1, r2)
+#define s390_clgrj(c, r1, r2, m, v)	S390_RIE_2(c, 0xec65, r1, r2, m, v)
+#define s390_clgrb(c, r1, r2, m3, b, d)	S390_RRS(c, 0xece5, r1, r2, m3, b, d)
+#define s390_clib(c, r, i, m, b, d)	S390_RIS(c, 0xecff, r, i, m, b, d)
+#define s390_clij(c, r, i, m, d)	S390_RIE_3(c, 0xec7f, r, i, m, d)
+#define s390_clr(c, r1, r2)		S390_RR(c, 0x15, r1, r2)
+#define s390_clrb(c, r1, r2, m3, b, d)	S390_RRS(c, 0xecf7, r1, r2, m3, b, d)
+#define s390_clrj(c, r1, r2, m, v)	S390_RIE_2(c, 0xec77, r1, r2, m, v)
+#define s390_cr(c, r1, r2)		S390_RR(c, 0x19, r1, r2)
+#define s390_crb(c, r1, r2, m3, b, d)	S390_RRS(c, 0xecf6, r1, r2, m3, b, d)
+#define s390_crj(c, r1, r2, m3, v)	S390_RIE_2(c, 0xec76, r1, r2, m3, v)
+#define s390_crl(c, r, v)		S390_RIL_1(c, 0xc6d, r, v)
+#define s390_crt(c, r1, r2, m3)		S390_RRF_2(c, 0xb972, r1, m3, r2)
+#define s390_cgrt(c, r1, r2, m3)	S390_RRF_2(c, 0xb960, r1, m3, r2)
+#define s390_cs(c, r1, r2, b, d)	S390_RX(c, 0xba, r1, r2, b, d)
+#define s390_csg(c, r1, r2, b, d)	S390_RSY_1(c, 0xeb30, r1, r2, b, d)
+#define s390_csst(c, d1, b1, d2, b2, r)	S390_SSF(c, 0xc82, r, b1, d1, b2, d2)
+#define s390_csy(c, r1, r2, b, d)	S390_RSY_1(c, 0xeb14, r1, r2, b, d)
+/* Divide, extract-access and insert instruction macros.  The iihf/iilf
+ * forms take a 32-bit immediate (S390_RIL_1); the iihh/iihl/iilh/iill
+ * forms take a 16-bit immediate (S390_RI). */
+#define s390_ddbr(c, r1, r2)		S390_RRE(c, 0xb31d, r1, r2)
+#define s390_debr(c, r1, r2)		S390_RRE(c, 0xb30d, r1, r2)
+#define s390_didbr(c, r1, r2, m, r3)    S390_RRF_3(c, 0xb35b, r1, r2, m, r3)
+#define s390_dlgr(c, r1, r2)		S390_RRE(c, 0xb987, r1, r2)
+#define s390_dlr(c, r1, r2)		S390_RRE(c, 0xb997, r1, r2)
+#define s390_dr(c, r1, r2)		S390_RR(c, 0x1d, r1, r2)
+#define s390_dsgfr(c, r1, r2)		S390_RRE(c, 0xb91d, r1, r2)
+#define s390_dsgr(c, r1, r2)		S390_RRE(c, 0xb90d, r1, r2)
+#define s390_ear(c, r1, r2)		S390_RRE(c, 0xb24f, r1, r2)
+#define s390_ic(c, r, x, b, d)		S390_RX(c, 0x43, r, x, b, d)
+#define s390_icm(c, r, m, b, d)		S390_RX(c, 0xbf, r, m, b, d)
+#define s390_icmy(c, r, x, b, d)	S390_RXY(c, 0xeb81, r, x, b, d)
+#define s390_icy(c, r, x, b, d)		S390_RXY(c, 0xe373, r, x, b, d)
+#define s390_iihf(c, r, v)		S390_RIL_1(c, 0xc08, r, v)
+#define s390_iihh(c, r, v)		S390_RI(c, 0xa50, r, v)
+#define s390_iihl(c, r, v)		S390_RI(c, 0xa51, r, v)
+#define s390_iilf(c, r, v)		S390_RIL_1(c, 0xc09, r, v)
+#define s390_iilh(c, r, v)		S390_RI(c, 0xa52, r, v)
+#define s390_iill(c, r, v)		S390_RI(c, 0xa53, r, v)
+/* Conditional-jump aliases.  The s390_j* forms expand to s390_brc and the
+ * s390_jg* forms to s390_brcl, each with a fixed S390_CC_* mask (or an OR
+ * of two masks) and the caller's displacement d.
+ *
+ * s390_je and s390_jge now use S390_CC_ZR.  They previously named
+ * S390_CC_EQ, which is not among the S390_CC_* masks this header defines,
+ * so they only compiled if another header happened to supply it.  "Equal"
+ * and "zero" are the same condition-code mask (8). */
+#define s390_j(c,d)			s390_brc(c, S390_CC_UN, d)
+#define s390_jc(c, m, d)		s390_brc(c, m, d)
+#define s390_jcl(c, m, d)		s390_brcl(c, m, d)
+#define s390_jcy(c, d)			s390_brc(c, S390_CC_CY, d)
+#define s390_je(c, d)			s390_brc(c, S390_CC_ZR, d)
+#define s390_jeo(c, d)			s390_brc(c, S390_CC_ZR|S390_CC_OV, d)
+#define s390_jh(c, d)			s390_brc(c, S390_CC_GT, d)
+#define s390_jho(c, d)			s390_brc(c, S390_CC_GT|S390_CC_OV, d)
+#define s390_jl(c, d)			s390_brc(c, S390_CC_LT, d)
+#define s390_jlo(c, d)			s390_brc(c, S390_CC_LT|S390_CC_OV, d)
+#define s390_jm(c, d)			s390_brc(c, S390_CC_LT, d)
+#define s390_jnc(c, d)			s390_brc(c, S390_CC_NC, d)
+#define s390_jne(c, d)			s390_brc(c, S390_CC_NZ, d)
+#define s390_jnh(c, d)			s390_brc(c, S390_CC_LE, d)
+#define s390_jnl(c, d)			s390_brc(c, S390_CC_GE, d)
+#define s390_jnz(c, d)			s390_brc(c, S390_CC_NZ, d)
+#define s390_jo(c, d)			s390_brc(c, S390_CC_OV, d)
+#define s390_jno(c, d)			s390_brc(c, S390_CC_NO, d)
+#define s390_jp(c, d)			s390_brc(c, S390_CC_GT, d)
+#define s390_jz(c, d)			s390_brc(c, S390_CC_ZR, d)
+#define s390_jg(c,d)			s390_brcl(c, S390_CC_UN, d)
+#define s390_jgcy(c, d)			s390_brcl(c, S390_CC_CY, d)
+#define s390_jge(c, d)			s390_brcl(c, S390_CC_ZR, d)
+#define s390_jgeo(c, d)			s390_brcl(c, S390_CC_ZR|S390_CC_OV, d)
+#define s390_jgh(c, d)			s390_brcl(c, S390_CC_GT, d)
+#define s390_jgho(c, d)			s390_brcl(c, S390_CC_GT|S390_CC_OV, d)
+#define s390_jgl(c, d)			s390_brcl(c, S390_CC_LT, d)
+#define s390_jglo(c, d)			s390_brcl(c, S390_CC_LT|S390_CC_OV, d)
+#define s390_jgm(c, d)			s390_brcl(c, S390_CC_LT, d)
+#define s390_jgnc(c, d)			s390_brcl(c, S390_CC_NC, d)
+#define s390_jgne(c, d)			s390_brcl(c, S390_CC_NZ, d)
+#define s390_jgnh(c, d)			s390_brcl(c, S390_CC_LE, d)
+#define s390_jgnl(c, d)			s390_brcl(c, S390_CC_GE, d)
+#define s390_jgnz(c, d)			s390_brcl(c, S390_CC_NZ, d)
+#define s390_jgo(c, d)			s390_brcl(c, S390_CC_OV, d)
+#define s390_jgno(c, d)			s390_brcl(c, S390_CC_NO, d)
+#define s390_jgp(c, d)			s390_brcl(c, S390_CC_GT, d)
+#define s390_jgz(c, d)			s390_brcl(c, S390_CC_ZR, d)
+/* Load-family instruction macros.  Each forwards its operands unchanged,
+ * in order, to the format encoder named on the right.  s390_lgh and
+ * s390_lhg expand to the same opcode (0xe315).  s390_lzdr and s390_lzer
+ * take a single register and pass 0 as the second operand. */
+#define s390_l(c, r, x, b, d)		S390_RX(c, 0x58, r, x, b, d)
+#define s390_ly(c, r, x, b, d)		S390_RXY(c, 0xe358, r, x, b, d)
+#define s390_la(c, r, x, b, d)		S390_RX(c, 0x41, r, x, b, d)
+#define s390_lay(c, r, x, b, d)		S390_RXY(c, 0xe371, r, x, b, d)
+#define s390_lam(c, r1, r2, b, d)	S390_RS_1(c, 0x9a, r1, r2, b, d)
+#define s390_larl(c, r, o)		S390_RIL_1(c, 0xc00, r, o)
+#define s390_lb(c, r, x, b, d)		S390_RXY(c, 0xe376, r, x, b, d)
+#define s390_lbr(c, r1, r2)		S390_RRE(c, 0xb926, r1, r2)
+#define s390_lcdbr(c, r1, r2)		S390_RRE(c, 0xb313, r1, r2)
+#define s390_lcgr(c, r1, r2)		S390_RRE(c, 0xb903, r1, r2)
+#define s390_lcr(c, r1, r2)		S390_RR(c, 0x13, r1, r2)
+#define s390_ld(c, f, x, b, d)		S390_RX(c, 0x68, f, x, b, d)
+#define s390_ldy(c, r, x, b, d)		S390_RXY(c, 0xed65, r, x, b, d)
+#define s390_ldeb(c, r, x, b, d)	S390_RXE(c, 0xed04, r, x, b, d)
+#define s390_ldebr(c, r1, r2)		S390_RRE(c, 0xb304, r1, r2)
+#define s390_ldgr(c, r1, r2)		S390_RRE(c, 0xb3c1, r1, r2)
+#define s390_ldr(c, r1, r2)		S390_RR(c, 0x28, r1, r2)
+#define s390_le(c, f, x, b, d)		S390_RX(c, 0x78, f, x, b, d)
+#define s390_ledbr(c, r1, r2)		S390_RRE(c, 0xb344, r1, r2)
+#define s390_ler(c, r1, r2)		S390_RR(c, 0x38, r1, r2)
+#define s390_ley(c, r, x, b, d)		S390_RXY(c, 0xed64, r, x, b, d)
+#define s390_lg(c, r, x, b, d)		S390_RXY(c, 0xe304, r, x, b, d)
+#define s390_lgb(c, r, x, b, d)		S390_RXY(c, 0xe377, r, x, b, d)
+#define s390_lgbr(c, r1, r2)		S390_RRE(c, 0xb906, r1, r2)
+#define s390_lgdr(c, r1, r2)		S390_RRE(c, 0xb3cd, r1, r2)
+#define s390_lgf(c, r, x, b, d)		S390_RXY(c, 0xe314, r, x, b, d)
+#define s390_lgfi(c, r, v)		S390_RIL_1(c, 0xc01, r, v)
+#define s390_lgfrl(c, r1, d)		S390_RIL_1(c, 0xc4c, r1, d)
+#define s390_lgfr(c, r1, r2)		S390_RRE(c, 0xb914, r1, r2)
+#define s390_lgh(c, r, x, b, d)		S390_RXY(c, 0xe315, r, x, b, d)
+#define s390_lghi(c, r, v)		S390_RI(c, 0xa79, r, v)
+#define s390_lghr(c, r1, r2)		S390_RRE(c, 0xb907, r1, r2)
+#define s390_lgr(c, r1, r2)		S390_RRE(c, 0xb904, r1, r2)
+#define s390_lgrl(c, r1, d)		S390_RIL_1(c, 0xc48, r1, d)
+#define s390_lh(c, r, x, b, d)		S390_RX(c, 0x48, r, x, b, d)
+#define s390_lhr(c, r1, r2)		S390_RRE(c, 0xb927, r1, r2)
+#define s390_lhg(c, r, x, b, d)		S390_RXY(c, 0xe315, r, x, b, d)
+#define s390_lhi(c, r, v)		S390_RI(c, 0xa78, r, v)
+#define s390_lhy(c, r, x, b, d)		S390_RXY(c, 0xe378, r, x, b, d)
+#define s390_llcr(c, r1, r2)		S390_RRE(c, 0xb994, r1, r2)
+#define s390_llgc(c, r, x, b, d)	S390_RXY(c, 0xe390, r, x, b, d)
+#define s390_llgcr(c, r1, r2)		S390_RRE(c, 0xb984, r1, r2)
+#define s390_llgf(c, r, x, b, d)	S390_RXY(c, 0xe316, r, x, b, d)
+#define s390_llgfr(c, r1, r2)		S390_RRE(c, 0xb916, r1, r2)
+#define s390_llgh(c, r, x, b, d)	S390_RXY(c, 0xe391, r, x, b, d)
+#define s390_llghr(c, r1, r2)		S390_RRE(c, 0xb985, r1, r2)
+#define s390_llhr(c, r1, r2)		S390_RRE(c, 0xb995, r1, r2)
+#define s390_llihf(c, r, v)		S390_RIL_1(c, 0xc0e, r, v)
+#define s390_llihh(c, r, v)		S390_RI(c, 0xa5c, r, v)
+#define s390_llihl(c, r, v)		S390_RI(c, 0xa5d, r, v)
+#define s390_llilf(c, r, v)		S390_RIL_1(c, 0xc0f, r, v)
+#define s390_llilh(c, r, v)		S390_RI(c, 0xa5e, r, v)
+#define s390_llill(c, r, v)		S390_RI(c, 0xa5f, r, v)
+#define s390_lm(c, r1, r2, b, d)	S390_RS_1(c, 0x98, r1, r2, b, d)
+#define s390_lmg(c, r1, r2, b, d)	S390_RSY_1(c, 0xeb04, r1, r2, b, d)
+#define s390_lndbr(c, r1, r2)		S390_RRE(c, 0xb311, r1, r2)
+#define s390_lngr(c, r1, r2)		S390_RRE(c, 0xb901, r1, r2)
+#define s390_lnr(c, r1, r2)		S390_RR(c, 0x11, r1, r2)
+#define s390_lpdbr(c, r1, r2)		S390_RRE(c, 0xb310, r1, r2)
+#define s390_lpgr(c, r1, r2)		S390_RRE(c, 0xb900, r1, r2)
+#define s390_lpr(c, r1, r2)		S390_RR(c, 0x10, r1, r2)
+#define s390_lr(c, r1, r2)		S390_RR(c, 0x18, r1, r2)
+#define s390_lrl(c, r1, d)		S390_RIL_1(c, 0xc4d, r1, d)
+#define s390_ltgfr(c, r1, r2)		S390_RRE(c, 0xb912, r1, r2)
+#define s390_ltgr(c, r1, r2)		S390_RRE(c, 0xb902, r1, r2)
+#define s390_ltr(c, r1, r2)		S390_RR(c, 0x12, r1, r2)
+#define s390_lzdr(c, r)    		S390_RRE(c, 0xb375, r, 0)
+#define s390_lzer(c, r)    		S390_RRE(c, 0xb374, r, 0)
+/* Multiply and move instruction macros.
+ *
+ * NOTE(review): s390_mvcle names its last two parameters (d2, b2) but
+ * forwards them positionally into S390_RS_1's (s2, p2) slots, i.e. the
+ * fourth argument is encoded as the base and the fifth as the
+ * displacement.  Every other RS_1 user here (s390_lm, s390_lam) names
+ * them (b, d).  Confirm which order callers actually pass before relying
+ * on the parameter names. */
+#define s390_m(c, r, x, b, d)		S390_RX(c, 0x5c, r, x, b, d)
+#define s390_mdbr(c, r1, r2)		S390_RRE(c, 0xb31c, r1, r2)
+#define s390_meebr(c, r1, r2)		S390_RRE(c, 0xb317, r1, r2)
+#define s390_mfy(c, r, x, b, d)		S390_RXY(c, 0xe35c, r, x, b, d)
+#define s390_mlgr(c, r1, r2)		S390_RRE(c, 0xb986, r1, r2)
+#define s390_mlr(c, r1, r2)		S390_RRE(c, 0xb996, r1, r2)
+#define s390_mr(c, r1, r2)		S390_RR(c, 0x1c, r1, r2)
+#define s390_ms(c, r, x, b, d)		S390_RX(c, 0x71, r, x, b, d)
+#define s390_msi(c, r, v)		S390_RIL_1(c, 0xc21, r, v)
+#define s390_msgfr(c, r1, r2)		S390_RRE(c, 0xb91c, r1, r2)
+#define s390_msgi(c, r, v)		S390_RIL_1(c, 0xc20, r, v)
+#define s390_msgr(c, r1, r2)		S390_RRE(c, 0xb90c, r1, r2)
+#define s390_msr(c, r1, r2)		S390_RRE(c, 0xb252, r1, r2)
+#define s390_mvc(c, l, b1, d1, b2, d2)	S390_SS_1(c, 0xd2, l, b1, d1, b2, d2)
+#define s390_mvcl(c, r1, r2)		S390_RR(c, 0x0e, r1, r2)
+#define s390_mvcle(c, r1, r3, d2, b2)	S390_RS_1(c, 0xa8, r1, r3, d2, b2)
+/* s390_n .. s390_nr: opcode plus operands forwarded to the named S390_* emitter. */
+#define s390_n(c, r, x, b, d)	S390_RX(c, 0x54, r, x, b, d)
+#define s390_nc(c, l, b1, d1, b2, d2)	S390_SS_1(c, 0xd4, l, b1, d1, b2, d2)
+#define s390_ng(c, r, x, b, d)	S390_RXY(c, 0xe380, r, x, b, d)
+#define s390_ngr(c, r1, r2)	S390_RRE(c, 0xb980, r1, r2)
+#define s390_ngrk(c, r1, r2, r3)	S390_RRF_1(c, 0xb9e4, r1, r2, r3)
+#define s390_ni(c, b, d, v)	S390_SI(c, 0x94, b, d, v)
+#define s390_nihf(c, r, v)	S390_RIL_1(c, 0xc0a, r, v)
+#define s390_nihh(c, r, v)	S390_RI(c, 0xa54, r, v)
+#define s390_nihl(c, r, v)	S390_RI(c, 0xa55, r, v)
+#define s390_nilf(c, r, v)	S390_RIL_1(c, 0xc0b, r, v)
+#define s390_nilh(c, r, v)	S390_RI(c, 0xa56, r, v)
+#define s390_nill(c, r, v)	S390_RI(c, 0xa57, r, v)
+#define s390_niy(c, b, d, v)	S390_SIY(c, 0xeb54, b, d, v)
+/* s390_nop takes no operands: both register slots are hard-wired to 0 */
+#define s390_nop(c)	S390_RR(c, 0x07, 0x0, 0)
+#define s390_nr(c, r1, r2)	S390_RR(c, 0x14, r1, r2)
+/* Three-register form, like s390_ngrk: all three registers go to S390_RRF_1. */
+#define s390_nrk(c, r1, r2, r3)		S390_RRF_1(c, 0xb9f4, r1, r2, r3)
+/* Forwards its own (r, x, b, d) operands through S390_RXY, like s390_ng and s390_xy. */
+#define s390_ny(c, r, x, b, d)		S390_RXY(c, 0xe354, r, x, b, d)
+/* s390_o .. s390_oilh: opcode plus operands forwarded to the named S390_* emitter. */
+#define s390_o(c, r, x, b, d)	S390_RX(c, 0x56, r, x, b, d)
+#define s390_oihf(c, r, v)	S390_RIL_1(c, 0xc0c, r, v)
+#define s390_oihh(c, r, v)	S390_RI(c, 0xa58, r, v)
+#define s390_oihl(c, r, v)	S390_RI(c, 0xa59, r, v)
+#define s390_oilf(c, r, v)	S390_RIL_1(c, 0xc0d, r, v)
+#define s390_oilh(c, r, v)	S390_RI(c, 0xa5a, r, v)
+#define s390_oill(c, r, v)		S390_RI(c, 0xa5b, r, v)
+#define s390_oiy(c, b, d, v)		S390_SIY(c, 0xeb56, b, d, v)
+/* s390_og / s390_ogr / s390_or: opcode plus operands forwarded to the named emitter. */
+#define s390_og(c, r, x, b, d)	S390_RXY(c, 0xe381, r, x, b, d)
+#define s390_ogr(c, r1, r2)	S390_RRE(c, 0xb981, r1, r2)
+#define s390_or(c, r1, r2)	S390_RR(c, 0x16, r1, r2)
+/* s390_s .. s390_srlg: opcode plus operands forwarded to the named S390_* emitter. */
+#define s390_s(c, r, x, b, d)	S390_RX(c, 0x5b, r, x, b, d)
+#define s390_sdb(c, r, x, b, d)	S390_RXE(c, 0xed1b, r, x, b, d)
+#define s390_sdbr(c, r1, r2)	S390_RRE(c, 0xb31b, r1, r2)
+#define s390_sebr(c, r1, r2)	S390_RRE(c, 0xb30b, r1, r2)
+#define s390_sg(c, r, x, b, d)	S390_RXY(c, 0xe309, r, x, b, d)
+#define s390_sgf(c, r, x, b, d)	S390_RXY(c, 0xe319, r, x, b, d)
+#define s390_sgr(c, r1, r2)	S390_RRE(c, 0xb909, r1, r2)
+#define s390_sl(c, r, x, b, d)	S390_RX(c, 0x5f, r, x, b, d)
+#define s390_sla(c, r, b, d)	S390_RS_3(c, 0x8b, r, b, d)
+#define s390_slag(c, r1, r2, b, d)	S390_RSY_1(c, 0xeb0b, r1, r2, b, d)
+#define s390_slbg(c, r, x, b, d)	S390_RXY(c, 0xe389, r, x, b, d)
+#define s390_slbgr(c, r1, r2)	S390_RRE(c, 0xb989, r1, r2)
+#define s390_slbr(c, r1, r2)	S390_RRE(c, 0xb999, r1, r2)
+#define s390_slda(c, r, b, d)	S390_RS_3(c, 0x8f, r, b, d)
+#define s390_sldl(c, r, b, d)	S390_RS_3(c, 0x8d, r, b, d)
+#define s390_slfi(c, r, v)	S390_RIL_1(c, 0xc25, r, v)
+#define s390_slg(c, r, x, b, d)	S390_RXY(c, 0xe30b, r, x, b, d)
+#define s390_slgf(c, r, x, b, d)	S390_RXY(c, 0xe31b, r, x, b, d)
+#define s390_slgfr(c, r1, r2)	S390_RRE(c, 0xb91b, r1, r2)
+#define s390_slgfi(c, r, v)	S390_RIL_1(c, 0xc24, r, v)
+#define s390_slgr(c, r1, r2)	S390_RRE(c, 0xb90b, r1, r2)
+#define s390_sll(c, r, b, d)	S390_RS_3(c, 0x89, r, b, d)
+#define s390_sllg(c, r1, r2, b, d)	S390_RSY_1(c, 0xeb0d, r1, r2, b, d)
+#define s390_slr(c, r1, r2)	S390_RR(c, 0x1f, r1, r2)
+#define s390_sqdbr(c, r1, r2)	S390_RRE(c, 0xb315, r1, r2)
+#define s390_sqebr(c, r1, r2)	S390_RRE(c, 0xb314, r1, r2)
+#define s390_sra(c, r, b, d)	S390_RS_3(c, 0x8a, r, b, d)
+#define s390_srag(c, r1, r2, b, d)	S390_RSY_1(c, 0xeb0a, r1, r2, b, d)
+#define s390_sr(c, r1, r2)	S390_RR(c, 0x1b, r1, r2)
+#define s390_srda(c, r, b, d)	S390_RS_3(c, 0x8e, r, b, d)
+#define s390_srdl(c, r, b, d)	S390_RS_3(c, 0x8c, r, b, d)
+#define s390_srl(c, r, b, d)	S390_RS_3(c, 0x88, r, b, d)
+#define s390_srlg(c, r1, r2, b, d)	S390_RSY_1(c, 0xeb0c, r1, r2, b, d)
+/* s390_st .. s390_sty: opcode plus operands forwarded to the named S390_* emitter. */
+#define s390_st(c, r, x, b, d)	S390_RX(c, 0x50, r, x, b, d)
+#define s390_stam(c, r1, r2, b, d)	S390_RS_1(c, 0x9b, r1, r2, b, d)
+#define s390_stc(c, r, x, b, d)	S390_RX(c, 0x42, r, x, b, d)
+/* s390_stcm puts its `m` operand in the slot the other S390_RX users give to `x` */
+#define s390_stcm(c, r, m, b, d)	S390_RX(c, 0xbe, r, m, b, d)
+#define s390_stcmy(c, r, x, b, d)	S390_RXY(c, 0xeb2d, r, x, b, d)
+#define s390_stcy(c, r, x, b, d)	S390_RXY(c, 0xe372, r, x, b, d)
+#define s390_std(c, f, x, b, d)	S390_RX(c, 0x60, f, x, b, d)
+#define s390_stdy(c, r, x, b, d)	S390_RXY(c, 0xed67, r, x, b, d)
+#define s390_ste(c, f, x, b, d)	S390_RX(c, 0x70, f, x, b, d)
+#define s390_stey(c, r, x, b, d)	S390_RXY(c, 0xed66, r, x, b, d)
+#define s390_stfpc(c, b, d)	S390_S(c, 0xb29c, b, d)
+#define s390_stg(c, r, x, b, d)	S390_RXY(c, 0xe324, r, x, b, d)
+#define s390_sth(c, r, x, b, d)	S390_RX(c, 0x40, r, x, b, d)
+#define s390_sthy(c, r, x, b, d)	S390_RXY(c, 0xe370, r, x, b, d)
+#define s390_stm(c, r1, r2, b, d)	S390_RS_1(c, 0x90, r1, r2, b, d)
+#define s390_stmg(c, r1, r2, b, d)	S390_RSY_1(c, 0xeb24, r1, r2, b, d)
+#define s390_sty(c, r, x, b, d)	S390_RXY(c, 0xe350, r, x, b, d)
+/* s390_tcdb .. s390_xy: opcode plus operands forwarded to the named S390_* emitter. */
+#define s390_tcdb(c, r, x, b, d)	S390_RXE(c, 0xed11, r, x, b, d)
+#define s390_tceb(c, r, x, b, d)	S390_RXE(c, 0xed10, r, x, b, d)
+#define s390_x(c, r, x, b, d)	S390_RX(c, 0x57, r, x, b, d)
+#define s390_xihf(c, r, v)	S390_RIL_1(c, 0xc06, r, v)
+#define s390_xilf(c, r, v)	S390_RIL_1(c, 0xc07, r, v)
+#define s390_xg(c, r, x, b, d)	S390_RXY(c, 0xe382, r, x, b, d)
+#define s390_xgr(c, r1, r2)	S390_RRE(c, 0xb982, r1, r2)
+#define s390_xr(c, r1, r2)	S390_RR(c, 0x17, r1, r2)
+#define s390_xy(c, r, x, b, d)	S390_RXY(c, 0xe357, r, x, b, d)
+#endif
--- a/lib/ffts/src/arch/s390x/tramp.c
+++ b/lib/ffts/src/arch/s390x/tramp.c
--- a/lib/ffts/src/arch/sparc/.gitignore
+++ b/lib/ffts/src/arch/sparc/.gitignore
@ -0,0 +1,3 @@
+/Makefile
+/Makefile.in
+/.deps
--- a/lib/ffts/src/arch/sparc/Makefile.am
+++ b/lib/ffts/src/arch/sparc/Makefile.am
@ -0,0 +1,7 @@
+
+# Preprocessor flags: whatever GLIB_CFLAGS expands to, plus the top source dir.
+AM_CPPFLAGS = $(GLIB_CFLAGS) -I$(top_srcdir)
+
+# noinst_: the libtool library is built but not installed.
+noinst_LTLIBRARIES = libmonoarch-sparc.la
+
+# Sources of the library above (automake maps '-' in the name to '_').
+libmonoarch_sparc_la_SOURCES = tramp.c sparc-codegen.h
+
--- a/lib/ffts/src/arch/sparc/sparc-codegen.h
+++ b/lib/ffts/src/arch/sparc/sparc-codegen.h
@ -0,0 +1,955 @@
+#ifndef __SPARC_CODEGEN_H__
+#define __SPARC_CODEGEN_H__
+
+/* SPARCV9 is defined only when SIZEOF_VOID_P is 8; otherwise it stays undefined. */
+#if (SIZEOF_VOID_P == 8)
+#  define SPARCV9 1
+#endif
+
+/*
+ * Register numbers.  Integer registers (r0-r31), their global/out/local/in
+ * aliases and the floating point registers (f0-f31) share this one enum, so
+ * several names map to the same value.  All values except sparc_nreg fit the
+ * 5-bit rd/rs1/rs2 fields of the instruction format structs.
+ */
+typedef enum {
+	/* integer registers by number */
+	sparc_r0  = 0,  sparc_r1  = 1,  sparc_r2  = 2,  sparc_r3  = 3,
+	sparc_r4  = 4,  sparc_r5  = 5,  sparc_r6  = 6,  sparc_r7  = 7,
+	sparc_r8  = 8,  sparc_r9  = 9,  sparc_r10 = 10, sparc_r11 = 11,
+	sparc_r12 = 12, sparc_r13 = 13, sparc_r14 = 14, sparc_r15 = 15,
+	sparc_r16 = 16, sparc_r17 = 17, sparc_r18 = 18, sparc_r19 = 19,
+	sparc_r20 = 20, sparc_r21 = 21, sparc_r22 = 22, sparc_r23 = 23,
+	sparc_r24 = 24, sparc_r25 = 25, sparc_r26 = 26, sparc_r27 = 27,
+	sparc_r28 = 28, sparc_r29 = 29, sparc_r30 = 30, sparc_r31 = 31,
+	/* aliases */
+	/* global registers */
+	sparc_g0 = 0, sparc_zero = 0,
+	sparc_g1 = 1, sparc_g2 = 2, sparc_g3 = 3, sparc_g4 = 4,
+	sparc_g5 = 5, sparc_g6 = 6, sparc_g7 = 7,
+	/* out registers */
+	sparc_o0 = 8,  sparc_o1 = 9,  sparc_o2 = 10, sparc_o3 = 11,
+	sparc_o4 = 12, sparc_o5 = 13,
+	sparc_o6 = 14, sparc_sp = 14,
+	sparc_o7 = 15, sparc_callsite = 15,
+	/* local registers */
+	sparc_l0 = 16, sparc_l1 = 17, sparc_l2 = 18, sparc_l3 = 19,
+	sparc_l4 = 20, sparc_l5 = 21, sparc_l6 = 22, sparc_l7 = 23,
+	/* in registers */
+	sparc_i0 = 24, sparc_i1 = 25, sparc_i2 = 26, sparc_i3 = 27,
+	sparc_i4 = 28, sparc_i5 = 29,
+	sparc_i6 = 30, sparc_fp = 30,
+	sparc_i7 = 31,
+	/* one past the highest integer register number */
+	sparc_nreg = 32,
+	/* floating point registers */
+	sparc_f0  = 0,  sparc_f1  = 1,  sparc_f2  = 2,  sparc_f3  = 3,
+	sparc_f4  = 4,  sparc_f5  = 5,  sparc_f6  = 6,  sparc_f7  = 7,
+	sparc_f8  = 8,  sparc_f9  = 9,  sparc_f10 = 10, sparc_f11 = 11,
+	sparc_f12 = 12, sparc_f13 = 13, sparc_f14 = 14, sparc_f15 = 15,
+	sparc_f16 = 16, sparc_f17 = 17, sparc_f18 = 18, sparc_f19 = 19,
+	sparc_f20 = 20, sparc_f21 = 21, sparc_f22 = 22, sparc_f23 = 23,
+	sparc_f24 = 24, sparc_f25 = 25, sparc_f26 = 26, sparc_f27 = 27,
+	sparc_f28 = 28, sparc_f29 = 29, sparc_f30 = 30,
+	/* no comma after the last enumerator: strict C89 rejects one */
+	sparc_f31 = 31
+} SparcRegister;
+
+/* Branch condition codes 0-15; they fit the 4-bit cond field of sparc_format2b/2c. */
+typedef enum {
+	sparc_bn   = 0,
+	sparc_be   = 1,
+	sparc_ble  = 2,
+	sparc_bl   = 3,
+	sparc_bleu = 4,
+	sparc_bcs  = 5,
+	sparc_bneg = 6,
+	sparc_bvs  = 7,
+	sparc_ba   = 8,
+	sparc_bne  = 9,
+	sparc_bg   = 10,
+	sparc_bge  = 11,
+	sparc_bgu  = 12,
+	sparc_bcc  = 13,
+	sparc_bpos = 14,
+	sparc_bvc  = 15,
+	/* alternate spellings of values listed above */
+	sparc_bnever    = 0,
+	sparc_blu       = 5,
+	sparc_boverflow = 7,
+	sparc_balways   = 8,
+	sparc_beu       = 13
+} SparcCond;
+
+/* Floating point condition values: fcmp results first, then the branch ops. */
+typedef enum {
+	/* with fcmp */
+	sparc_feq       = 0,
+	sparc_fl        = 1,
+	sparc_fg        = 2,
+	sparc_unordered = 3,
+	/* branch ops, listed in numeric order */
+	sparc_fbn   = 0,
+	sparc_fbne  = 1,
+	sparc_fblg  = 2,
+	sparc_fbul  = 3,
+	sparc_fbl   = 4,
+	sparc_fbug  = 5,
+	sparc_fbg   = 6,
+	sparc_fbu   = 7,
+	sparc_fba   = 8,
+	sparc_fbe   = 9,
+	sparc_fbue  = 10,
+	sparc_fbge  = 11,
+	sparc_fbuge = 12,
+	sparc_fble  = 13,
+	sparc_fbule = 14,
+	sparc_fbo   = 15
+} SparcFCond;
+
+/*
+ * Condition code selectors.  The format-4c/4d encoders split such a value
+ * into cc2 (bit 2) and cc01 (bits 1-0).
+ */
+typedef enum {
+	sparc_fcc0 = 0,
+	sparc_fcc1 = 1,
+	sparc_fcc2 = 2,
+	sparc_fcc3 = 3,
+	sparc_icc  = 4,
+	sparc_xcc  = 6
+} SparcCC;
+
+/* Two-bit selectors: the low two bits of sparc_icc (4) and sparc_xcc (6). */
+typedef enum {
+	sparc_icc_short = 0,
+	sparc_xcc_short = 2
+} SparcCCShort;
+
+/*
+ * opf values for the floating point wrappers: sparc_fop() takes the fop1
+ * group, sparc_fcmp() the fop2 group.  All fit the 9-bit opf field of
+ * sparc_format3c.
+ */
+typedef enum {
+	/* fop1 format: fXtoY */
+	sparc_fitos_val = 196, sparc_fitod_val = 200, sparc_fitoq_val = 204,
+	sparc_fxtos_val = 132, sparc_fxtod_val = 136, sparc_fxtoq_val = 140,
+	sparc_fstoi_val = 209, sparc_fdtoi_val = 210, sparc_fqtoi_val = 211,
+	sparc_fstod_val = 201, sparc_fstoq_val = 205,
+	sparc_fdtos_val = 198, sparc_fdtoq_val = 206,
+	sparc_fqtos_val = 199, sparc_fqtod_val = 203,
+	/* fop1 format: fmov / fneg / fabs / fsqrt */
+	sparc_fmovs_val  = 1,  sparc_fmovd_val  = 2,
+	sparc_fnegs_val  = 5,  sparc_fnegd_val  = 6,
+	sparc_fabss_val  = 9,  sparc_fabsd_val  = 10,
+	sparc_fsqrts_val = 41, sparc_fsqrtd_val = 42, sparc_fsqrtq_val = 43,
+	/* fop1 format: fadd / fsub / fmul / fdiv */
+	sparc_fadds_val  = 65, sparc_faddd_val  = 66, sparc_faddq_val  = 67,
+	sparc_fsubs_val  = 69, sparc_fsubd_val  = 70, sparc_fsubq_val  = 71,
+	sparc_fmuls_val  = 73, sparc_fmuld_val  = 74, sparc_fmulq_val  = 75,
+	sparc_fsmuld_val = 105, sparc_fdmulq_val = 111,
+	sparc_fdivs_val  = 77, sparc_fdivd_val  = 78, sparc_fdivq_val  = 79,
+	/* fop2 format */
+	sparc_fcmps_val  = 81, sparc_fcmpd_val  = 82, sparc_fcmpq_val  = 83,
+	sparc_fcmpes_val = 85, sparc_fcmped_val = 86, sparc_fcmpeq_val = 87
+} SparcFOp;
+
+/*
+ * Bit flags passed to sparc_membar().  sparc_membar_all (0x4f) is the four
+ * load/store ordering bits plus sparc_membar_sync; it does not include the
+ * lookaside (0x10) or memissue (0x20) bits.
+ */
+typedef enum {
+	sparc_membar_load_load   = 0x01,
+	sparc_membar_store_load  = 0x02,
+	sparc_membar_load_store  = 0x04,
+	sparc_membar_store_store = 0x08,
+
+	sparc_membar_lookaside   = 0x10,
+	sparc_membar_memissue    = 0x20,
+	sparc_membar_sync        = 0x40,
+
+	sparc_membar_all         = 0x4f
+} SparcMembarFlags;
+
+/*
+ * Format 1: 2-bit op and a 30-bit displacement.
+ * NOTE(review): like every sparc_format* struct, the field order only matches
+ * the shifts used by the sparc_inst_* accessors if the compiler allocates
+ * bit-fields from the most significant bit down -- confirm for each target.
+ */
+typedef struct {
+	unsigned op   : 2;	/* always 1 */
+	unsigned disp : 30;
+} sparc_format1;
+
+/* Format 2a: op, destination register, op2 and a 22-bit value. */
+typedef struct {
+	unsigned op   : 2;	/* always 0 */
+	unsigned rd   : 5;
+	unsigned op2  : 3;
+	unsigned disp : 22;
+} sparc_format2a;
+
+/* Format 2b: op, `a` bit, 4-bit condition, op2 and a 22-bit displacement. */
+typedef struct {
+	unsigned op   : 2;	/* always 0 */
+	unsigned a    : 1;
+	unsigned cond : 4;
+	unsigned op2  : 3;
+	unsigned disp : 22;
+} sparc_format2b;
+
+/* Format 2c: as 2b, but with a cc selector, a `p` bit and a 19-bit displacement. */
+typedef struct {
+	unsigned op   : 2;	/* always 0 */
+	unsigned a    : 1;
+	unsigned cond : 4;
+	unsigned op2  : 3;
+	unsigned cc01 : 2;
+	unsigned p    : 1;
+	unsigned d19  : 19;
+} sparc_format2c;
+
+/*
+ * Format 2d: register-condition form.  The 16-bit displacement is split
+ * into d16hi (top 2 bits) and d16lo (low 14 bits).
+ */
+typedef struct {
+	unsigned op    : 2;	/* always 0 */
+	unsigned a     : 1;
+	unsigned res   : 1;
+	unsigned rcond : 3;
+	unsigned op2   : 3;
+	unsigned d16hi : 2;
+	unsigned p     : 1;
+	unsigned rs1   : 5;
+	unsigned d16lo : 14;
+} sparc_format2d;
+
+/* Format 3a: two source registers plus an 8-bit asi field. */
+typedef struct {
+	unsigned op  : 2;	/* 2 or 3 */
+	unsigned rd  : 5;
+	unsigned op3 : 6;
+	unsigned rs1 : 5;
+	unsigned i   : 1;
+	unsigned asi : 8;
+	unsigned rs2 : 5;
+} sparc_format3a;
+
+/* Format 3ax: as 3a, with an `x` bit taken from the top of the asi field. */
+typedef struct {
+	unsigned op  : 2;	/* 2 or 3 */
+	unsigned rd  : 5;
+	unsigned op3 : 6;
+	unsigned rs1 : 5;
+	unsigned i   : 1;
+	unsigned x   : 1;
+	unsigned asi : 7;
+	unsigned rs2 : 5;
+} sparc_format3ax;
+
+/* Format 3b: one source register plus a 13-bit immediate. */
+typedef struct {
+	unsigned op  : 2;	/* 2 or 3 */
+	unsigned rd  : 5;
+	unsigned op3 : 6;
+	unsigned rs1 : 5;
+	unsigned i   : 1;
+	unsigned imm : 13;
+} sparc_format3b;
+
+/* Format 3bx: as 3b, with an `x` bit taken from the top of the immediate. */
+typedef struct {
+	unsigned op  : 2;	/* 2 or 3 */
+	unsigned rd  : 5;
+	unsigned op3 : 6;
+	unsigned rs1 : 5;
+	unsigned i   : 1;
+	unsigned x   : 1;
+	unsigned imm : 12;
+} sparc_format3bx;
+
+/* Format 3c: two source registers plus a 9-bit opf field. */
+typedef struct {
+	unsigned op  : 2;	/* 2 or 3 */
+	unsigned rd  : 5;
+	unsigned op3 : 6;
+	unsigned rs1 : 5;
+	unsigned opf : 9;
+	unsigned rs2 : 5;
+} sparc_format3c;
+
+/* Format 4a: two source registers and a 2-bit cc selector. */
+typedef struct {
+	unsigned op   : 2;
+	unsigned rd   : 5;
+	unsigned op3  : 6;
+	unsigned rs1  : 5;
+	unsigned i    : 1;
+	unsigned cc01 : 2;
+	unsigned res  : 6;
+	unsigned rs2  : 5;
+} sparc_format4a;
+
+/* Format 4b: one source register, a 2-bit cc selector and an 11-bit immediate. */
+typedef struct {
+	unsigned op   : 2;
+	unsigned rd   : 5;
+	unsigned op3  : 6;
+	unsigned rs1  : 5;
+	unsigned i    : 1;
+	unsigned cc01 : 2;
+	unsigned simm : 11;
+} sparc_format4b;
+
+/* Format 4c: condition plus a 3-bit cc selector (cc2, cc01) and a source register. */
+typedef struct {
+	unsigned op   : 2;
+	unsigned rd   : 5;
+	unsigned op3  : 6;
+	unsigned cc2  : 1;
+	unsigned cond : 4;
+	unsigned i    : 1;
+	unsigned cc01 : 2;
+	unsigned res  : 6;
+	unsigned rs2  : 5;
+} sparc_format4c;
+
+/* Format 4d: as 4c, with an 11-bit immediate in place of res/rs2. */
+typedef struct {
+	unsigned op   : 2;
+	unsigned rd   : 5;
+	unsigned op3  : 6;
+	unsigned cc2  : 1;
+	unsigned cond : 4;
+	unsigned i    : 1;
+	unsigned cc01 : 2;
+	unsigned simm : 11;
+} sparc_format4d;
+
+/* setcc argument for the logical ops: pass sparc_cc to set flags, 0 to leave them. */
+#define sparc_cc 0x10
+
+/*
+ * Range checks: true when `val` fits a signed immediate of the given width.
+ * `val` is parenthesized before the cast so that an expression argument such
+ * as `a & b` is converted as a whole.  Note that `val` is evaluated twice.
+ */
+#define sparc_is_imm13(val) ((glong)(val) >= (glong)-(1<<12) && (glong)(val) <= (glong)((1<<12)-1))
+#define sparc_is_imm22(val) ((glong)(val) >= (glong)-(1<<21) && (glong)(val) <= (glong)((1<<21)-1))
+#define sparc_is_imm16(val) ((glong)(val) >= (glong)-(1<<15) && (glong)(val) <= (glong)((1<<15)-1))
+#define sparc_is_imm19(val) ((glong)(val) >= (glong)-(1<<18) && (glong)(val) <= (glong)((1<<18)-1))
+#define sparc_is_imm30(val) ((glong)(val) >= (glong)-(1<<29) && (glong)(val) <= (glong)((1<<29)-1))
+
+/* disassembly: extract individual fields from an instruction word */
+#define sparc_inst_op(inst)    ((inst) >> 30)
+#define sparc_inst_op2(inst)   (((inst) >> 22) & 0x7)
+#define sparc_inst_rd(inst)    (((inst) >> 25) & 0x1f)
+#define sparc_inst_op3(inst)   (((inst) >> 19) & 0x3f)
+#define sparc_inst_i(inst)     (((inst) >> 13) & 0x1)
+#define sparc_inst_rs1(inst)   (((inst) >> 14) & 0x1f)
+#define sparc_inst_rs2(inst)   ((inst) & 0x1f)
+/* sparc_inst_imm reads the same bit (13) as sparc_inst_i */
+#define sparc_inst_imm(inst)   (((inst) >> 13) & 0x1)
+#define sparc_inst_imm13(inst) ((inst) & 0x1fff)
+
+/*
+ * Store a format-1 word (op = 1, disp = addr >> 2) at `ins`, then advance
+ * `ins` by one unsigned int.  `addr` is cast to unsigned int before the shift.
+ */
+#define sparc_encode_call(ins,addr) \
+	do { \
+		sparc_format1 *__enc = (sparc_format1*)(ins); \
+		__enc->op = 1; \
+		__enc->disp = ((unsigned int)(addr) >> 2); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-2a word at `ins` (op = 0, rd = dest, op2 = oper, disp = low
+ * 22 bits of val), then advance `ins` by one unsigned int.
+ */
+#define sparc_encode_format2a(ins,val,oper,dest) \
+	do { \
+		sparc_format2a *__enc = (sparc_format2a*)(ins); \
+		__enc->op = 0; \
+		__enc->rd = (dest); \
+		__enc->op2 = (oper); \
+		__enc->disp = (val) & 0x3fffff; \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-2b word at `ins` (op = 0; a, cond, op2 and disp from the
+ * arguments), then advance `ins` by one unsigned int.
+ */
+#define sparc_encode_format2b(ins,aval,bcond,oper,disp22) \
+	do { \
+		sparc_format2b *__enc = (sparc_format2b*)(ins); \
+		__enc->op = 0; \
+		__enc->a = (aval); \
+		__enc->cond = (bcond); \
+		__enc->op2 = (oper); \
+		__enc->disp = (disp22); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-2c word at `ins` (op = 0; a, cond, op2, cc01, p and d19
+ * from the arguments), then advance `ins` by one unsigned int.
+ */
+#define sparc_encode_format2c(ins,aval,bcond,oper,xcc,predict,disp19) \
+	do { \
+		sparc_format2c *__enc = (sparc_format2c*)(ins); \
+		__enc->op = 0; \
+		__enc->a = (aval); \
+		__enc->cond = (bcond); \
+		__enc->op2 = (oper); \
+		__enc->cc01 = (xcc); \
+		__enc->p = (predict); \
+		__enc->d19 = (disp19); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-2d word at `ins`, then advance `ins` by one unsigned int.
+ * disp16 is split: bits above 13 go to d16hi, the low 14 bits to d16lo.
+ */
+#define sparc_encode_format2d(ins,aval,bcond,oper,predict,r1,disp16) \
+	do { \
+		sparc_format2d *__enc = (sparc_format2d*)(ins); \
+		__enc->op = 0; \
+		__enc->a = (aval); \
+		__enc->res = 0; \
+		__enc->rcond = (bcond); \
+		__enc->op2 = (oper); \
+		__enc->d16hi = ((disp16) >> 14); \
+		__enc->p = (predict); \
+		__enc->rs1 = (r1); \
+		__enc->d16lo = ((disp16) & 0x3fff); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-3a (register/register) word at `ins` with i = 0, then
+ * advance `ins` by one unsigned int.
+ */
+#define sparc_encode_format3a(ins,opval,asival,r1,r2,oper,dest) \
+	do { \
+		sparc_format3a *__enc = (sparc_format3a*)(ins); \
+		__enc->op = (opval); \
+		__enc->asi = (asival); \
+		__enc->i = 0; \
+		__enc->rd = (dest); \
+		__enc->rs1 = (r1); \
+		__enc->rs2 = (r2); \
+		__enc->op3 = (oper); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-3ax word at `ins` with i = 0 and x = 1, then advance `ins`
+ * by one unsigned int.
+ */
+#define sparc_encode_format3ax(ins,opval,asival,r1,r2,oper,dest) \
+	do { \
+		sparc_format3ax *__enc = (sparc_format3ax*)(ins); \
+		__enc->op = (opval); \
+		__enc->asi = (asival); \
+		__enc->i = 0; \
+		__enc->x = 1; \
+		__enc->rd = (dest); \
+		__enc->rs1 = (r1); \
+		__enc->rs2 = (r2); \
+		__enc->op3 = (oper); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-3b (register/immediate) word at `ins` with i = 1, then
+ * advance `ins` by one unsigned int.  `val` is stored in the 13-bit imm field.
+ */
+#define sparc_encode_format3b(ins,opval,r1,val,oper,dest) \
+	do { \
+		sparc_format3b *__enc = (sparc_format3b*)(ins); \
+		__enc->op = (opval); \
+		__enc->imm = (val); \
+		__enc->i = 1; \
+		__enc->rd = (dest); \
+		__enc->rs1 = (r1); \
+		__enc->op3 = (oper); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-3bx word at `ins` with i = 1 and x = 1, then advance `ins`
+ * by one unsigned int.  `val` is stored in the 12-bit imm field.
+ */
+#define sparc_encode_format3bx(ins,opval,r1,val,oper,dest) \
+	do { \
+		sparc_format3bx *__enc = (sparc_format3bx*)(ins); \
+		__enc->op = (opval); \
+		__enc->imm = (val); \
+		__enc->i = 1; \
+		__enc->x = 1; \
+		__enc->rd = (dest); \
+		__enc->rs1 = (r1); \
+		__enc->op3 = (oper); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-3c word at `ins` (opf selects the operation), then advance
+ * `ins` by one unsigned int.
+ */
+#define sparc_encode_format3c(ins,opval,opfval,r1,oper,r2,dest) \
+	do { \
+		sparc_format3c *__enc = (sparc_format3c*)(ins); \
+		__enc->op = (opval); \
+		__enc->opf = (opfval); \
+		__enc->rd = (dest); \
+		__enc->rs1 = (r1); \
+		__enc->rs2 = (r2); \
+		__enc->op3 = (oper); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-4a word at `ins` with i = 0 and res = 0, then advance `ins`
+ * by one unsigned int.  Only the low two bits of `cc` are kept.
+ */
+#define sparc_encode_format4a(ins,opval,oper,cc,r1,r2,dest) \
+	do { \
+		sparc_format4a *__enc = (sparc_format4a*)(ins); \
+		__enc->op = (opval); \
+		__enc->rd = (dest); \
+		__enc->op3 = (oper); \
+		__enc->rs1 = (r1); \
+		__enc->i = 0; \
+		__enc->cc01 = (cc) & 0x3; \
+		__enc->res = 0; \
+		__enc->rs2 = (r2); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-4b word at `ins` with i = 1, then advance `ins` by one
+ * unsigned int.  Only the low two bits of `cc` are kept; `imm` goes into the
+ * 11-bit simm field.
+ */
+#define sparc_encode_format4b(ins,opval,oper,cc,r1,imm,dest) \
+	do { \
+		sparc_format4b *__enc = (sparc_format4b*)(ins); \
+		__enc->op = (opval); \
+		__enc->rd = (dest); \
+		__enc->op3 = (oper); \
+		__enc->rs1 = (r1); \
+		__enc->i = 1; \
+		__enc->cc01 = (cc) & 0x3; \
+		__enc->simm = (imm); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/*
+ * Store a format-4c word at `ins` with i = 0 and res = 0, then advance `ins`
+ * by one unsigned int.  `xcc` is a 3-bit cc selector: bit 2 goes to cc2 and
+ * bits 1-0 to cc01, as in sparc_encode_format4d.
+ */
+#define sparc_encode_format4c(ins,opval,oper,xcc,bcond,r2,dest) \
+	do {	\
+		sparc_format4c *__f = (sparc_format4c*)(ins);	\
+		__f->op = (opval);	\
+		__f->rd = (dest);	\
+		__f->op3 = (oper);	\
+		__f->cc2 = ((xcc) >> 2) & 0x1;	\
+		__f->cond = (bcond);	\
+		__f->i   = 0;	\
+		__f->cc01= (xcc) & 0x3;	\
+		__f->res = 0;	\
+		__f->rs2 = (r2);	\
+		(ins) = (unsigned int*)__f + 1;	\
+	} while (0)
+
+/*
+ * Store a format-4d word at `ins` with i = 1, then advance `ins` by one
+ * unsigned int.  Bit 2 of `xcc` goes to cc2 and bits 1-0 to cc01; `imm` goes
+ * into the 11-bit simm field.
+ */
+#define sparc_encode_format4d(ins,opval,oper,xcc,bcond,imm,dest) \
+	do { \
+		sparc_format4d *__enc = (sparc_format4d*)(ins); \
+		__enc->op = (opval); \
+		__enc->rd = (dest); \
+		__enc->op3 = (oper); \
+		__enc->cc2 = ((xcc) >> 2) & 0x1; \
+		__enc->cond = (bcond); \
+		__enc->i = 1; \
+		__enc->cc01 = (xcc) & 0x3; \
+		__enc->simm = (imm); \
+		(ins) = (unsigned int*)__enc + 1; \
+	} while (0)
+
+/* asi value used by the load/store wrappers; is a non-default value ever useful? */
+#define sparc_asi 0
+
+/*
+ * load
+ * Plain names take a register as `disp` (format 3a); the _imm variants take
+ * an immediate (format 3b).  All use op = 3.
+ */
+#define sparc_ldsb(ins,base,disp,dest)     sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 9, (dest))
+#define sparc_ldsb_imm(ins,base,disp,dest) sparc_encode_format3b((ins), 3, (base), (disp), 9, (dest))
+
+#define sparc_ldsh(ins,base,disp,dest)     sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 10, (dest))
+#define sparc_ldsh_imm(ins,base,disp,dest) sparc_encode_format3b((ins), 3, (base), (disp), 10, (dest))
+
+#define sparc_ldub(ins,base,disp,dest)     sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 1, (dest))
+#define sparc_ldub_imm(ins,base,disp,dest) sparc_encode_format3b((ins), 3, (base), (disp), 1, (dest))
+
+#define sparc_lduh(ins,base,disp,dest)     sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 2, (dest))
+#define sparc_lduh_imm(ins,base,disp,dest) sparc_encode_format3b((ins), 3, (base), (disp), 2, (dest))
+
+#define sparc_ld(ins,base,disp,dest)       sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 0, (dest))
+#define sparc_ld_imm(ins,base,disp,dest)   sparc_encode_format3b((ins), 3, (base), (disp), 0, (dest))
+
+/* Sparc V9 */
+#define sparc_ldx(ins,base,disp,dest)      sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 11, (dest))
+#define sparc_ldx_imm(ins,base,disp,dest)  sparc_encode_format3b((ins), 3, (base), (disp), 11, (dest))
+
+#define sparc_ldsw(ins,base,disp,dest)     sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 8, (dest))
+#define sparc_ldsw_imm(ins,base,disp,dest) sparc_encode_format3b((ins), 3, (base), (disp), 8, (dest))
+
+#define sparc_ldd(ins,base,disp,dest)      sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 3, (dest))
+#define sparc_ldd_imm(ins,base,disp,dest)  sparc_encode_format3b((ins), 3, (base), (disp), 3, (dest))
+
+/* the two floating point loads pass a literal 0 asi rather than sparc_asi */
+#define sparc_ldf(ins,base,disp,dest)      sparc_encode_format3a((ins), 3, 0, (base), (disp), 32, (dest))
+#define sparc_ldf_imm(ins,base,disp,dest)  sparc_encode_format3b((ins), 3, (base), (disp), 32, (dest))
+
+#define sparc_lddf(ins,base,disp,dest)     sparc_encode_format3a((ins), 3, 0, (base), (disp), 35, (dest))
+#define sparc_lddf_imm(ins,base,disp,dest) sparc_encode_format3b((ins), 3, (base), (disp), 35, (dest))
+
+/*
+ * store
+ * Same layout as the loads; `src` is placed in the rd field.
+ */
+#define sparc_stb(ins,src,base,disp)      sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 5, (src))
+#define sparc_stb_imm(ins,src,base,disp)  sparc_encode_format3b((ins), 3, (base), (disp), 5, (src))
+
+#define sparc_sth(ins,src,base,disp)      sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 6, (src))
+#define sparc_sth_imm(ins,src,base,disp)  sparc_encode_format3b((ins), 3, (base), (disp), 6, (src))
+
+#define sparc_st(ins,src,base,disp)       sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 4, (src))
+#define sparc_st_imm(ins,src,base,disp)   sparc_encode_format3b((ins), 3, (base), (disp), 4, (src))
+
+/* Sparc V9 */
+#define sparc_stx(ins,src,base,disp)      sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 14, (src))
+#define sparc_stx_imm(ins,src,base,disp)  sparc_encode_format3b((ins), 3, (base), (disp), 14, (src))
+
+#define sparc_std(ins,src,base,disp)      sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 7, (src))
+#define sparc_std_imm(ins,src,base,disp)  sparc_encode_format3b((ins), 3, (base), (disp), 7, (src))
+
+#define sparc_stf(ins,src,base,disp)      sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 36, (src))
+#define sparc_stf_imm(ins,src,base,disp)  sparc_encode_format3b((ins), 3, (base), (disp), 36, (src))
+
+#define sparc_stdf(ins,src,base,disp)     sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 39, (src))
+#define sparc_stdf_imm(ins,src,base,disp) sparc_encode_format3b((ins), 3, (base), (disp), 39, (src))
+
+/* swap: same argument layout as the loads */
+#define sparc_ldstub(ins,base,disp,dest)     sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 13, (dest))
+#define sparc_ldstub_imm(ins,base,disp,dest) sparc_encode_format3b((ins), 3, (base), (disp), 13, (dest))
+
+#define sparc_swap(ins,base,disp,dest)       sparc_encode_format3a((ins), 3, sparc_asi, (base), (disp), 15, (dest))
+#define sparc_swap_imm(ins,base,disp,dest)   sparc_encode_format3b((ins), 3, (base), (disp), 15, (dest))
+
+/* misc */
+/* note: sethi takes the full 32 bit value and shifts it right by 10 itself (think of it as %hi(val)) */
+#define sparc_sethi(ins,val,dest) sparc_encode_format2a((ins), ((val)>>10), 4, (dest))
+
+/* nop is encoded as sethi 0 into sparc_zero */
+#define sparc_nop(ins) sparc_sethi((ins), 0, sparc_zero)
+
+#define sparc_save(ins,src,disp,dest)        sparc_encode_format3a((ins), 2, 0, (src), (disp), 60, (dest))
+#define sparc_save_imm(ins,src,disp,dest)    sparc_encode_format3b((ins), 2, (src), (disp), 60, (dest))
+
+#define sparc_restore(ins,src,disp,dest)     sparc_encode_format3a((ins), 2, 0, (src), (disp), 61, (dest))
+#define sparc_restore_imm(ins,src,disp,dest) sparc_encode_format3b((ins), 2, (src), (disp), 61, (dest))
+
+#define sparc_rett(ins,src,disp)             sparc_encode_format3a((ins), 2, 0, (src), (disp), 0x39, 0)
+#define sparc_rett_imm(ins,src,disp)         sparc_encode_format3b((ins), 2, (src), (disp), 0x39, 0)
+
+#define sparc_jmpl(ins,base,disp,dest)       sparc_encode_format3a((ins), 2, 0, (base), (disp), 56, (dest))
+#define sparc_jmpl_imm(ins,base,disp,dest)   sparc_encode_format3b((ins), 2, (base), (disp), 56, (dest))
+
+#define sparc_call_simple(ins,disp) sparc_encode_call((ins), ((unsigned int)(disp)))
+
+#define sparc_rdy(ins,dest) sparc_encode_format3a((ins), 2, 0, 0, 0, 40, (dest))
+
+#define sparc_wry(ins,base,disp)     sparc_encode_format3a((ins), 2, 0, (base), (disp), 48, 0)
+#define sparc_wry_imm(ins,base,disp) sparc_encode_format3b((ins), 2, (base), (disp), 48, 0)
+
+/* stbar, unimp, flush */
+#define sparc_stbar(ins)     sparc_encode_format3a((ins), 2, 0, 15, 0, 40, 0)
+#define sparc_unimp(ins,val) sparc_encode_format2b((ins), 0, 0, 0, (val))
+
+#define sparc_flush(ins,base,disp)     sparc_encode_format3a((ins), 2, 0, (base), (disp), 59, 0)
+#define sparc_flush_imm(ins,base,disp) sparc_encode_format3b((ins), 2, (base), (disp), 59, 0)
+
+#define sparc_flushw(ins) sparc_encode_format3a((ins), 2, 0, 0, 0, 43, 0)
+
+/* `flags` is placed in the immediate field */
+#define sparc_membar(ins,flags) sparc_encode_format3b((ins), 2, 0xf, (flags), 0x28, 0)
+
+/* trap: `tt` goes into the immediate field; rd is fixed at 0x8 */
+
+#define sparc_ta(ins,tt) sparc_encode_format3b((ins), 2, 0, (tt), 58, 0x8)
+
+/* alu fop */
+/* provide wrappers for: fitos, fitod, fstoi, fdtoi, fstod, fdtos, fmov, fneg, fabs */
+
+/* generic emitters: `op` is a SparcFOp value placed in the opf field; fcmp has no destination */
+#define sparc_fop(ins,r1,op,r2,dest) sparc_encode_format3c((ins), 2, (op), (r1), 52, (r2), (dest))
+#define sparc_fcmp(ins,r1,op,r2)     sparc_encode_format3c((ins), 2, (op), (r1), 53, (r2), 0)
+
+/* format 1 fops: thin wrappers that pick the SparcFOp value for sparc_fop() */
+#define sparc_fadds(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fadds_val, (r2), (dest))
+#define sparc_faddd(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_faddd_val, (r2), (dest))
+#define sparc_faddq(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_faddq_val, (r2), (dest))
+
+#define sparc_fsubs(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fsubs_val, (r2), (dest))
+#define sparc_fsubd(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fsubd_val, (r2), (dest))
+#define sparc_fsubq(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fsubq_val, (r2), (dest))
+
+#define sparc_fmuls(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fmuls_val, (r2), (dest))
+#define sparc_fmuld(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fmuld_val, (r2), (dest))
+#define sparc_fmulq(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fmulq_val, (r2), (dest))
+
+#define sparc_fsmuld(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fsmuld_val, (r2), (dest))
+#define sparc_fdmulq(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fdmulq_val, (r2), (dest))
+
+#define sparc_fdivs(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fdivs_val, (r2), (dest))
+#define sparc_fdivd(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fdivd_val, (r2), (dest))
+#define sparc_fdivq(ins,r1,r2,dest) sparc_fop((ins), (r1), sparc_fdivq_val, (r2), (dest))
+
+/* single-source forms: rs1 is fixed at 0 */
+#define sparc_fitos(ins,r2,dest) sparc_fop((ins), 0, sparc_fitos_val, (r2), (dest))
+#define sparc_fitod(ins,r2,dest) sparc_fop((ins), 0, sparc_fitod_val, (r2), (dest))
+#define sparc_fitoq(ins,r2,dest) sparc_fop((ins), 0, sparc_fitoq_val, (r2), (dest))
+
+#define sparc_fxtos(ins,r2,dest) sparc_fop((ins), 0, sparc_fxtos_val, (r2), (dest))
+#define sparc_fxtod(ins,r2,dest) sparc_fop((ins), 0, sparc_fxtod_val, (r2), (dest))
+#define sparc_fxtoq(ins,r2,dest) sparc_fop((ins), 0, sparc_fxtoq_val, (r2), (dest))
+
+#define sparc_fstoi(ins,r2,dest) sparc_fop((ins), 0, sparc_fstoi_val, (r2), (dest))
+#define sparc_fdtoi(ins,r2,dest) sparc_fop((ins), 0, sparc_fdtoi_val, (r2), (dest))
+#define sparc_fqtoi(ins,r2,dest) sparc_fop((ins), 0, sparc_fqtoi_val, (r2), (dest))
+
+#define sparc_fstod(ins,r2,dest) sparc_fop((ins), 0, sparc_fstod_val, (r2), (dest))
+#define sparc_fstoq(ins,r2,dest) sparc_fop((ins), 0, sparc_fstoq_val, (r2), (dest))
+
+#define sparc_fdtos(ins,r2,dest) sparc_fop((ins), 0, sparc_fdtos_val, (r2), (dest))
+#define sparc_fdtoq(ins,r2,dest) sparc_fop((ins), 0, sparc_fdtoq_val, (r2), (dest))
+
+#define sparc_fqtos(ins,r2,dest) sparc_fop((ins), 0, sparc_fqtos_val, (r2), (dest))
+#define sparc_fqtod(ins,r2,dest) sparc_fop((ins), 0, sparc_fqtod_val, (r2), (dest))
+
+#define sparc_fmovs(ins,r2,dest) sparc_fop((ins), 0, sparc_fmovs_val, (r2), (dest))
+#define sparc_fnegs(ins,r2,dest) sparc_fop((ins), 0, sparc_fnegs_val, (r2), (dest))
+#define sparc_fabss(ins,r2,dest) sparc_fop((ins), 0, sparc_fabss_val, (r2), (dest))
+
+/*
+ * Double precision move/negate/abs.  No trailing ';' in the expansion, so
+ * these work as a single statement, e.g. `if (x) sparc_fmovd (...); else ...`.
+ */
+#define sparc_fmovd( ins, r2, dest) sparc_fop (ins, 0, sparc_fmovd_val, r2, dest)
+#define sparc_fnegd( ins, r2, dest) sparc_fop (ins, 0, sparc_fnegd_val, r2, dest)
+#define sparc_fabsd( ins, r2, dest) sparc_fop (ins, 0, sparc_fabsd_val, r2, dest)
+
+/* fsqrt wrappers: single source register, rs1 fixed at 0 */
+#define sparc_fsqrts(ins,r2,dest) sparc_fop((ins), 0, sparc_fsqrts_val, (r2), (dest))
+#define sparc_fsqrtd(ins,r2,dest) sparc_fop((ins), 0, sparc_fsqrtd_val, (r2), (dest))
+#define sparc_fsqrtq(ins,r2,dest) sparc_fop((ins), 0, sparc_fsqrtq_val, (r2), (dest))
+
+/* format 2 fops: comparisons routed through sparc_fcmp() */
+
+#define sparc_fcmps(ins,r1,r2) sparc_fcmp((ins), (r1), sparc_fcmps_val, (r2))
+#define sparc_fcmpd(ins,r1,r2) sparc_fcmp((ins), (r1), sparc_fcmpd_val, (r2))
+#define sparc_fcmpq(ins,r1,r2) sparc_fcmp((ins), (r1), sparc_fcmpq_val, (r2))
+/* fcmpe* variants: routed through sparc_fcmp() with their own SparcFOp value. */
+#define sparc_fcmpes( ins, r1, r2 ) sparc_fcmp( ins, r1, sparc_fcmpes_val, r2 )
+#define sparc_fcmped( ins, r1, r2 ) sparc_fcmp( ins, r1, sparc_fcmped_val, r2 )
+#define sparc_fcmpeq( ins, r1, r2 ) sparc_fcmp( ins, r1, sparc_fcmpeq_val, r2 )
+
+/* logical */
+
+/* FIXME: the setcc stuff is wrong in lots of places */
+
+/*
+ * Generic emitters for the logical ops.  `setcc` is treated as a boolean:
+ * any non-zero value (sparc_cc included) ORs 0x10 into op3, 0 leaves it out.
+ * Every wrapper below goes through these two, so they all agree on that.
+ */
+#define sparc_logic(ins,op,setcc,r1,r2,dest) sparc_encode_format3a((ins),2,0,(r1),(r2),((setcc) ? 0x10 : 0) | (op), (dest))
+#define sparc_logic_imm(ins,op,setcc,r1,imm,dest) sparc_encode_format3b((ins),2,(r1),(imm),((setcc) ? 0x10 : 0) | (op), (dest))
+
+#define sparc_and(ins,setcc,r1,r2,dest) sparc_logic(ins,1,setcc,r1,r2,dest)
+#define sparc_and_imm(ins,setcc,r1,imm,dest) sparc_logic_imm(ins,1,setcc,r1,imm,dest)
+
+#define sparc_andn(ins,setcc,r1,r2,dest) sparc_logic(ins,5,setcc,r1,r2,dest)
+#define sparc_andn_imm(ins,setcc,r1,imm,dest) sparc_logic_imm(ins,5,setcc,r1,imm,dest)
+
+#define sparc_or(ins,setcc,r1,r2,dest) sparc_logic(ins,2,setcc,r1,r2,dest)
+#define sparc_or_imm(ins,setcc,r1,imm,dest) sparc_logic_imm(ins,2,setcc,r1,imm,dest)
+
+#define sparc_orn(ins,setcc,r1,r2,dest) sparc_logic(ins,6,setcc,r1,r2,dest)
+#define sparc_orn_imm(ins,setcc,r1,imm,dest) sparc_logic_imm(ins,6,setcc,r1,imm,dest)
+
+#define sparc_xor(ins,setcc,r1,r2,dest) sparc_logic(ins,3,setcc,r1,r2,dest)
+#define sparc_xor_imm(ins,setcc,r1,imm,dest) sparc_logic_imm(ins,3,setcc,r1,imm,dest)
+
+#define sparc_xnor(ins,setcc,r1,r2,dest) sparc_logic(ins,7,setcc,r1,r2,dest)
+#define sparc_xnor_imm(ins,setcc,r1,imm,dest) sparc_logic_imm(ins,7,setcc,r1,imm,dest)
+
+/*
+ * shift
+ * The x-suffixed variants use the 3ax/3bx encoders, which also set the x bit.
+ */
+#define sparc_sll(ins,src,disp,dest)      sparc_encode_format3a((ins), 2, 0, (src), (disp), 37, (dest))
+#define sparc_sll_imm(ins,src,disp,dest)  sparc_encode_format3b((ins), 2, (src), (disp), 37, (dest))
+
+/* Sparc V9 */
+#define sparc_sllx(ins,src,disp,dest)     sparc_encode_format3ax((ins), 2, 0, (src), (disp), 37, (dest))
+#define sparc_sllx_imm(ins,src,disp,dest) sparc_encode_format3bx((ins), 2, (src), (disp), 37, (dest))
+
+#define sparc_srl(ins,src,disp,dest)      sparc_encode_format3a((ins), 2, 0, (src), (disp), 38, (dest))
+#define sparc_srl_imm(ins,src,disp,dest)  sparc_encode_format3b((ins), 2, (src), (disp), 38, (dest))
+
+/* Sparc V9 */
+#define sparc_srlx(ins,src,disp,dest)     sparc_encode_format3ax((ins), 2, 0, (src), (disp), 38, (dest))
+#define sparc_srlx_imm(ins,src,disp,dest) sparc_encode_format3bx((ins), 2, (src), (disp), 38, (dest))
+
+#define sparc_sra(ins,src,disp,dest)      sparc_encode_format3a((ins), 2, 0, (src), (disp), 39, (dest))
+#define sparc_sra_imm(ins,src,disp,dest)  sparc_encode_format3b((ins), 2, (src), (disp), 39, (dest))
+
+/* Sparc V9 */
+#define sparc_srax(ins,src,disp,dest)     sparc_encode_format3ax((ins), 2, 0, (src), (disp), 39, (dest))
+#define sparc_srax_imm(ins,src,disp,dest) sparc_encode_format3bx((ins), 2, (src), (disp), 39, (dest))
+
+/* alu */
+
+/*
+ * Generic ALU emitters.  op3 is `op` with 0x10 ORed in when `setcc` is
+ * non-zero.  `op` is parenthesized so an expression argument (for example a
+ * conditional) is combined with the setcc bit as a whole.
+ */
+#define sparc_alu_reg(ins,op,setcc,r1,r2,dest) sparc_encode_format3a((ins),2,0,(r1),(r2),(op)|((setcc) ? 0x10 : 0),(dest))
+#define sparc_alu_imm(ins,op,setcc,r1,imm,dest) sparc_encode_format3b((ins),2,(r1),(imm),(op)|((setcc) ? 0x10 : 0),(dest))
+
+#define sparc_add(ins,setcc,r1,r2,dest) sparc_alu_reg((ins),0,(setcc),(r1),(r2),(dest))
+#define sparc_add_imm(ins,setcc,r1,imm,dest) sparc_alu_imm((ins),0,(setcc),(r1),(imm),(dest))
+
+#define sparc_addx(ins,setcc,r1,r2,dest) sparc_alu_reg((ins),0x8,(setcc),(r1),(r2),(dest))
+#define sparc_addx_imm(ins,setcc,r1,imm,dest) sparc_alu_imm((ins),0x8,(setcc),(r1),(imm),(dest))
+
+#define sparc_sub(ins,setcc,r1,r2,dest) sparc_alu_reg((ins),0x4,(setcc),(r1),(r2),(dest))
+#define sparc_sub_imm(ins,setcc,r1,imm,dest) sparc_alu_imm((ins),0x4,(setcc),(r1),(imm),(dest))
+
+#define sparc_subx(ins,setcc,r1,r2,dest) sparc_alu_reg((ins),0xc,(setcc),(r1),(r2),(dest))
+#define sparc_subx_imm(ins,setcc,r1,imm,dest) sparc_alu_imm((ins),0xc,(setcc),(r1),(imm),(dest))
+
+#define sparc_muls(ins,r1,r2,dest) sparc_encode_format3a((ins),2,0,(r1),(r2),36,(dest))
+#define sparc_muls_imm(ins,r1,imm,dest) sparc_encode_format3b((ins),2,(r1),(imm),36,(dest))
+
+#define sparc_umul(ins,setcc,r1,r2,dest) sparc_alu_reg((ins),0xa,(setcc),(r1),(r2),(dest))
+#define sparc_umul_imm(ins,setcc,r1,imm,dest) sparc_alu_imm((ins),0xa,(setcc),(r1),(imm),(dest))
+
+#define sparc_smul(ins,setcc,r1,r2,dest) sparc_alu_reg((ins),0xb,(setcc),(r1),(r2),(dest))
+#define sparc_smul_imm(ins,setcc,r1,imm,dest) sparc_alu_imm((ins),0xb,(setcc),(r1),(imm),(dest))
+
+#define sparc_udiv(ins,setcc,r1,r2,dest) sparc_alu_reg((ins),0xe,(setcc),(r1),(r2),(dest))
+#define sparc_udiv_imm(ins,setcc,r1,imm,dest) sparc_alu_imm((ins),0xe,(setcc),(r1),(imm),(dest))
+
+#define sparc_sdiv(ins,setcc,r1,r2,dest) sparc_alu_reg((ins),0xf,(setcc),(r1),(r2),(dest))
+#define sparc_sdiv_imm(ins,setcc,r1,imm,dest) sparc_alu_imm((ins),0xf,(setcc),(r1),(imm),(dest))
+
+
+/* branch */
+#define sparc_branch(ins,aval,condval,displ) sparc_encode_format2b((ins),(aval),(condval),2,(displ))
+/* FIXME: float condition codes are different: unify. */
+#define sparc_fbranch(ins,aval,condval,displ) sparc_encode_format2b((ins),(aval),(condval),6,(displ))
+#define sparc_branchp(ins,aval,condval,xcc,predict,displ) sparc_encode_format2c((ins),(aval),(condval),0x1,(xcc),(predict),(displ))
+
+#define sparc_brz(ins,aval,predict,rs1,disp) sparc_encode_format2d((ins), (aval),0x1,0x3,(predict),(rs1),(disp))
+#define sparc_brlez(ins,aval,predict,rs1,disp) sparc_encode_format2d((ins), (aval),0x2,0x3,(predict),(rs1),(disp))
+#define sparc_brlz(ins,aval,predict,rs1,disp) sparc_encode_format2d((ins), (aval),0x3,0x3,(predict),(rs1),(disp))
+#define sparc_brnz(ins,aval,predict,rs1,disp) sparc_encode_format2d((ins), (aval),0x5,0x3,(predict),(rs1),(disp))
+#define sparc_brgz(ins,aval,predict,rs1,disp) sparc_encode_format2d((ins), (aval),0x6,0x3,(predict),(rs1),(disp))
+#define sparc_brgez(ins,aval,predict,rs1,disp) sparc_encode_format2d((ins), (aval),0x7,0x3,(predict),(rs1),(disp))
+
+/* conditional moves */
+#define sparc_movcc(ins,cc,condval,r1,dest) sparc_encode_format4c((ins), 0x2, 0x2c, cc, condval, r1, dest)
+
+#define sparc_movcc_imm(ins,cc,condval,imm,dest) sparc_encode_format4d((ins), 0x2, 0x2c, cc, condval, imm, dest)
+
+/* synthetic instructions */
+#define sparc_cmp(ins,r1,r2) sparc_sub((ins),sparc_cc,(r1),(r2),sparc_g0)
+#define sparc_cmp_imm(ins,r1,imm) sparc_sub_imm((ins),sparc_cc,(r1),(imm),sparc_g0)
+#define sparc_jmp(ins,base,disp) sparc_jmpl((ins),(base),(disp),sparc_g0)
+#define sparc_jmp_imm(ins,base,disp) sparc_jmpl_imm((ins),(base),(disp),sparc_g0)
+#define sparc_call(ins,base,disp) sparc_jmpl((ins),(base),(disp),sparc_o7)
+#define sparc_call_imm(ins,base,disp) sparc_jmpl_imm((ins),(base),(disp),sparc_o7)
+
+#define sparc_test(ins,reg) sparc_or ((ins),sparc_cc,sparc_g0,(reg),sparc_g0)
+
+#define sparc_ret(ins) sparc_jmpl_imm((ins),sparc_i7,8,sparc_g0)
+#define sparc_retl(ins) sparc_jmpl_imm((ins),sparc_o7,8,sparc_g0)
+#define sparc_restore_simple(ins) sparc_restore((ins),sparc_g0,sparc_g0,sparc_g0)
+#define sparc_rett_simple(ins) sparc_rett_imm((ins),sparc_i7,8)
+
+/*
+ * sparc_set32(ins, val, reg): emit the shortest sequence that loads the
+ * 32-bit constant `val` into `reg`:
+ *   - zero:                           a single register clear
+ *   - low 10 bits are zero:           a single sethi
+ *   - fits in -4096..4095 (13 bits):  a single or-immediate with %g0
+ *   - anything else:                  sethi followed by an or-immediate of
+ *                                     the low 10 bits
+ * At most two instructions are emitted. Note that `val` is evaluated several
+ * times, so it must not have side effects.
+ */
+#define sparc_set32(ins,val,reg)	\
+	do {	\
+        if ((val) == 0) \
+            sparc_clr_reg((ins),(reg)); \
+               else if (((guint32)(val) & 0x3ff) == 0) \
+			sparc_sethi((ins),(guint32)(val),(reg));	\
+		else if (((gint32)(val) >= -4096) && ((gint32)(val) <= 4095))	\
+			sparc_or_imm((ins),FALSE,sparc_g0,(gint32)(val),(reg));	\
+		else {	\
+			sparc_sethi((ins),(guint32)(val),(reg));	\
+			sparc_or_imm((ins),FALSE,(reg),(guint32)(val)&0x3ff,(reg));	\
+		}	\
+	} while (0)
+
+/*
+ * Upper bound, in bytes, on the code emitted by a single sparc_set(): the
+ * longest sequence is six instructions on SPARC V9 and two otherwise, at
+ * four bytes per instruction.
+ */
+#ifdef SPARCV9
+#define SPARC_SET_MAX_SIZE (6 * 4)
+#else
+#define SPARC_SET_MAX_SIZE (2 * 4)
+#endif
+
+/*
+ * sparc_set(ins, value, reg): emit code that loads a constant into `reg`,
+ * using the shortest sequence that can represent the value.
+ *
+ * SPARC V9 (64-bit value), tried in this order:
+ *   - zero:                            clear the register
+ *   - fits in -4096..4095 (13 bits):   or-immediate with %g0
+ *   - fits in 32 bits, unsigned:       sethi, plus an or-immediate when the
+ *                                      low 10 bits are non-zero
+ *   - fits in 44 bits, unsigned:       sethi/or of (value >> 12), shift left
+ *                                      by 12, then or in the low 12 bits
+ *   - upper 32 bits are all ones:      upper word built in %g1 with xnor
+ *   - anything else:                   both halves built separately (upper
+ *                                      half in %g1) and or-ed together
+ * The last two sequences clobber %g1, hence the assertion that `reg` is not
+ * %g1. The longest sequence is six instructions (see SPARC_SET_MAX_SIZE).
+ *
+ * The value argument is evaluated exactly once, and the temporaries carry a
+ * sparc_set_ prefix so that they cannot capture an identifier used in the
+ * caller's argument (the previous local named `val` turned
+ * sparc_set (p, val, reg) into a self-initialization).
+ *
+ * Without SPARCV9 the constant is 32 bits wide and the work is delegated to
+ * sparc_set32().
+ */
+#ifdef SPARCV9
+#define sparc_set(ins,ptr,reg) \
+	do {	\
+		const gint64 sparc_set_val_ = (gint64)(ptr); \
+		const guint32 sparc_set_top_ = (guint32)(sparc_set_val_ >> 32); \
+		const guint32 sparc_set_bottom_ = (guint32)(sparc_set_val_ & 0xffffffff); \
+		g_assert ((reg) != sparc_g1); \
+		if (sparc_set_val_ == 0) \
+			sparc_clr_reg ((ins), (reg)); \
+		else if ((sparc_set_val_ >= -4096) && (sparc_set_val_ <= 4095))	\
+			sparc_or_imm((ins),FALSE,sparc_g0,sparc_set_bottom_,(reg));	\
+		else if ((sparc_set_val_ >= 0) && (sparc_set_val_ <= G_GINT64_CONSTANT (0xffffffff))) {	\
+			sparc_sethi((ins),sparc_set_bottom_,(reg));	\
+			if (sparc_set_bottom_ & 0x3ff) \
+				sparc_or_imm((ins),FALSE,(reg),sparc_set_bottom_&0x3ff,(reg));	\
+		} \
+		else if ((sparc_set_val_ >= 0) && (sparc_set_val_ <= (G_GINT64_CONSTANT (1) << 44) - 1)) {	\
+			sparc_sethi ((ins), (sparc_set_val_ >> 12), (reg)); \
+			sparc_or_imm ((ins), FALSE, (reg), (sparc_set_val_ >> 12) & 0x3ff, (reg)); \
+			sparc_sllx_imm ((ins),(reg), 12, (reg)); \
+			sparc_or_imm ((ins), FALSE, (reg), sparc_set_val_ & 0xfff, (reg)); \
+		} \
+		else if (sparc_set_top_ == 0xffffffff) { \
+			sparc_xnor ((ins), FALSE, sparc_g0, sparc_g0, sparc_g1);	\
+			sparc_sethi((ins),sparc_set_bottom_,(reg));	\
+			sparc_sllx_imm((ins),sparc_g1,32,sparc_g1);	\
+			sparc_or_imm((ins),FALSE,(reg),sparc_set_bottom_&0x3ff,(reg));	\
+			sparc_or((ins),FALSE,(reg),sparc_g1,(reg));	\
+		} \
+		else { \
+			sparc_sethi((ins),sparc_set_top_,sparc_g1);	\
+			sparc_sethi((ins),sparc_set_bottom_,(reg));	\
+			sparc_or_imm((ins),FALSE,sparc_g1,sparc_set_top_&0x3ff,sparc_g1);	\
+			sparc_or_imm((ins),FALSE,(reg),sparc_set_bottom_&0x3ff,(reg));	\
+			sparc_sllx_imm((ins),sparc_g1,32,sparc_g1);	\
+			sparc_or((ins),FALSE,(reg),sparc_g1,(reg));	\
+		} \
+	} while (0)
+#else
+#define sparc_set(ins,val,reg) sparc_set32((ins),(val),(reg))
+#endif
+
+#define sparc_set_ptr(ins,val,reg) sparc_set(ins,val,reg)
+
+#ifdef SPARCV9
+#define sparc_set_template(ins,reg) sparc_set (ins,0x7fffffff7fffffff, reg)
+#else
+#define sparc_set_template(ins,reg) sparc_set (ins,0x7fffffff, reg)
+#endif
+
+#define sparc_not(ins,reg) sparc_xnor((ins),FALSE,(reg),sparc_g0,(reg))
+#define sparc_neg(ins,reg) sparc_sub((ins),FALSE,sparc_g0,(reg),(reg))
+#define sparc_clr_reg(ins,reg) sparc_or((ins),FALSE,sparc_g0,sparc_g0,(reg))
+
+#define sparc_mov_reg_reg(ins,src,dest) sparc_or((ins),FALSE,sparc_g0,(src),(dest))
+
+#ifdef SPARCV9
+#define sparc_sti_imm sparc_stx_imm
+#define sparc_ldi_imm sparc_ldx_imm
+#define sparc_sti sparc_stx
+#define sparc_ldi sparc_ldx
+#else
+#define sparc_sti_imm sparc_st_imm
+#define sparc_ldi_imm sparc_ld_imm
+#define sparc_sti sparc_st
+#define sparc_ldi sparc_ld
+#endif
+
+#endif /* __SPARC_CODEGEN_H__ */
+
--- a/lib/ffts/src/arch/sparc/test.c
+++ b/lib/ffts/src/arch/sparc/test.c
@ -0,0 +1,123 @@
+#include <stdio.h>
+
+#include <glib.h>
+#include "sparc-codegen.h"
+
+/* Don't run the resulting program, it will destroy your computer;
+ * just run objdump -d on it to check that we generated the correct assembly.
+ */
+
+/*
+ * Emits one instance of most of the sparc-codegen.h instruction macros into
+ * a local buffer and prints the encoded bytes to stdout as an assembler
+ * source file (a `main` symbol made of `.byte` directives), so that the
+ * encodings can be inspected with a disassembler.
+ */
+int
+main (void)
+{
+	guint32 *p;
+	guint32 code_buffer [500];
+	guint32 stack_size = 0;
+	guint32 arg_pos;
+	const unsigned char *ins;
+	int i, cur_out_reg, size;
+
+	p = code_buffer;
+
+	printf (".text\n.align 4\n.globl main\n.type main,@function\nmain:\n");
+
+	/*
+	 * Standard function prolog.
+	 */
+	sparc_save_imm (p, sparc_sp, -112-stack_size, sparc_sp);
+	cur_out_reg = sparc_o0;
+	arg_pos = 0;
+
+	if (1) {
+		sparc_mov_reg_reg (p, sparc_i2, cur_out_reg);
+		++cur_out_reg;
+	}
+
+	sparc_ld_imm (p, sparc_i3, arg_pos, cur_out_reg);
+	++cur_out_reg;
+	sparc_ld_imm (p, sparc_i3, arg_pos+4, cur_out_reg);
+	++cur_out_reg;
+	/*
+	 * Insert call to function
+	 */
+	sparc_jmpl (p, sparc_i0, 0, sparc_callsite);
+	sparc_nop (p);
+
+	sparc_jmpl_imm (p, sparc_i7, 8, sparc_zero);
+	sparc_restore (p, sparc_zero, sparc_zero, sparc_zero);
+
+	/* loads and stores */
+	sparc_ldsb (p, sparc_i3, sparc_l0, sparc_o5);
+	sparc_ldsb_imm (p, sparc_i3, 2, sparc_o5);
+
+	sparc_ldsh (p, sparc_i3, sparc_l0, sparc_o5);
+	sparc_ldsh_imm (p, sparc_i3, 2, sparc_o5);
+
+	sparc_ldub (p, sparc_i3, sparc_l0, sparc_o5);
+	sparc_ldub_imm (p, sparc_i3, 2, sparc_o5);
+
+	sparc_lduh (p, sparc_i3, sparc_l0, sparc_o5);
+	sparc_lduh_imm (p, sparc_i3, 2, sparc_o5);
+
+	sparc_ldf (p, sparc_i3, sparc_l0, sparc_o5);
+	sparc_ldf_imm (p, sparc_i3, 2, sparc_o5);
+
+	sparc_stb (p, sparc_i3, sparc_l0, sparc_l2);
+	sparc_stb_imm (p, sparc_i3, sparc_o5, 2);
+
+	/* miscellaneous */
+	sparc_sethi (p, 0xff000000, sparc_o2);
+	sparc_rdy (p, sparc_l0);
+	sparc_wry (p, sparc_l0, sparc_l1);
+	sparc_wry_imm (p, sparc_l0, 16);
+	sparc_stbar (p);
+	sparc_unimp (p, 24);
+	sparc_flush (p, sparc_l4, 0);
+
+	/* logical */
+	sparc_and (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_and_imm (p, FALSE, sparc_l0, 0xff, sparc_o1);
+	sparc_andn (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_or (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_orn (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_xor (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_xnor (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+
+	/* shifts */
+	sparc_sll (p, sparc_l0, sparc_l1, sparc_o1);
+	sparc_sll_imm (p, sparc_l0, 2, sparc_o1);
+	sparc_srl (p, sparc_l0, sparc_l1, sparc_o1);
+	sparc_srl_imm (p, sparc_l0, 2, sparc_o1);
+	sparc_sra (p, sparc_l0, sparc_l1, sparc_o1);
+	sparc_sra_imm (p, sparc_l0, 2, sparc_o1);
+
+	/* arithmetic */
+	sparc_add (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_add_imm (p, FALSE, sparc_l0, 0xff, sparc_o1);
+	sparc_addx (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_sub (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_subx (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+
+	sparc_muls (p, sparc_l0, sparc_l1, sparc_o1);
+	sparc_umul (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_smul (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_udiv (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+	sparc_sdiv (p, sparc_cc, sparc_l0, sparc_l1, sparc_o1);
+
+	/* branches and synthetic instructions */
+	sparc_branch (p, FALSE, sparc_bne, -12);
+	sparc_ret (p);
+	sparc_retl (p);
+	sparc_test (p, sparc_l4);
+	sparc_cmp (p, sparc_l4, sparc_l6);
+	sparc_cmp_imm (p, sparc_l4, 4);
+	sparc_restore_simple (p);
+
+	sparc_set (p, 0xff000000, sparc_l7);
+	sparc_set (p, 1, sparc_l7);
+	sparc_set (p, 0xff0000ff, sparc_l7);
+
+	sparc_not (p, sparc_g2);
+	sparc_neg (p, sparc_g3);
+	sparc_clr_reg (p, sparc_g4);
+
+
+	/* dump every emitted byte; each buffer element is four bytes wide */
+	size = (p-code_buffer)*4;
+	ins = (const unsigned char*)code_buffer;
+	for (i = 0; i < size; ++i)
+		printf (".byte %u\n", (unsigned int) ins [i]);
+	return 0;
+}
+
--- a/lib/ffts/src/arch/sparc/tramp.c
+++ b/lib/ffts/src/arch/sparc/tramp.c
--- a/lib/ffts/src/arch/x64/.gitignore
+++ b/lib/ffts/src/arch/x64/.gitignore
@ -0,0 +1,4 @@
+/Makefile.in
+/Makefile
+/.deps
+/.libs
--- a/lib/ffts/src/arch/x64/Makefile.am
+++ b/lib/ffts/src/arch/x64/Makefile.am
@ -0,0 +1,2 @@
+EXTRA_DIST =  x64-codegen.h
+
--- a/lib/ffts/src/arch/x64/x64-codegen.h
+++ b/lib/ffts/src/arch/x64/x64-codegen.h
--- a/lib/ffts/src/arch/x86/.gitignore
+++ b/lib/ffts/src/arch/x86/.gitignore
@ -0,0 +1,6 @@
+/Makefile
+/Makefile.in
+/.libs
+/.deps
+/*.la
+/*.lo
--- a/lib/ffts/src/arch/x86/Makefile.am
+++ b/lib/ffts/src/arch/x86/Makefile.am
@ -0,0 +1 @@
+EXTRA_DIST = x86-codegen.h
--- a/lib/ffts/src/arch/x86/x86-codegen.h
+++ b/lib/ffts/src/arch/x86/x86-codegen.h
--- a/lib/ffts/src/codegen.c
+++ b/lib/ffts/src/codegen.c
--- a/lib/ffts/src/codegen.h
+++ b/lib/ffts/src/codegen.h
@ -1,10 +1,10 @@
 /*
- 
+
 This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
+
 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
+ Copyright (c) 2012, The University of Waikato
+
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
@ -31,19 +31,16 @@

 */

-#ifndef __CODEGEN_H__
-#define __CODEGEN_H__
+#ifndef FFTS_CODEGEN_H
+#define FFTS_CODEGEN_H

-#include <stddef.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <errno.h>
-#include <sys/mman.h>
-#include <string.h>
-#include <limits.h>	   /* for PAGESIZE */
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif

 #include "ffts.h"
+#include "ffts_internal.h"

-void ffts_generate_func_code(ffts_plan_t *, size_t N, size_t leafN, int sign); 
+transform_func_t ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leaf_N, int sign);

-#endif
+#endif /* FFTS_CODEGEN_H */
--- a/lib/ffts/src/codegen_arm.h
+++ b/lib/ffts/src/codegen_arm.h
@ -31,10 +31,14 @@

 */

-#ifndef __CODEGEN_ARM_H__
-#define __CODEGEN_ARM_H__
+#ifndef FFTS_CODEGEN_ARM_H
+#define FFTS_CODEGEN_ARM_H

+#include "neon.h"

+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif

 uint32_t BL(void *pos, void *target) {
 	return 0xeb000000 | (((target - pos) / 4) & 0xffffff);
@ -95,7 +99,130 @@ void MOVI(uint32_t **p, uint8_t dst, uint32_t imm) {
 uint32_t PUSH_LR() { return 0xe92d4ff0; } //0xe92d4000; }
 uint32_t POP_LR() { return 0xe8bd8ff0; } //0xe8bd8000; }

+/*
+ * Copy the size-4 base-case routine to *fp and adapt it to the transform
+ * direction.
+ *
+ * The routine is the code between neon_x4 and neon_x8 (NEON builds) or
+ * between vfp_x4 and vfp_x8 (otherwise). Depending on `sign`, one bit is
+ * toggled in four of the copied instruction words. *fp is advanced past the
+ * copy and the address of the copy is returned.
+ */
+static FFTS_INLINE insns_t* generate_size4_base_case(insns_t **fp, int sign)
+{
+	insns_t *const code = *fp;
+	size_t nbytes, k;
 
+#ifdef HAVE_NEON
+	/* instruction words patched for a negative sign */
+	static const unsigned char flip[] = { 26, 28, 31, 32 };
 
+	nbytes = (char*) neon_x8 - (char*) neon_x4;
+	memcpy(code, neon_x4, nbytes);
 
+	if (sign < 0) {
+		for (k = 0; k < sizeof(flip) / sizeof(flip[0]); k++) {
+			code[flip[k]] ^= 0x00200000;
+		}
+	}
+#else
+	/* instruction words patched for a positive sign */
+	static const unsigned char flip[] = { 36, 38, 43, 44 };
+
+	nbytes = (char*) vfp_x8 - (char*) vfp_x4;
+	memcpy(code, vfp_x4, nbytes);
+
+	if (sign > 0) {
+		for (k = 0; k < sizeof(flip) / sizeof(flip[0]); k++) {
+			code[flip[k]] ^= 0x00000040;
+		}
+	}
 #endif
+
+	*fp += nbytes / 4;
+	return code;
+}
+
+/*
+ * Copy the size-8 base-case routine to *fp and adapt it to the transform
+ * direction.
+ *
+ * The routine is the code between neon_x8 and neon_x8_t (NEON builds) or
+ * between vfp_x8 and vfp_end (otherwise). *fp is advanced past the copy and
+ * the address of the copy is returned.
+ */
+static FFTS_INLINE insns_t* generate_size8_base_case(insns_t **fp, int sign)
+{
+	insns_t *const code = *fp;
+	ptrdiff_t nbytes;
+	size_t k;
+
+#ifdef HAVE_NEON
+	/* instruction words patched for a negative sign */
+	static const unsigned char flip[] = {
+		31, 32, 33, 34, 65, 66, 70, 74, 97, 98, 102, 104
+	};
+
+	nbytes = (char*) neon_x8_t - (char*) neon_x8;
+	memcpy(code, neon_x8, nbytes);
+
+	/*
+	* Changes adds to subtracts and vice versa to allow the computation
+	* of both the IFFT and FFT
+	*/
+	if (sign < 0) {
+		for (k = 0; k < sizeof(flip) / sizeof(flip[0]); k++) {
+			code[flip[k]] ^= 0x00200000;
+		}
+	}
+#else
+	/* instruction words patched for a positive sign */
+	static const unsigned char flip[] = {
+		65, 66, 68, 70, 103, 104, 105, 108, 113, 114, 117, 118
+	};
+
+	nbytes = (char*) vfp_end - (char*) vfp_x8;
+	memcpy(code, vfp_x8, nbytes);
+
+	if (sign > 0) {
+		for (k = 0; k < sizeof(flip) / sizeof(flip[0]); k++) {
+			code[flip[k]] ^= 0x00000040;
+		}
+	}
+#endif
+
+	*fp += nbytes / 4;
+	return code;
+}
+
+/*
+ * Emit the prologue of a generated transform at *fp.
+ *
+ * Writes the register-saving instructions, a series of ADDI instructions
+ * whose immediates are multiples of p->N, loads of the plan's `offsets` and
+ * `ee_ws` members (addressed by their byte offset within the plan), and a
+ * MOVI of p->i0 into register 11. *fp is advanced past the emitted
+ * instructions.
+ *
+ * Returns the address of the first emitted instruction.
+ */
+static FFTS_INLINE insns_t* generate_prologue(insns_t **fp, ffts_plan_t *p)
+{
+	insns_t	*start = *fp;
+
+	*(*fp)++ = PUSH_LR();
+	*(*fp)++ = 0xed2d8b10;
+
+	ADDI(fp,  3,  1,        0);
+	ADDI(fp,  7,  1,     p->N);
+	ADDI(fp,  5,  1, 2 * p->N);
+	ADDI(fp, 10,  7, 2 * p->N);
+	ADDI(fp,  4,  5, 2 * p->N);
+	ADDI(fp,  8, 10, 2 * p->N);
+	ADDI(fp,  6,  4, 2 * p->N);
+	ADDI(fp,  9,  8, 2 * p->N);
+
+	/* load offsets into r12 */
+	*(*fp)++ = LDRI(12, 0, ((uint32_t) &p->offsets) - ((uint32_t) p));
+	ADDI(fp, 1, 0, 0);
+
+	/* mov out into r0 */
+	ADDI(fp, 0, 2, 0);
+	*(*fp)++ = LDRI(2, 1, ((uint32_t) &p->ee_ws) - ((uint32_t) p));
+
+	/* emitted identically for NEON and non-NEON builds */
+	MOVI(fp, 11, p->i0);
+
+	return start;
+}
+
+#endif /* FFTS_CODEGEN_ARM_H */
--- a/lib/ffts/src/codegen_sse.h
+++ b/lib/ffts/src/codegen_sse.h
--- a/lib/ffts/src/ffts.c
+++ b/lib/ffts/src/ffts.c
@ -1,398 +1,539 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */
+
 #include "ffts.h"
+
+#include "ffts_internal.h"
+#include "ffts_static.h"
+#include "ffts_trig.h"
 #include "macros.h"
-//#include "mini_macros.h"
 #include "patterns.h"
-#include "ffts_small.h"

-#ifdef DYNAMIC_DISABLED
-	#include "ffts_static.h"
+#ifndef DYNAMIC_DISABLED
+#include "codegen.h"
+#endif
+
+#if _WIN32
+#include <windows.h>
+#else
+#if __APPLE__
+#include <libkern/OSCacheControl.h>
+#endif
+
+#if HAVE_SYS_MMAN_H
+#include <sys/mman.h>
+#endif
+#endif
+
+#if defined(HAVE_NEON)
+static const FFTS_ALIGN(64) float w_data[16] = {
+     0.70710678118654757273731092936941f,
+     0.70710678118654746171500846685376f,
+    -0.70710678118654757273731092936941f,
+    -0.70710678118654746171500846685376f,
+     1.0f,
+     0.70710678118654757273731092936941f,
+    -0.0f,
+    -0.70710678118654746171500846685376f,
+     0.70710678118654757273731092936941f,
+     0.70710678118654746171500846685376f,
+     0.70710678118654757273731092936941f,
+     0.70710678118654746171500846685376f,
+     1.0f,
+     0.70710678118654757273731092936941f,
+     0.0f,
+     0.70710678118654746171500846685376f
+};
+#endif
+
+/*
+ * Change the protection of [start, start + len) to read + execute.
+ * Returns 0 on success and non-zero on failure.
+ */
+static FFTS_INLINE int ffts_allow_execute(void *start, size_t len)
+{
+#ifdef _WIN32
+    DWORD previous_protection;
+
+    /* VirtualProtect() is non-zero on success; negate so 0 means success */
+    return !VirtualProtect(start, len, PAGE_EXECUTE_READ, &previous_protection);
 #else
-	#include "codegen.h"
+    return mprotect(start, len, PROT_READ | PROT_EXEC);
 #endif
 
-#include <errno.h>
-  #include <sys/mman.h>
-  #include <string.h>
-  #include <limits.h>	   /* for PAGESIZE */
+}
+
+/*
+ * Change the protection of [start, start + len) back to read + write.
+ * Returns 0 on success and non-zero on failure on every platform, the same
+ * convention as ffts_allow_execute().
+ */
+static FFTS_INLINE int ffts_deny_execute(void *start, size_t len)
+{
+    int result;
 
+#ifdef _WIN32
+    DWORD old_protect;
+
+    /* VirtualProtect() is non-zero on success; negate so 0 means success */
+    result = !VirtualProtect(start, len, PAGE_READWRITE, &old_protect);
+#else
+    result = mprotect(start, len, PROT_READ | PROT_WRITE);
+#endif
+
+    return result;
+}
+
+/*
+ * Flush the instruction cache for [start, start + length) using whichever
+ * platform facility is available.
+ *
+ * On Windows the return value is the negated result of
+ * FlushInstructionCache(). On every other platform the function always
+ * returns 0; targets that match none of the Apple, Android or Linux/GCC
+ * branches perform no flush at all.
+ */
+static FFTS_INLINE int ffts_flush_instruction_cache(void *start, size_t length)
+{
+#ifdef _WIN32
+    return !FlushInstructionCache(GetCurrentProcess(), start, length);
+#else
+#ifdef __APPLE__
+    sys_icache_invalidate(start, length);
+#elif __ANDROID__
+    cacheflush((long) start, (long) start + length, 0);
+#elif __linux__
+#if GCC_VERSION_AT_LEAST(4,3)
+    __builtin___clear_cache(start, (char*) start + length);
+#elif __GNUC__
+    __clear_cache((long) start, (long) start + length);
+#endif
+#endif
+    return 0;
+#endif
+}
+
+/*
+ * Allocate `length` bytes of readable and writable memory straight from the
+ * operating system (mmap() or VirtualAlloc()).
+ *
+ * Returns NULL on failure on every platform: mmap() signals failure with
+ * MAP_FAILED rather than NULL, so that value is translated here. This keeps
+ * the result usable with plain NULL checks such as the one guarding
+ * ffts_vmem_free() in ffts_free_1d().
+ */
+static FFTS_INLINE void *ffts_vmem_alloc(size_t length)
+{
 #if __APPLE__
-  #include <libkern/OSCacheControl.h>
+    void *addr = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANON | MAP_SHARED, -1, 0);
+    return (addr == MAP_FAILED) ? NULL : addr;
+#elif _WIN32
+    return VirtualAlloc(NULL, length, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
 #else
+#ifndef MAP_ANONYMOUS
+#define MAP_ANONYMOUS 0x20
 #endif
 
-void ffts_execute(ffts_plan_t *p, const void *  in, void *  out) {
-	p->transform(p, (const float *)in, (float *)out);
+    void *addr = mmap(NULL, length, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
+    return (addr == MAP_FAILED) ? NULL : addr;
+#endif
 }

-void ffts_free(ffts_plan_t *p) {
-	p->destroy(p);
+/*
+ * Release a memory region with VirtualFree() on Windows or munmap()
+ * elsewhere. `length` is only used by the munmap() branch; the Windows
+ * branch passes a size of 0 together with MEM_RELEASE.
+ */
+static FFTS_INLINE void ffts_vmem_free(void *addr, size_t length)
+{
+#ifdef _WIN32
+    (void) length;
+    VirtualFree(addr, 0, MEM_RELEASE);
+#else
+    munmap(addr, length);
+#endif
 }

-void ffts_free_1d(ffts_plan_t *p) {
-	
-	size_t i;
-
-	if(p->ws) {
-		FFTS_FREE(p->ws);
-	}
-	if(p->is) free(p->is);
-	if(p->ws_is) free(p->ws_is);
-	if(p->offsets)		free(p->offsets);
-	//free(p->transforms);
-	if(p->transforms) free(p->transforms);
-
-	if(p->transform_base) {
-		if (mprotect(p->transform_base, p->transform_size, PROT_READ | PROT_WRITE)) {
-			perror("Couldn't mprotect");
-			exit(errno);
-		}
-		munmap(p->transform_base, p->transform_size);
-		//free(p->transform_base);
-	}
-	free(p);
+/*
+ * Run the transform stored in plan `p` on `in`, writing the result to `out`.
+ *
+ * In SSE and NEON builds a message is logged when either buffer is not
+ * aligned to 16 bytes; the transform is still invoked afterwards. `p` is
+ * dereferenced without a NULL check.
+ */
+FFTS_API void
+ffts_execute(ffts_plan_t *p, const void *in, void *out)
+{
+    /* TODO: Define NEEDS_ALIGNED properly instead */
+#if defined(HAVE_SSE) || defined(HAVE_NEON)
+    if (((uintptr_t) in % 16) != 0) {
+        LOG("ffts_execute: input buffer needs to be aligned to a 128bit boundary\n");
+    }
+
+    if (((uintptr_t) out % 16) != 0) {
+        LOG("ffts_execute: output buffer needs to be aligned to a 128bit boundary\n");
+    }
+#endif
+
+    p->transform(p, (const float*) in, (float*) out);
 }

-ffts_plan_t *ffts_init_1d(size_t N, int sign) {
-	ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
-	size_t leafN = 8;	
-	size_t i;	
-
-#ifdef __arm__
-//#ifdef HAVE_NEON
-	V MULI_SIGN;
-	
-	if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
-	else         MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
-//#endif 
+/*
+ * Destroy a plan by calling its own `destroy` callback.
+ * A NULL plan is ignored.
+ */
+FFTS_API void
+ffts_free(ffts_plan_t *p)
+{
+    if (!p) {
+        return;
+    }
+
+    p->destroy(p);
+}
+
+/*
+ * Release everything owned by a 1-D plan and then the plan itself.
+ *
+ * `p` must not be NULL; it is dereferenced unconditionally. Each member is
+ * released only when it is non-NULL.
+ */
+void ffts_free_1d(ffts_plan_t *p)
+{
+#if !defined(DYNAMIC_DISABLED)
+    /* generated code: make the region writable again, then release it */
+    if (p->transform_base) {
+        ffts_deny_execute(p->transform_base, p->transform_size);
+        ffts_vmem_free(p->transform_base, p->transform_size);
+    }
+#endif
+
+    if (p->ws_is) {
+        free(p->ws_is);
+    }
+
+    /* ws is released with FFTS_FREE rather than free() */
+    if (p->ws) {
+        FFTS_FREE(p->ws);
+    }
+
+    if (p->is) {
+        free(p->is);
+    }
+
+    if (p->offsets) {
+        free(p->offsets);
+    }
+
+    free(p);
+}
+
+static int
+ffts_generate_luts(ffts_plan_t *p, size_t N, size_t leaf_N, int sign)
+{
+    V4SF MULI_SIGN;
+    size_t n_luts;
+    ffts_cpx_32f *w;
+    ffts_cpx_32f *tmp;
+    size_t i, j, m, n;
+    int stride;
+
+    if (sign < 0) {
+        MULI_SIGN = V4SF_LIT4(-0.0f, 0.0f, -0.0f, 0.0f);
+    } else {
+        MULI_SIGN = V4SF_LIT4(0.0f, -0.0f, 0.0f, -0.0f);
+    }
+
+    /* LUTS */
+    n_luts = ffts_ctzl(N / leaf_N);
+    if (n_luts >= 32) {
+        n_luts = 0;
+    }
+
+    if (n_luts) {
+        size_t lut_size;
+
+#if defined(__arm__) && !defined(HAVE_NEON)
+        lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f) / 2;
 #else
-	V MULI_SIGN;
-	
-	if(sign < 0) MULI_SIGN = VLIT4(-0.0f, 0.0f, -0.0f, 0.0f);
-	else         MULI_SIGN = VLIT4(0.0f, -0.0f, 0.0f, -0.0f);
+        lut_size = leaf_N * (((1 << n_luts) - 2) * 3 + 1) * sizeof(ffts_cpx_32f);
 #endif

-	p->transform = NULL;
-	p->transform_base = NULL;
-	p->transforms = NULL;
-	p->is = NULL;
-	p->ws_is = NULL;
-	p->ws = NULL;
-	p->offsets = NULL;
-	p->destroy = ffts_free_1d;
-
-	if(N >= 32) {
-		ffts_init_offsets(p, N, leafN);
-#ifdef __arm__
+        p->ws = FFTS_MALLOC(lut_size, 32);
+        if (!p->ws) {
+            goto cleanup;
+        }
+
+        p->ws_is = (size_t*) malloc(n_luts * sizeof(*p->ws_is));
+        if (!p->ws_is) {
+            goto cleanup;
+        }
+    }
+
+    w = p->ws;
+    n = leaf_N * 2;
+
 #ifdef HAVE_NEON
-		ffts_init_is(p, N, leafN, 1);
+    V4SF neg = (sign < 0) ? V4SF_LIT4(0.0f, 0.0f, 0.0f, 0.0f) : V4SF_LIT4(-0.0f, -0.0f, -0.0f, -0.0f);
+#endif
+
+    /* calculate factors */
+    m = leaf_N << (n_luts - 2);
+    tmp = FFTS_MALLOC(m * sizeof(ffts_cpx_32f), 32);
+
+    ffts_generate_cosine_sine_pow2_32f(tmp, m);
+
+    /* generate lookup tables */
+    stride = 1 << (n_luts - 1);
+    for (i = 0; i < n_luts; i++) {
+        p->ws_is[i] = w - (ffts_cpx_32f*) p->ws;
+
+        if (!i) {
+            ffts_cpx_32f *w0 = FFTS_MALLOC(n/4 * sizeof(ffts_cpx_32f), 32);
+            float *fw0 = (float*) w0;
+            float *fw = (float*) w;
+
+            for (j = 0; j < n/4; j++) {
+                w0[j][0] = tmp[j * stride][0];
+                w0[j][1] = tmp[j * stride][1];
+            }
+
+#if defined(__arm__)
+#ifdef HAVE_NEON
+            for (j = 0; j < n/4; j += 4) {
+                V4SF2 temp0 = V4SF2_LD(fw0 + j*2);
+                temp0.val[1] = V4SF_XOR(temp0.val[1], neg);
+                V4SF2_STORE_SPR(fw + j*2, temp0);
+            }
 #else
-		ffts_init_is(p, N, leafN, 1);
+            for (j = 0; j < n/4; j++) {
+                fw[j*2+0] = fw0[j*2+0];
+                fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
+            }
 #endif
+            w += n/4;
 #else
-		ffts_init_is(p, N, leafN, 1);
+            for (j = 0; j < n/4; j += 2) {
+                V4SF re, im, temp0;
+                temp0 = V4SF_LD(fw0 + j*2);
+                re = V4SF_DUPLICATE_RE(temp0);
+                im = V4SF_DUPLICATE_IM(temp0);
+                im = V4SF_XOR(im, MULI_SIGN);
+                V4SF_ST(fw + j*4 + 0, re);
+                V4SF_ST(fw + j*4 + 4, im);
+            }
+
+            w += n/4 * 2;
 #endif
-		
-		p->i0 = N/leafN/3+1;
-		p->i1 = N/leafN/3;
-		if((N/leafN) % 3 > 1) p->i1++;
-		p->i2 = N/leafN/3;
-		
-		#ifdef __arm__	
-		#ifdef HAVE_NEON
-		p->i0/=2;
-		p->i1/=2;
-		#endif
-		#else
-		p->i0/=2;
-		p->i1/=2;
-		#endif
-
-	}else{
-		p->transforms = malloc(2 * sizeof(transform_index_t));
-		p->transforms[0] = 0;
-		p->transforms[1] = 1;
-		if(N == 2) p->transform = &firstpass_2;
-		else if(N == 4 && sign == -1) p->transform = &firstpass_4_f;
-		else if(N == 4 && sign == 1) p->transform = &firstpass_4_b;
-		else if(N == 8 && sign == -1) p->transform = &firstpass_8_f;
-		else if(N == 8 && sign == 1) p->transform = &firstpass_8_b;
-		else if(N == 16 && sign == -1) p->transform = &firstpass_16_f;
-		else if(N == 16 && sign == 1) p->transform = &firstpass_16_b;
-
-		p->is = NULL;
-		p->offsets = NULL;
-	}
-
-		int hardcoded = 0;
-
-		/*      LUTS           */
-		size_t n_luts = __builtin_ctzl(N/leafN);
-		if(N < 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
-
-		if(n_luts >= 32) n_luts = 0;
-
-//		fprintf(stderr, "n_luts = %zu\n", n_luts);
-		
-		cdata_t *w;
-
-		int n = leafN*2;
-		if(hardcoded) n = 8;
-		
-		size_t lut_size = 0;
-
-		for(i=0;i<n_luts;i++) {
-			if(!i || hardcoded) {
-			#ifdef __arm__ 
-				if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
-				else lut_size += n/4 * sizeof(cdata_t);
-			#else
-				lut_size += n/4 * 2 * sizeof(cdata_t);
-			#endif
-				n *= 2;
-			} else {
-			#ifdef __arm__
-				lut_size += n/8 * 3 * sizeof(cdata_t);
-			#else
-				lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
-			#endif
-			}
-			n *= 2;
-		}
-		
-//		lut_size *= 16;
-		
-	//	fprintf(stderr, "lut size = %zu\n", lut_size);
-		if(n_luts) {
-			p->ws = FFTS_MALLOC(lut_size,32);
-			p->ws_is = malloc(n_luts * sizeof(size_t));
-		}else{
-			p->ws = NULL;
-			p->ws_is = NULL;
-		}
-		w = p->ws;
-
-		n = leafN*2;
-		if(hardcoded) n = 8;
-		
-		#ifdef HAVE_NEON
-			V neg = (sign < 0) ? VLIT4(0.0f, 0.0f, 0.0f, 0.0f) : VLIT4(-0.0f, -0.0f, -0.0f, -0.0f);
-		#endif
-		
-		for(i=0;i<n_luts;i++) {
-			p->ws_is[i] = w - (cdata_t *)p->ws;	
-			//fprintf(stderr, "LUT[%zu] = %d @ %08x - %zu\n", i, n, w, p->ws_is[i]);	
-			
-			if(!i || hardcoded) {
-				cdata_t *w0 = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
-
-				size_t j;
-				for(j=0;j<n/4;j++) {
-					w0[j][0]	= W_re(n,j);
-					w0[j][1]	= W_im(n,j);
-				}
-
-
-				float *fw0 = (float *)w0;
-				#ifdef __arm__
-					if(N < 32) {
-						//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
-						float *fw = (float *)w;
-						V temp0, temp1, temp2;
-						for(j=0;j<n/4;j+=2) {
-						//	#ifdef HAVE_NEON
-							temp0 = VLD(fw0 + j*2);
-							V re, im;
-							re = VDUPRE(temp0);
-							im = VDUPIM(temp0);
-							#ifdef HAVE_NEON 
-								im = VXOR(im, MULI_SIGN);
-								//im = IMULI(sign>0, im);
-							#else
-								im = MULI(sign>0, im);
-							#endif
-							VST(fw + j*4  , re);
-							VST(fw + j*4+4, im);
-					//		#endif
-						}
-						w += n/4 * 2;
-					}else{
-						//w = FFTS_MALLOC(n/4 * sizeof(cdata_t), 32);
-						float *fw = (float *)w;
-						#ifdef HAVE_NEON
-							VS temp0, temp1, temp2;
-							for(j=0;j<n/4;j+=4) {
-								temp0 = VLD2(fw0 + j*2);
-								temp0.val[1] = VXOR(temp0.val[1], neg);
-								STORESPR(fw + j*2, temp0);
-							}
-						#else
-							for(j=0;j<n/4;j+=1) {
-								fw[j*2] = fw0[j*2];
-								fw[j*2+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
-							}
-						#endif
-						w += n/4;
-					}
-				#else
-					//w = FFTS_MALLOC(n/4 * 2 * sizeof(cdata_t), 32);
-					float *fw = (float *)w;
-					V temp0, temp1, temp2;
-					for(j=0;j<n/4;j+=2) {
-						temp0 = VLD(fw0 + j*2);
-						V re, im;
-						re = VDUPRE(temp0);
-						im = VDUPIM(temp0);
-						im = VXOR(im, MULI_SIGN);
-						VST(fw + j*4  , re);
-						VST(fw + j*4+4, im);
-					}
-					w += n/4 * 2;
-				#endif
-
-				FFTS_FREE(w0);
-			}else{
-
-				cdata_t *w0 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
-				cdata_t *w1 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
-				cdata_t *w2 = FFTS_MALLOC(n/8 * sizeof(cdata_t), 32);
-
-				size_t j;
-				for(j=0;j<n/8;j++) {
-					w0[j][0]	= W_re(n,j*2);
-					w0[j][1]	= W_im(n,j*2);
-					w1[j][0]	= W_re(n,j);
-					w1[j][1]	= W_im(n,j);
-					w2[j][0]	= W_re(n,j + (n/8));
-					w2[j][1]	= W_im(n,j + (n/8));
-
-				}
-
-				float *fw0 = (float *)w0;
-				float *fw1 = (float *)w1;
-				float *fw2 = (float *)w2;
-				#ifdef __arm__
-					//w = FFTS_MALLOC(n/8 * 3 * sizeof(cdata_t), 32);
-					float *fw = (float *)w;
-					#ifdef HAVE_NEON	
-						VS temp0, temp1, temp2;
-						for(j=0;j<n/8;j+=4) {
-							temp0 = VLD2(fw0 + j*2);
-							temp0.val[1] = VXOR(temp0.val[1], neg);
-							STORESPR(fw + j*2*3,      temp0);
-							temp1 = VLD2(fw1 + j*2);
-							temp1.val[1] = VXOR(temp1.val[1], neg);
-							STORESPR(fw + j*2*3 + 8,  temp1);
-							temp2 = VLD2(fw2 + j*2);
-							temp2.val[1] = VXOR(temp2.val[1], neg);
-							STORESPR(fw + j*2*3 + 16, temp2);
-						}
-					#else
-						for(j=0;j<n/8;j+=1) {
-								fw[j*6] = fw0[j*2];
-								fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
-								fw[j*6+2] = fw1[j*2+0];
-								fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1];
-								fw[j*6+4] = fw2[j*2+0];
-								fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1];
-						}
-					#endif
-					w += n/8 * 3;
-				#else
-					//w = FFTS_MALLOC(n/8 * 3 * 2 * sizeof(cdata_t), 32);
-					float *fw = (float *)w;
-					V temp0, temp1, temp2, re, im;
-					for(j=0;j<n/8;j+=2) {
-						temp0 = VLD(fw0 + j*2);
-						re = VDUPRE(temp0);
-						im = VDUPIM(temp0);
-						im = VXOR(im, MULI_SIGN);
-						VST(fw + j*2*6  , re);
-						VST(fw + j*2*6+4, im);
-
-						temp1 = VLD(fw1 + j*2);
-						re = VDUPRE(temp1);
-						im = VDUPIM(temp1);
-						im = VXOR(im, MULI_SIGN);
-						VST(fw + j*2*6+8 , re);
-						VST(fw + j*2*6+12, im);
-
-						temp2 = VLD(fw2 + j*2);
-						re = VDUPRE(temp2);
-						im = VDUPIM(temp2);
-						im = VXOR(im, MULI_SIGN);
-						VST(fw + j*2*6+16, re);
-						VST(fw + j*2*6+20, im);
-					}
-					w += n/8 * 3 * 2;
-				#endif
-
-				FFTS_FREE(w0);
-				FFTS_FREE(w1);
-				FFTS_FREE(w2);
-			}
-			///p->ws[i] = w;
-
-			n *= 2;
-		}
-
-	float *tmp = (float *)p->ws;
-
-	if(sign < 0) {
-		p->oe_ws = (void *)(&w_data[4]);
-		p->ee_ws = (void *)(w_data);
-		p->eo_ws = (void *)(&w_data[4]);
-	}else{
-		p->oe_ws = (void *)(w_data + 12);
-		p->ee_ws = (void *)(w_data + 8);
-		p->eo_ws = (void *)(w_data + 12);
-	}
-
-	p->N = N;
-	p->lastlut = w;
-	p->n_luts = n_luts;
-#ifdef DYNAMIC_DISABLED
-	if(sign < 0) { 
-		if(N >= 32) p->transform = ffts_static_transform_f; 
-	}else{
-		if(N >= 32) p->transform = ffts_static_transform_i; 
-	}

+            FFTS_FREE(w0);
+        } else {
+            ffts_cpx_32f *w0 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
+            ffts_cpx_32f *w1 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
+            ffts_cpx_32f *w2 = (ffts_cpx_32f*) FFTS_MALLOC(n/8 * sizeof(ffts_cpx_32f), 32);
+
+            float *fw0 = (float*) w0;
+            float *fw1 = (float*) w1;
+            float *fw2 = (float*) w2;
+
+            float *fw = (float *)w;
+
+            for (j = 0; j < n/8; j++) {
+                w0[j][0] = tmp[2 * j * stride][0];
+                w0[j][1] = tmp[2 * j * stride][1];
+
+                w1[j][0] = tmp[j * stride][0];
+                w1[j][1] = tmp[j * stride][1];
+
+                w2[j][0] = tmp[(j + (n/8)) * stride][0];
+                w2[j][1] = tmp[(j + (n/8)) * stride][1];
+            }
+
+#if defined(__arm__)
+#ifdef HAVE_NEON
+            for (j = 0; j < n/8; j += 4) {
+                V4SF2 temp0, temp1, temp2;
+
+                temp0 = V4SF2_LD(fw0 + j*2);
+                temp0.val[1] = V4SF_XOR(temp0.val[1], neg);
+                V4SF2_STORE_SPR(fw + j*2*3, temp0);
+                
+                temp1 = V4SF2_LD(fw1 + j*2);
+                temp1.val[1] = V4SF_XOR(temp1.val[1], neg);
+                V4SF2_STORE_SPR(fw + j*2*3 + 8,  temp1);
+                
+                temp2 = V4SF2_LD(fw2 + j*2);
+                temp2.val[1] = V4SF_XOR(temp2.val[1], neg);
+                V4SF2_STORE_SPR(fw + j*2*3 + 16, temp2);
+            }
 #else
-	if(N>=32)  ffts_generate_func_code(p, N, leafN, sign);
+            for (j = 0; j < n/8; j++) {
+                fw[j*6+0] = fw0[j*2+0];
+                fw[j*6+1] = (sign < 0) ? fw0[j*2+1] : -fw0[j*2+1];
+                fw[j*6+2] = fw1[j*2+0];
+                fw[j*6+3] = (sign < 0) ? fw1[j*2+1] : -fw1[j*2+1];
+                fw[j*6+4] = fw2[j*2+0];
+                fw[j*6+5] = (sign < 0) ? fw2[j*2+1] : -fw2[j*2+1];
+            }
 #endif
+            w += n/8 * 3;
+#else
+            for (j = 0; j < n/8; j += 2) {
+                V4SF temp0, temp1, temp2, re, im;
+
+                temp0 = V4SF_LD(fw0 + j*2);
+                re = V4SF_DUPLICATE_RE(temp0);
+                im = V4SF_DUPLICATE_IM(temp0);
+                im = V4SF_XOR(im, MULI_SIGN);
+                V4SF_ST(fw + j*2*6+0, re);
+                V4SF_ST(fw + j*2*6+4, im);
+
+                temp1 = V4SF_LD(fw1 + j*2);
+                re = V4SF_DUPLICATE_RE(temp1);
+                im = V4SF_DUPLICATE_IM(temp1);
+                im = V4SF_XOR(im, MULI_SIGN);
+                V4SF_ST(fw + j*2*6+8 , re);
+                V4SF_ST(fw + j*2*6+12, im);
+
+                temp2 = V4SF_LD(fw2 + j*2);
+                re = V4SF_DUPLICATE_RE(temp2);
+                im = V4SF_DUPLICATE_IM(temp2);
+                im = V4SF_XOR(im, MULI_SIGN);
+                V4SF_ST(fw + j*2*6+16, re);
+                V4SF_ST(fw + j*2*6+20, im);
+            }
+
+            w += n/8 * 3 * 2;
+#endif
+
+            FFTS_FREE(w0);
+            FFTS_FREE(w1);
+            FFTS_FREE(w2);
+        }
+
+        n *= 2;
+        stride >>= 1;
+    }
+
+#if defined(HAVE_NEON)
+    if (sign < 0) {
+        p->oe_ws = (void*)(w_data + 4);
+        p->ee_ws = (void*)(w_data);
+        p->eo_ws = (void*)(w_data + 4);
+    } else {
+        p->oe_ws = (void*)(w_data + 12);
+        p->ee_ws = (void*)(w_data + 8);
+        p->eo_ws = (void*)(w_data + 12);
+    }
+#endif
+
+    FFTS_FREE(tmp);

-	return p;
+    p->lastlut = w;
+    p->n_luts = n_luts;
+    return 0;
+
+cleanup:
+    return -1;
 }

+/**
+ * Create a plan for a one-dimensional single-precision complex FFT.
+ *
+ * @param N    transform length; must be a power of two and at least 2
+ * @param sign transform direction: -1 selects the forward transform and
+ *             +1 the backward transform. Other values are not rejected:
+ *             for N >= 32 the code in this function only tests whether
+ *             sign is negative, for smaller N anything other than -1
+ *             selects the backward kernel.
+ *
+ * @return a newly allocated plan whose destroy callback is ffts_free_1d,
+ *         or NULL if N is invalid or any allocation or code generation
+ *         step fails
+ */
+FFTS_API ffts_plan_t*
+ffts_init_1d(size_t N, int sign)
+{
+    const size_t leaf_N = 8;
+    ffts_plan_t *p;
+
+    /* (N & (N - 1)) is zero only for powers of two */
+    if (N < 2 || (N & (N - 1)) != 0) {
+        LOG("FFT size must be a power of two\n");
+        return NULL;
+    }
+
+    /* zero-initialized so that the cleanup path sees NULL for every
+     * member that has not been set yet */
+    p = calloc(1, sizeof(*p));
+    if (!p) {
+        return NULL;
+    }
+
+    p->destroy = ffts_free_1d;
+    p->N = N;
+
+    if (N >= 32) {
+        /* generate lookup tables */
+        if (ffts_generate_luts(p, N, leaf_N, sign)) {
+            goto cleanup;
+        }
+
+        p->offsets = ffts_init_offsets(N, leaf_N);
+        if (!p->offsets) {
+            goto cleanup;
+        }
+
+        p->is = ffts_init_is(N, leaf_N, 1);
+        if (!p->is) {
+            goto cleanup;
+        }
+
+        /* split the N/leaf_N leaves into three loop counts i0, i1, i2 */
+        p->i0 = N/leaf_N/3 + 1;
+        p->i1 = p->i2 = N/leaf_N/3;
+        if ((N/leaf_N) % 3 > 1) {
+            p->i1++;
+        }
+
+#if !defined(HAVE_VFP) || defined(DYNAMIC_DISABLED)
+        p->i0 /= 2;
+        p->i1 /= 2;
+#endif
+
+#ifdef DYNAMIC_DISABLED
+        /* no run-time code generation: use the precompiled transforms */
+        if (sign < 0) {
+            p->transform = ffts_static_transform_f_32f;
+        } else {
+            p->transform = ffts_static_transform_i_32f;
+        }
+#else
+        /* determinate transform size */
+#if defined(__arm__)
+        if (N < 8192) {
+            p->transform_size = 8192;
+        } else {
+            p->transform_size = N;
+        }
+#else
+        if (N < 2048) {
+            p->transform_size = 16384;
+        } else {
+            p->transform_size = 16384 + 2*N/8 * ffts_ctzl(N);
+        }
+#endif
+
+        /* allocate code/function buffer */
+        p->transform_base = ffts_vmem_alloc(p->transform_size);
+        if (!p->transform_base) {
+            goto cleanup;
+        }
+
+        /* generate code */
+        p->transform = ffts_generate_func_code(p, N, leaf_N, sign);
+        if (!p->transform) {
+            goto cleanup;
+        }
+
+        /* enable execution with read access for the block */
+        if (ffts_allow_execute(p->transform_base, p->transform_size)) {
+            goto cleanup;
+        }
+
+        /* flush from the instruction cache */
+        if (ffts_flush_instruction_cache(p->transform_base, p->transform_size)) {
+            goto cleanup;
+        }
+#endif
+    } else {
+        /*
+         * Small sizes use fixed kernels. Every case assigns a transform:
+         * a sign other than -1 falls back to the backward kernel, so the
+         * returned plan never carries a NULL transform pointer.
+         */
+        switch (N) {
+        case 2:
+            p->transform = &ffts_small_2_32f;
+            break;
+        case 4:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward4_32f;
+            } else {
+                p->transform = &ffts_small_backward4_32f;
+            }
+            break;
+        case 8:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward8_32f;
+            } else {
+                p->transform = &ffts_small_backward8_32f;
+            }
+            break;
+        case 16:
+        default:
+            if (sign == -1) {
+                p->transform = &ffts_small_forward16_32f;
+            } else {
+                p->transform = &ffts_small_backward16_32f;
+            }
+            break;
+        }
+    }
+
+    return p;
+
+cleanup:
+    ffts_free_1d(p);
+    return NULL;
+}
--- a/lib/ffts/src/ffts.h
+++ b/lib/ffts/src/ffts.h
@ -1,177 +0,0 @@
-/*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-#ifndef __CP_SSE_H__
-#define __CP_SSE_H__
-
-#include "config.h"
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <math.h>
-#include <stddef.h>
-#include <stdint.h>
-//#include <stdalign.h>
-
-//#include "codegen.h"
-#include "types.h"
-
-#define PI 3.1415926535897932384626433832795028841971693993751058209
-
-static const __attribute__ ((aligned(64))) float w_data[16] = {
-	0.70710678118654757273731092936941,		0.70710678118654746171500846685376,
-	-0.70710678118654757273731092936941,	-0.70710678118654746171500846685376,
-    1.0f,									 0.70710678118654757273731092936941f, 
-	-0.0f,									-0.70710678118654746171500846685376,
-	0.70710678118654757273731092936941,		0.70710678118654746171500846685376,
-	0.70710678118654757273731092936941,		0.70710678118654746171500846685376,
-	1.0f,									0.70710678118654757273731092936941f, 
-	0.0f,									0.70710678118654746171500846685376
-};
-
-__INLINE float W_re(float N, float k) { return cos(-2.0f * PI * k / N); }
-__INLINE float W_im(float N, float k) { return sin(-2.0f * PI * k / N); }
-
-typedef size_t transform_index_t;
-
-//typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
-typedef void (*transform_func_t)(float *data, size_t N, float *LUT);
-
-typedef struct _ffts_plan_t ffts_plan_t;
-
-/**
- * Contains all the Information need to perform FFT
- *
- *
- * DO NOT CHANGE THE ORDER OF MEMBERS
- * ASSEMBLY CODE USES HARD CODED OFFSETS TO REFERENCE
- * SOME OF THESE VARIABES!!
- */
-struct _ffts_plan_t {
-
-	/**
-	 * 
-	 */
-	ptrdiff_t *offsets;
-#ifdef DYNAMIC_DISABLED
-	/**
-	 * Twiddle factors
-	 */
-	void *ws;
-	/**
-	 * ee - 2 size x  size8 
-	 * oo - 2 x size4 in parallel
-	 * oe - 
-	 */
-	void  *oe_ws, *eo_ws, *ee_ws;
-#else
-	void __attribute__((aligned(32))) *ws;
-	void __attribute__((aligned(32)))  *oe_ws, *eo_ws, *ee_ws;
-#endif
-	/** 
-	 * Pointer into an array of precomputed indexes for the input data array
-	 */
-	ptrdiff_t *is; 
-
-	/**
-	 * Twiddle Factor Indexes
-	 */
-	size_t *ws_is;
-	
-	/** 
-	 * Size of the loops for the base cases
-	 */
-	size_t i0, i1, n_luts;
-
-	/**
-	 * Size fo the Transform
-	 */
-	size_t N;
-	void *lastlut;
-	/**
-	 * Used in multidimensional Code ??
-	 */
-	transform_index_t *transforms; 
-	//transform_func_t transform;
-	
-	/** 
-	 * Pointer to the dynamically generated function 
-	 * that will execute the FFT
-	 */
-	void (*transform)(ffts_plan_t * , const void * , void * );
-
-	/**
-	 * Pointer to the base memory address of 
-	 * of the transform function
-	 */
-	void *transform_base;
-
-	/**
-	 * Size of the memory block contain the 
-	 * generated code
-	 */
-	size_t transform_size;
-
-	/**
-	 * Points to the cosnant variables used by
-	 * the Assembly Code 
-	 */
-	void *constants;
-	
-	// multi-dimensional stuff:
-	struct _ffts_plan_t **plans;
-	int rank;
-	size_t *Ns, *Ms;
-	void *buf;
-
-	void *transpose_buf;
-
-	/**
-	 * Pointer to the destroy function
-	 * to clean up the plan after use
-	 * (differs for real and multi dimension transforms
-	 */
-	void (*destroy)(ffts_plan_t *);
-
-	/**
-	 * Coefficiants for the real valued transforms
-	 */
-	float *A, *B;
-			
-	size_t i2;
-};
-
-
-void ffts_free(ffts_plan_t *);
-ffts_plan_t *ffts_init_1d(size_t N, int sign); 
-void ffts_execute(ffts_plan_t *, const void *, void *);
-#endif
--- a/lib/ffts/src/ffts_attributes.h
+++ b/lib/ffts/src/ffts_attributes.h
@ -0,0 +1,111 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ 	* Redistributions of source code must retain the above copyright
+ 		notice, this list of conditions and the following disclaimer.
+ 	* Redistributions in binary form must reproduce the above copyright
+ 		notice, this list of conditions and the following disclaimer in the
+ 		documentation and/or other materials provided with the distribution.
+ 	* Neither the name of the organization nor the
+	  names of its contributors may be used to endorse or promote products
+ 		derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_ATTRIBUTES_H
+#define FFTS_ATTRIBUTES_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+/* Macro definitions for various function/variable attributes */
+
+/*
+ * GCC_VERSION_AT_LEAST(x,y) is non-zero when __GNUC__ is defined and the
+ * compiler version is at least x.y; it is 0 for every other compiler.
+ * Arguments and sub-expressions are fully parenthesized so the macro is
+ * safe with expression arguments and does not rely on && / || precedence.
+ */
+#ifdef __GNUC__
+#define GCC_VERSION_AT_LEAST(x,y) \
+	(__GNUC__ > (x) || (__GNUC__ == (x) && __GNUC_MINOR__ >= (y)))
+#else
+#define GCC_VERSION_AT_LEAST(x,y) 0
+#endif
+
+/* FFTS_ALIGN(x): request x-byte alignment; expands to nothing when the
+ * compiler is neither GNU-compatible nor MSVC */
+#ifdef __GNUC__
+#define FFTS_ALIGN(x) __attribute__((aligned(x)))
+#elif defined(_MSC_VER)
+#define FFTS_ALIGN(x) __declspec(align(x))
+#else
+#define FFTS_ALIGN(x)
+#endif
+
+/* FFTS_ALWAYS_INLINE: force inlining where supported, plain inline otherwise */
+#if GCC_VERSION_AT_LEAST(3,1)
+#define FFTS_ALWAYS_INLINE __attribute__((always_inline)) inline
+#elif defined(_MSC_VER)
+#define FFTS_ALWAYS_INLINE __forceinline
+#else
+#define FFTS_ALWAYS_INLINE inline
+#endif
+
+/* FFTS_INLINE: portable spelling of the inline keyword */
+#if defined(_MSC_VER)
+#define FFTS_INLINE __inline
+#else
+#define FFTS_INLINE inline
+#endif
+
+/* FFTS_RESTRICT: restrict qualifier; expands to nothing when unsupported */
+#if defined(__GNUC__)
+#define FFTS_RESTRICT __restrict
+#elif defined(_MSC_VER)
+#define FFTS_RESTRICT __restrict
+#else
+#define FFTS_RESTRICT
+#endif
+
+/* FFTS_ASSUME(cond): tell the optimizer that cond always holds; expands to
+ * nothing (cond is not evaluated) when no such facility is available */
+#if GCC_VERSION_AT_LEAST(4,5)
+#define FFTS_ASSUME(cond) do { if (!(cond)) __builtin_unreachable(); } while (0)
+#elif defined(_MSC_VER)
+#define FFTS_ASSUME(cond) __assume(cond)
+#else
+#define FFTS_ASSUME(cond)
+#endif
+
+/* FFTS_ASSUME_ALIGNED_16/32(x): yield pointer x, annotated as 16/32-byte
+ * aligned where the compiler supports it. The fallback parenthesizes x so
+ * both variants behave as a single primary expression. */
+#if GCC_VERSION_AT_LEAST(4,7)
+#define FFTS_ASSUME_ALIGNED_16(x) __builtin_assume_aligned((x), 16)
+#else
+#define FFTS_ASSUME_ALIGNED_16(x) (x)
+#endif
+
+#if GCC_VERSION_AT_LEAST(4,7)
+#define FFTS_ASSUME_ALIGNED_32(x) __builtin_assume_aligned((x), 32)
+#else
+#define FFTS_ASSUME_ALIGNED_32(x) (x)
+#endif
+
+/* FFTS_LIKELY / FFTS_UNLIKELY: branch prediction hints. The fallbacks
+ * parenthesize cond so that e.g. !FFTS_LIKELY(a == b) parses the same way
+ * with every compiler. */
+#if defined(__GNUC__)
+#define FFTS_LIKELY(cond) __builtin_expect(!!(cond), 1)
+#else
+#define FFTS_LIKELY(cond) (cond)
+#endif
+
+#if defined(__GNUC__)
+#define FFTS_UNLIKELY(cond) __builtin_expect(!!(cond), 0)
+#else
+#define FFTS_UNLIKELY(cond) (cond)
+#endif
+
+#endif /* FFTS_ATTRIBUTES_H */
--- a/lib/ffts/src/ffts_dd.h
+++ b/lib/ffts/src/ffts_dd.h
@ -0,0 +1,230 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_DD_H
+#define FFTS_DD_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts_attributes.h"
+
+#if HAVE_SSE2
+#include <emmintrin.h>
+#endif
+
+/* double-double number: a value carried as a pair of doubles, where the
+ * helpers in this header store the rounded result in hi and a correction
+ * term in lo */
+struct ffts_dd_t
+{
+    double hi;
+    double lo;
+};
+
+#if HAVE_SSE2
+/* double-double vector: same hi/lo layout, with each member holding a
+ * __m128d (two doubles) */
+struct ffts_dd2_t {
+    __m128d hi;
+    __m128d lo;
+};
+#endif
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add_dd_unnormalized(const struct ffts_dd_t a,
+                            const struct ffts_dd_t b);
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul_dd_unnormalized(const struct ffts_dd_t a,
+                            const struct ffts_dd_t b);
+
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_split(double a);
+
+/*
+ * aka quick-two-sum
+ *
+ * Adds two doubles: hi receives the rounded sum a + b and lo the
+ * correction term b - (hi - a).
+ *
+ * NOTE(review): the textbook quick-two-sum recovers the rounding error
+ * exactly only when |a| >= |b|; nothing here checks that -- confirm that
+ * callers respect it.
+ */
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add(double a, double b)
+{
+    struct ffts_dd_t dd;
+    dd.hi = a + b;
+    dd.lo = b - (dd.hi - a);
+    return dd;
+}
+
+/*
+ * Sum of two double-double numbers: the unnormalized sum is computed first
+ * and its hi/lo pair is then passed through ffts_dd_add().
+ */
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add_dd(const struct ffts_dd_t a,
+               const struct ffts_dd_t b)
+{
+    const struct ffts_dd_t raw = ffts_dd_add_dd_unnormalized(a, b);
+
+    return ffts_dd_add(raw.hi, raw.lo);
+}
+
+/*
+ * Sum of two double-double numbers without the final renormalization.
+ *
+ * hi is the rounded sum of the two high words. lo collects the rounding
+ * error of that sum, recovered from e1 without assuming which operand is
+ * larger, plus the sum of the two low words. The order of the operations
+ * is significant; do not re-associate them.
+ */
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_add_dd_unnormalized(const struct ffts_dd_t a,
+                            const struct ffts_dd_t b)
+{
+    struct ffts_dd_t dd;
+    double e1;
+    dd.hi = a.hi + b.hi;
+    e1 = dd.hi - a.hi;
+    dd.lo = ((a.hi - (dd.hi - e1)) + (b.hi - e1)) + (a.lo + b.lo);
+    return dd;
+}
+
+/*
+ * Product of two doubles as a double-double.
+ *
+ * hi is the rounded product a * b. lo is the correction term built from the
+ * split halves of a and b (see ffts_dd_split): the partial products are
+ * accumulated against hi from the largest to the smallest term. The order
+ * of the operations is significant; do not re-associate them.
+ */
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul(const double a, const double b)
+{
+    struct ffts_dd_t dd;
+    struct ffts_dd_t t1 = ffts_dd_split(a);
+    struct ffts_dd_t t2 = ffts_dd_split(b);
+    dd.hi = a * b;
+    dd.lo = (t1.hi * t2.hi - dd.hi);
+    dd.lo += (t1.hi * t2.lo + t1.lo * t2.hi);
+    dd.lo += t1.lo * t2.lo;
+    return dd;
+}
+
+/*
+ * Product of two double-double numbers: the unnormalized product is
+ * computed first and its hi/lo pair is then passed through ffts_dd_add().
+ */
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul_dd(const struct ffts_dd_t a,
+               const struct ffts_dd_t b)
+{
+    const struct ffts_dd_t raw = ffts_dd_mul_dd_unnormalized(a, b);
+
+    return ffts_dd_add(raw.hi, raw.lo);
+}
+
+/*
+ * Product of two double-double numbers without the final renormalization.
+ *
+ * Starts from the double-double product of the two high words and adds the
+ * cross terms a.hi * b.lo and a.lo * b.hi to its low word; the a.lo * b.lo
+ * term is not included.
+ */
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_mul_dd_unnormalized(const struct ffts_dd_t a,
+                            const struct ffts_dd_t b)
+{
+    struct ffts_dd_t dd = ffts_dd_mul(a.hi, b.hi);
+    dd.lo += (a.hi * b.lo + a.lo * b.hi);
+    return dd;
+}
+
+/*
+ * Split a double into a high part and a low part with hi + lo == a.
+ *
+ * Multiplying by 2^27 + 1 and subtracting back leaves the leading bits of
+ * a in hi; lo is the remainder a - hi.
+ *
+ * NOTE(review): the intermediate product t can overflow for very large
+ * |a|; no guard is present -- confirm the inputs stay well below that.
+ */
+static FFTS_INLINE struct ffts_dd_t
+ffts_dd_split(double a)
+{
+    /* 2^27+1 = 134217729 */
+    struct ffts_dd_t dd;
+    double t = 134217729.0 * a;
+    dd.hi = t - (t - a);
+    dd.lo = a - dd.hi;
+    return dd;
+}
+
+#if HAVE_SSE2
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add_dd2_unnormalized(const struct ffts_dd2_t *const FFTS_RESTRICT a,
+                              const struct ffts_dd2_t *const FFTS_RESTRICT b);
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul_dd2_unnormalized(const struct ffts_dd2_t *const FFTS_RESTRICT a,
+                              const struct ffts_dd2_t *const FFTS_RESTRICT b);
+
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_split(__m128d a);
+
+/*
+ * SSE2 counterpart of ffts_dd_add(): for each of the two lanes, hi receives
+ * the rounded sum a + b and lo the correction term b - (hi - a).
+ *
+ * NOTE(review): as with the scalar quick-two-sum, the correction is exact
+ * only when |a| >= |b| in each lane -- confirm that callers respect it.
+ */
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add(__m128d a, __m128d b)
+{
+    struct ffts_dd2_t dd2;
+    dd2.hi = _mm_add_pd(a, b);
+    dd2.lo = _mm_sub_pd(b, _mm_sub_pd(dd2.hi, a));
+    return dd2;
+}
+
+/*
+ * Sum of two double-double vectors: the unnormalized sum is computed first
+ * and its hi/lo pair is then passed through ffts_dd2_add().
+ */
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add_dd2(const struct ffts_dd2_t *const FFTS_RESTRICT a,
+                 const struct ffts_dd2_t *const FFTS_RESTRICT b)
+{
+    const struct ffts_dd2_t raw = ffts_dd2_add_dd2_unnormalized(a, b);
+
+    return ffts_dd2_add(raw.hi, raw.lo);
+}
+
+/*
+ * SSE2 counterpart of ffts_dd_add_dd_unnormalized(), operating on both
+ * lanes at once.
+ *
+ * hi is the rounded sum of the high words. lo collects the rounding error
+ * of that sum, recovered from e1, plus the sum of the low words. The
+ * nesting of the intrinsics fixes the evaluation order; keep it as is.
+ */
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_add_dd2_unnormalized(const struct ffts_dd2_t *const FFTS_RESTRICT a,
+                              const struct ffts_dd2_t *const FFTS_RESTRICT b)
+{
+    struct ffts_dd2_t dd2;
+    __m128d e1;
+    dd2.hi = _mm_add_pd(a->hi, b->hi);
+    e1 = _mm_sub_pd(dd2.hi, a->hi);
+    dd2.lo = _mm_add_pd(_mm_add_pd(_mm_sub_pd(a->hi, _mm_sub_pd(dd2.hi, e1)),
+        _mm_sub_pd(b->hi, e1)), _mm_add_pd(a->lo, b->lo));
+    return dd2;
+}
+
+/*
+ * SSE2 counterpart of ffts_dd_mul(): lane-wise product of two double
+ * vectors as a double-double vector.
+ *
+ * hi is the rounded product a * b. lo is the correction term built from the
+ * split halves of a and b (see ffts_dd2_split). The nesting of the
+ * intrinsics fixes the evaluation order; keep it as is.
+ */
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul(const __m128d a, const __m128d b)
+{
+    struct ffts_dd2_t dd2;
+    struct ffts_dd2_t t1 = ffts_dd2_split(a);
+    struct ffts_dd2_t t2 = ffts_dd2_split(b);
+    dd2.hi = _mm_mul_pd(a, b);
+    dd2.lo = _mm_add_pd(_mm_add_pd(_mm_sub_pd(
+        _mm_mul_pd(t1.hi, t2.hi), dd2.hi),
+        _mm_add_pd(_mm_mul_pd(t1.hi, t2.lo),
+        _mm_mul_pd(t1.lo, t2.hi))),
+        _mm_mul_pd(t1.lo, t2.lo));
+    return dd2;
+}
+
+/*
+ * Product of two double-double vectors: the unnormalized product is
+ * computed first and its hi/lo pair is then passed through ffts_dd2_add().
+ */
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul_dd2(const struct ffts_dd2_t *const FFTS_RESTRICT a,
+                 const struct ffts_dd2_t *const FFTS_RESTRICT b)
+{
+    const struct ffts_dd2_t raw = ffts_dd2_mul_dd2_unnormalized(a, b);
+
+    return ffts_dd2_add(raw.hi, raw.lo);
+}
+
+/*
+ * SSE2 counterpart of ffts_dd_mul_dd_unnormalized().
+ *
+ * Starts from the double-double product of the two high words and adds the
+ * cross terms a->hi * b->lo and a->lo * b->hi to its low word; the
+ * a->lo * b->lo term is not included.
+ */
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_mul_dd2_unnormalized(const struct ffts_dd2_t *const FFTS_RESTRICT a,
+                              const struct ffts_dd2_t *const FFTS_RESTRICT b)
+{
+    struct ffts_dd2_t dd2 = ffts_dd2_mul(a->hi, b->hi);
+    dd2.lo = _mm_add_pd(dd2.lo, _mm_add_pd(
+        _mm_mul_pd(a->hi, b->lo), _mm_mul_pd(a->lo, b->hi)));
+    return dd2;
+}
+
+/*
+ * SSE2 counterpart of ffts_dd_split(): splits each lane of a into a high
+ * part and a low part with hi + lo == a, using the multiplier 2^27 + 1.
+ */
+static FFTS_INLINE struct ffts_dd2_t
+ffts_dd2_split(__m128d a)
+{
+    /* 2^27+1 = 134217729 */
+    struct ffts_dd2_t dd2;
+    __m128d t = _mm_mul_pd(a, _mm_set1_pd(134217729.0));
+    dd2.hi = _mm_sub_pd(t, _mm_sub_pd(t, a));
+    dd2.lo = _mm_sub_pd(a, dd2.hi);
+    return dd2;
+}
+#endif /* HAVE_SSE2 */
+
+#endif /* FFTS_DD_H */
--- a/lib/ffts/src/ffts_internal.h
+++ b/lib/ffts/src/ffts_internal.h
@ -0,0 +1,215 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_INTERNAL_H
+#define FFTS_INTERNAL_H
+
+//#include "config.h"
+#include "ffts_attributes.h"
+#include "types.h"
+
+#ifdef HAVE_MALLOC_H
+#include <malloc.h>
+#endif
+
+#include <stddef.h>
+
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+
+#include <stdio.h>
+
+/*
+ * LOG(s) emits the message string s when ENABLE_LOG is defined (to the
+ * Android log on Android, to stderr elsewhere) and expands to nothing
+ * otherwise. The message is passed as data through a "%s" format, never as
+ * the format string itself, so a '%' in the message cannot be interpreted
+ * as a conversion specification.
+ */
+#ifdef ENABLE_LOG
+#ifdef __ANDROID__
+#include <android/log.h>
+#define LOG(s) __android_log_print(ANDROID_LOG_ERROR, "FFTS", "%s", (s))
+#else
+#define LOG(s) fprintf(stderr, "%s", (s))
+#endif
+#else
+#define LOG(s)
+#endif
+
+struct _ffts_plan_t;
+typedef void (*transform_func_t)(struct _ffts_plan_t *p, const void *in, void *out);
+
+/**
+ * Contains all the information needed to perform an FFT
+ *
+ *
+ * DO NOT CHANGE THE ORDER OF MEMBERS
+ * ASSEMBLY CODE USES HARD CODED OFFSETS TO REFERENCE
+ * SOME OF THESE VARIABLES!!
+ */
+struct _ffts_plan_t {
+
+    /**
+     * Offset table; ffts_init_1d fills it from ffts_init_offsets()
+     */
+    ptrdiff_t *offsets;
+#ifdef DYNAMIC_DISABLED
+    /**
+     * Twiddle factors
+     */
+    void *ws;
+
+    /**
+     * ee - 2 size x  size8
+     * oo - 2 x size4 in parallel
+     * oe -
+     */
+    void  *oe_ws, *eo_ws, *ee_ws;
+#else
+    void FFTS_ALIGN(32) *ws;
+    void FFTS_ALIGN(32) *oe_ws, *eo_ws, *ee_ws;
+#endif
+
+    /**
+     * Pointer into an array of precomputed indexes for the input data array
+     */
+    ptrdiff_t *is;
+
+    /**
+     * Twiddle Factor Indexes
+     */
+    size_t *ws_is;
+
+    /**
+     * Size of the loops for the base cases
+     */
+    size_t i0, i1, n_luts;
+
+    /**
+     * Size of the transform
+     */
+    size_t N;
+    void *lastlut;
+
+#ifdef __arm__
+    /* NOTE(review): judging by its name this member only exists so that the
+     * members below stay at the offsets the ARM dynamic code expects --
+     * confirm before removing or moving it */
+    size_t *temporary_fix_as_dynamic_code_assumes_fixed_offset;
+#endif
+
+    /**
+     * Pointer to the dynamically generated function
+     * that will execute the FFT
+     */
+    transform_func_t transform;
+
+    /**
+     * Pointer to the base memory address
+     * of the transform function
+     */
+    void *transform_base;
+
+    /**
+     * Size of the memory block containing the
+     * generated code
+     */
+    size_t transform_size;
+
+    /**
+     * Points to the constant variables used by
+     * the assembly code
+     */
+    void *constants;
+
+    // multi-dimensional stuff:
+    struct _ffts_plan_t **plans;
+    int rank;
+    size_t *Ns, *Ms;
+    void *buf;
+
+    void *transpose_buf;
+
+    /**
+     * Pointer to the destroy function
+     * to clean up the plan after use
+     * (differs for real and multi-dimensional transforms)
+     */
+    void (*destroy)(struct _ffts_plan_t *);
+
+    /**
+     * Coefficients for the real valued transforms
+     */
+    float *A, *B;
+
+    size_t i2;
+};
+
+/*
+ * Allocate size bytes of aligned memory.
+ *
+ * On Windows the block is 32-byte aligned (_aligned_malloc); elsewhere
+ * valloc() is used. The result must be released with ffts_aligned_free(),
+ * which pairs each allocator with the matching deallocator.
+ *
+ * NOTE(review): valloc() is documented as obsolete and is not available on
+ * every libc; posix_memalign() may be the safer choice -- confirm against
+ * the supported targets.
+ */
+static FFTS_INLINE void *ffts_aligned_malloc(size_t size)
+{
+#if defined(_WIN32)
+    return _aligned_malloc(size, 32);
+#else
+    return valloc(size);
+#endif
+}
+
+/*
+ * Release a block obtained from ffts_aligned_malloc(): _aligned_free() on
+ * Windows, free() elsewhere.
+ */
+static FFTS_INLINE void ffts_aligned_free(void *p)
+{
+#if defined(_WIN32)
+    _aligned_free(p);
+#else
+    free(p);
+#endif
+}
+
+/*
+ * ffts_ctzl(N): index of the least significant set bit of N (the number of
+ * trailing zero bits). Maps to __builtin_ctzl on GCC >= 3.3 and to the
+ * _BitScanForward intrinsics on MSVC.
+ *
+ * The result is not defined for N == 0: __builtin_ctzl is undefined for a
+ * zero argument, and the MSVC variants return count uninitialized because
+ * the intrinsic's return value is ignored.
+ *
+ * NOTE(review): no definition is provided for compilers that are neither
+ * GCC-compatible nor MSVC, and the non-x64 MSVC branch passes a size_t to a
+ * 32-bit intrinsic -- confirm this is acceptable for the supported targets.
+ */
+#if GCC_VERSION_AT_LEAST(3,3)
+#define ffts_ctzl __builtin_ctzl
+#elif defined(_MSC_VER)
+#include <intrin.h>
+#ifdef _M_X64
+#pragma intrinsic(_BitScanForward64)
+static __inline unsigned long ffts_ctzl(size_t N)
+{
+    unsigned long count;
+    _BitScanForward64((unsigned long*) &count, N);
+    return count;
+}
+#else
+#pragma intrinsic(_BitScanForward)
+static __inline unsigned long ffts_ctzl(size_t N)
+{
+    unsigned long count;
+    _BitScanForward((unsigned long*) &count, N);
+    return count;
+}
+#endif /* _M_X64 */
+#endif /* _MSC_VER */
+
+#endif /* FFTS_INTERNAL_H */
--- a/lib/ffts/src/ffts_nd.c
+++ b/lib/ffts/src/ffts_nd.c
@ -1,282 +1,193 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-*/
-
-#include "ffts_nd.h"
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#ifdef HAVE_NEON
-#include "neon.h"
-#endif
-
-void ffts_free_nd(ffts_plan_t *p) {
-
-	int i;
-	for(i=0;i<p->rank;i++) {
-		
-		ffts_plan_t *x = p->plans[i];
-		int k;
-		for(k=0;k<i;k++) {
-			if(p->Ms[i] == p->Ms[k]) x = NULL;
-		}
-		
-		if(x)	ffts_free(x);
-	}
-
-	free(p->Ns);
-	free(p->Ms);
-	free(p->plans);
-	free(p->buf);
-	free(p->transpose_buf);
-	free(p);
-}
-#define TSIZE 8
-#include <string.h>
-void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
-
-#ifdef HAVE_NEON 
-	size_t i,j,k;
-	int linebytes = w*8;
-
-	for(j=0;j<h;j+=8) {
-		for(i=0;i<w;i+=8) {
-			neon_transpose_to_buf(in + j*w + i, buf, w);
-
-			uint64_t *p = out + i*h + j;
-			uint64_t *pbuf = buf;
-			uint64_t *ptemp;
-
-			__asm__ __volatile__(
-							 "mov %[ptemp], %[p]\n\t"
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
-							 "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t" 
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t"
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
-							 "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t" 
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t"
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
-							 "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t" 
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t"
-							 "add %[p], %[p], %[w], lsl #3\n\t"
-							 "vld1.32 {q8,q9}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q10,q11}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q12,q13}, [%[pbuf], :128]!\n\t"
-							 "vld1.32 {q14,q15}, [%[pbuf], :128]!\n\t"
-							 "vst1.32 {q8,q9}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q10,q11}, [%[ptemp], :128]!\n\t"
-							 "mov %[ptemp], %[p]\n\t" 
-							 "vst1.32 {q12,q13}, [%[ptemp], :128]!\n\t"
-							 "vst1.32 {q14,q15}, [%[ptemp], :128]!\n\t"
-							 
-						: [p] "+r" (p), [pbuf] "+r" (pbuf), [ptemp] "+r" (ptemp)
-						: [w] "r" (w)
-						: "memory", "q8", "q9", "q10", "q11"
-						);
-//			out[i*h + j] = in[j*w + i];
-		}
-	}
-#else
-#ifdef HAVE_SSE
-	uint64_t tmp[TSIZE*TSIZE] __attribute__((aligned(64)));
-	int tx, ty;
-	int x, y;
-	int tw = w / TSIZE;
-	int th = h / TSIZE;
-	for (ty=0;ty<th;ty++) {
-		for (tx=0;tx<tw;tx++) {
-			uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE;
-			uint64_t *op0 = tmp;//out + h*TSIZE*tx + ty*TSIZE;
-
-			// Copy/transpose to tmp
-			for (y=0;y<TSIZE;y+=2) {
-				//for (x=0;x<TSIZE;x+=2) {
-					//op[x*TSIZE] = ip[x];
-					__m128d q0 = _mm_load_pd((double *)(ip0 + 0*w));
-					__m128d q1 = _mm_load_pd((double *)(ip0 + 1*w));
-					__m128d q2 = _mm_load_pd((double *)(ip0 + 2*w));
-					__m128d q3 = _mm_load_pd((double *)(ip0 + 3*w));
-					__m128d q4 = _mm_load_pd((double *)(ip0 + 4*w));
-					__m128d q5 = _mm_load_pd((double *)(ip0 + 5*w));
-					__m128d q6 = _mm_load_pd((double *)(ip0 + 6*w));
-					__m128d q7 = _mm_load_pd((double *)(ip0 + 7*w));
-					ip0 += 2;	
-					
-					__m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
-					__m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
-					__m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
-					__m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
-					__m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
-					__m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
-					__m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
-					__m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
-					//_mm_store_pd((double *)(op0 + y*h + x), t0);
-					//_mm_store_pd((double *)(op0 + y*h + x + h), t1);
-					_mm_store_pd((double *)(op0 + 0), t0);
-					_mm_store_pd((double *)(op0 + 0 + TSIZE), t1);
-					_mm_store_pd((double *)(op0 + 2 ), t2);
-					_mm_store_pd((double *)(op0 + 2 + TSIZE), t3);
-					_mm_store_pd((double *)(op0 + 4 ), t4);
-					_mm_store_pd((double *)(op0 + 4 + TSIZE), t5);
-					_mm_store_pd((double *)(op0 + 6 ), t6);
-					_mm_store_pd((double *)(op0 + 6 + TSIZE), t7);
-				//}
-				op0 += 2*TSIZE;
-			}
-			
-			op0 = out + h*tx*TSIZE + ty*TSIZE;
-			ip0 = tmp;
-			for (y=0;y<TSIZE;y+=1) {
-		//		memcpy(op0, ip0, TSIZE * sizeof(*ip0));
-				
-				__m128d q0 = _mm_load_pd((double *)(ip0 + 0));
-				__m128d q1 = _mm_load_pd((double *)(ip0 + 2));
-				__m128d q2 = _mm_load_pd((double *)(ip0 + 4));
-				__m128d q3 = _mm_load_pd((double *)(ip0 + 6));
-				_mm_store_pd((double *)(op0 + 0), q0);
-				_mm_store_pd((double *)(op0 + 2), q1);
-				_mm_store_pd((double *)(op0 + 4), q2);
-				_mm_store_pd((double *)(op0 + 6), q3);
-				
-				op0 += h;
-				ip0 += TSIZE;
-			}
-
-		}
-	}
-/*
-	size_t i,j;
-	for(i=0;i<w;i+=2) {
-		for(j=0;j<h;j+=2) {
-//		out[i*h + j] = in[j*w + i];
-  		__m128d q0 = _mm_load_pd((double *)(in + j*w + i));
-  		__m128d q1 = _mm_load_pd((double *)(in + j*w + i + w));
-  		__m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
-  		__m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
-  		_mm_store_pd((double *)(out + i*h + j), t0);
-  		_mm_store_pd((double *)(out + i*h + j + h), t1);
-		}
-	}
 */
-#endif
-#endif
-
-}
-
-void ffts_execute_nd(ffts_plan_t *p, const void *  in, void *  out) {
-
-	uint64_t *din = (uint64_t *)in;
-	uint64_t *buf = p->buf;
-	uint64_t *dout = (uint64_t *)out;

-	size_t i,j;
-	for(i=0;i<p->Ns[0];i++) {
-		p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * p->Ms[0]));	
-	}
-	ffts_transpose(buf, dout, p->Ms[0], p->Ns[0], p->transpose_buf);	
-
-	for(i=1;i<p->rank;i++) {
-		for(j=0;j<p->Ns[i];j++) { 
-			p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));	
-		}
-		ffts_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);	
-	}
+#include "ffts_nd.h"
+#include "ffts_internal.h"
+#include "ffts_transpose.h"
+
+/*
+ * Destroy callback for multi-dimensional plans: releases the per-dimension
+ * sub-plans, the size tables, the work buffer and finally the plan itself.
+ * Each member is checked for NULL before it is released.
+ */
+static void
+ffts_free_nd(ffts_plan_t *p)
+{
+    int cur, prev;
+
+    if (p->plans) {
+        for (cur = 0; cur < p->rank; ++cur) {
+            ffts_plan_t *sub = p->plans[cur];
+            int shared = 0;
+
+            if (!sub) {
+                continue;
+            }
+
+            /*
+             * a dimension whose length equals that of an earlier dimension
+             * is treated as an alias of that earlier sub-plan and skipped,
+             * so every sub-plan is released only once
+             */
+            for (prev = 0; prev < cur && !shared; ++prev) {
+                shared = (p->Ns[cur] == p->Ns[prev]);
+            }
+
+            if (!shared) {
+                ffts_free(sub);
+            }
+        }
+
+        free(p->plans);
+    }
+
+    if (p->Ns) {
+        free(p->Ns);
+    }
+
+    if (p->Ms) {
+        free(p->Ms);
+    }
+
+    if (p->buf) {
+        ffts_aligned_free(p->buf);
+    }
+
+    free(p);
 }

-ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign) {
-	size_t vol = 1;
+/*
+ * Transform callback for multi-dimensional plans.
+ *
+ * in and out are walked as arrays of 64-bit elements (presumably one
+ * interleaved complex float pair per element -- confirm against ffts.h).
+ * For every dimension i the 1-d plan plans[i] is run over Ms[i]
+ * consecutive rows of Ns[i] elements, writing into the scratch buffer
+ * p->buf; the scratch buffer is then transposed into out. The first
+ * pass reads its rows from in, every later pass reads them from out.
+ */
+static void
+ffts_execute_nd(ffts_plan_t *p, const void *in, void *out)
+{
+    uint64_t *din = (uint64_t*) in;
+    uint64_t *buf = p->buf;
+    uint64_t *dout = (uint64_t*) out;

-	ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
+    ffts_plan_t *plan;
+    int i;
+    size_t j;

-	p->transform = &ffts_execute_nd;
-	p->destroy = &ffts_free_nd;
+    /* first dimension: rows come from the caller's input */
+    plan = p->plans[0];
+    for (j = 0; j < p->Ms[0]; j++) {
+        plan->transform(plan, din + (j * p->Ns[0]), buf + (j * p->Ns[0]));
+    }

-	p->rank = rank;
-	p->Ns = malloc(sizeof(size_t) * rank);
-	p->Ms = malloc(sizeof(size_t) * rank);
-	p->plans = malloc(sizeof(ffts_plan_t **) * rank);
-	int i;
-	for(i=0;i<rank;i++) {
-		p->Ns[i] = Ns[i];
-		vol *= Ns[i];	
-	}
-	p->buf = valloc(sizeof(float) * 2 * vol);
+    ffts_transpose(buf, dout, p->Ns[0], p->Ms[0]);

-	for(i=0;i<rank;i++) {
-		p->Ms[i] = vol / p->Ns[i];
+    /* remaining dimensions: rows come from the previous transpose in out */
+    for (i = 1; i < p->rank; i++) {
+        plan = p->plans[i];

-		p->plans[i] = NULL;
-		int k;
-		for(k=0;k<i;k++) {
-			if(p->Ms[k] == p->Ms[i]) 
-				p->plans[i] = p->plans[k];
-		}
+        for (j = 0; j < p->Ms[i]; j++) {
+            plan->transform(plan, dout + (j * p->Ns[i]), buf + (j * p->Ns[i]));
+        }

-		if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); 
-	}
+        ffts_transpose(buf, dout, p->Ns[i], p->Ms[i]);
+    }
+}

-	p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
-	return p;
+/*
+ * Create a plan for a transform over rank dimensions.
+ *
+ *   rank  number of dimensions (must be >= 1)
+ *   Ns    array of rank dimension lengths; for rank > 1 every length
+ *         must be non-zero
+ *   sign  forwarded unchanged to ffts_init_1d()
+ *
+ * Returns the new plan, or NULL on invalid arguments or when an
+ * allocation or the creation of a 1-d sub-plan fails. A rank of 1 is
+ * delegated directly to ffts_init_1d().
+ */
+FFTS_API ffts_plan_t*
+ffts_init_nd(int rank, size_t *Ns, int sign)
+{
+    ffts_plan_t *p;
+    size_t vol = 1;
+    int i, j;
+
+    if (!Ns || rank < 1) {
+        return NULL;
+    }
+
+    if (rank == 1) {
+        return ffts_init_1d(Ns[0], sign);
+    }
+
+    p = calloc(1, sizeof(*p));
+    if (!p) {
+        return NULL;
+    }
+
+    p->transform = &ffts_execute_nd;
+    p->destroy   = &ffts_free_nd;
+    p->rank      = rank;
+
+    p->Ms = malloc(rank * sizeof(*p->Ms));
+    if (!p->Ms) {
+        goto cleanup;
+    }
+
+    p->Ns = malloc(rank * sizeof(*p->Ns));
+    if (!p->Ns) {
+        goto cleanup;
+    }
+
+    /* reverse the order */
+    for (i = 0; i < rank; i++) {
+        size_t N = Ns[rank - i - 1];
+
+        /*
+         * reject zero-length dimensions (vol / N below would divide by
+         * zero) and volumes whose buffer size does not fit in size_t
+         */
+        if (N == 0 || vol > ((size_t) -1) / (2 * sizeof(float)) / N) {
+            goto cleanup;
+        }
+
+        p->Ns[i] = N;
+        vol *= N;
+    }
+
+    /* scratch buffer: two floats per element of the whole volume */
+    p->buf = ffts_aligned_malloc(2 * vol * sizeof(float));
+    if (!p->buf) {
+        goto cleanup;
+    }
+
+    /* zero-initialised so that unset entries read as "no plan yet" */
+    p->plans = calloc(rank, sizeof(*p->plans));
+    if (!p->plans) {
+        goto cleanup;
+    }
+
+    for (i = 0; i < rank; i++) {
+        p->Ms[i] = vol / p->Ns[i];
+
+        /* dimensions of equal length share a single 1-d plan */
+        for (j = 0; j < i; j++) {
+            if (p->Ns[i] == p->Ns[j]) {
+                p->plans[i] = p->plans[j];
+                break;
+            }
+        }
+
+        if (!p->plans[i]) {
+            p->plans[i] = ffts_init_1d(p->Ns[i], sign);
+            /* check the new sub-plan, not the (always valid) array */
+            if (!p->plans[i]) {
+                goto cleanup;
+            }
+        }
+    }
+
+    return p;
+
+cleanup:
+    ffts_free_nd(p);
+    return NULL;
 }

+/*
+ * Convenience wrapper: creates a rank-2 plan by forwarding the dimension
+ * pair {N1, N2} and sign to ffts_init_nd(), and returns its result.
+ */
+FFTS_API ffts_plan_t*
+ffts_init_2d(size_t N1, size_t N2, int sign)
+{
+    size_t Ns[2];

-ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign) {
-	size_t Ns[2];
-	Ns[0] = N1;
-	Ns[1] = N2;
-	return ffts_init_nd(2, Ns, sign);
+    Ns[0] = N1; /* x */
+    Ns[1] = N2; /* y */
+    return ffts_init_nd(2, Ns, sign);
 }
--- a/lib/ffts/src/ffts_nd.h
+++ b/lib/ffts/src/ffts_nd.h
@ -1,58 +1,50 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-*/
-
-#ifndef __FFTS_ND_H__
-#define __FFTS_ND_H__
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#include <stdint.h>
-#include <stddef.h>
-#include <stdio.h>
+*/

-#include "ffts.h"
+#ifndef FFTS_ND_H
+#define FFTS_ND_H

-#ifdef HAVE_NEON 
-	#include <arm_neon.h>
-#endif
-#ifdef HAVE_SSE
-	#include <xmmintrin.h>
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
 #endif

-void ffts_free_nd(ffts_plan_t *p);
-void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf);
+#include "ffts.h"
+#include <stddef.h>

-void ffts_execute_nd(ffts_plan_t *p, const void *  in, void *  out); 
-ffts_plan_t *ffts_init_nd(int rank, size_t *Ns, int sign); 
-ffts_plan_t *ffts_init_2d(size_t N1, size_t N2, int sign); 
+/*
+ * Create a plan for a transform over rank dimensions whose lengths are
+ * given in Ns[0..rank-1]. Returns NULL if Ns is NULL or if the plan
+ * cannot be constructed.
+ */
+ffts_plan_t*
+ffts_init_nd(int rank, size_t *Ns, int sign);

-#endif
+/*
+ * Create a two-dimensional plan; forwards {N1, N2} and sign to
+ * ffts_init_nd() with a rank of 2.
+ */
+ffts_plan_t*
+ffts_init_2d(size_t N1, size_t N2, int sign);

+#endif /* FFTS_ND_H */
--- a/lib/ffts/src/ffts_real.c
+++ b/lib/ffts/src/ffts_real.c
@ -1,226 +1,654 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */

 #include "ffts_real.h"
+#include "ffts_internal.h"
+#include "ffts_trig.h"
+
+/* select the SIMD intrinsics header matching the configured instruction set */
+#ifdef HAVE_NEON
+#include <arm_neon.h>
+#elif HAVE_SSE
+#include <xmmintrin.h>
+
+/* check whether SSE3 intrinsics are available */
+#ifdef HAVE_PMMINTRIN_H
+#include <pmmintrin.h>
+#elif HAVE_INTRIN_H
+#include <intrin.h>
+#else
+/*
+ * Sign-bit masks for the plain SSE code path: 0x80000000 is set in the
+ * even (respectively odd) 32-bit lanes, so XOR-ing a vector of floats
+ * with a mask flips the sign of exactly those lanes.
+ *
+ * avoid using negative zero as some configurations have problems with those
+ *
+ * NOTE(review): the masks exist only when neither pmmintrin.h nor
+ * intrin.h is available, while the HAVE_SSE branch of
+ * ffts_execute_1d_real() reads sign_mask_even -- confirm that HAVE_SSE3
+ * is always defined in the other two configurations.
+ */
+static const FFTS_ALIGN(16) unsigned int sign_mask_even[4] = {
+    0x80000000, 0, 0x80000000, 0
+};
+static const FFTS_ALIGN(16) unsigned int sign_mask_odd[4] = {
+    0, 0x80000000, 0, 0x80000000
+};
+#endif
+#endif
+
+/*
+ * Destroy callback for real-valued 1-d plans: releases the coefficient
+ * tables A and B, the work buffer, the underlying sub-plan plans[0] and
+ * finally the plan structure itself.
+ */
+static void
+ffts_free_1d_real(ffts_plan_t *p)
+{
+    if (p->B) {
+        ffts_aligned_free(p->B);
+    }
+
+    if (p->A) {
+        ffts_aligned_free(p->A);
+    }

-void ffts_free_1d_real(ffts_plan_t *p) {
-	ffts_free(p->plans[0]);
-	free(p->A);
-	free(p->B);
-	free(p->plans);
-	free(p->buf);
-	free(p);
+    if (p->buf) {
+        ffts_aligned_free(p->buf);
+    }
+
+    /*
+     * NOTE(review): p->plans is dereferenced without a NULL check and the
+     * array itself is not freed here -- presumably it lives in the same
+     * allocation as p; confirm against the matching init function.
+     */
+    if (p->plans[0]) {
+        ffts_free(p->plans[0]);
+    }
+
+    free(p);
 }

-void ffts_execute_1d_real(ffts_plan_t *p, const void *vin, void *vout) {
-	float *out = (float *)vout;
-	float *buf = (float *)p->buf;
-	float *A = p->A;
-	float *B = p->B;
+static void
+ffts_execute_1d_real(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT out =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(output);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+#ifdef __ARM_NEON__
+    float *p_buf0 = buf;
+    float *p_buf1 = buf + N - 2;
+    float *p_out = out;
+#endif

-	p->plans[0]->transform(p->plans[0], vin, buf);
+    /* we know this */
+    FFTS_ASSUME(N/2 > 0);

-	size_t N = p->N;
-	buf[N] = buf[0];
-	buf[N+1] = buf[1];
+    p->plans[0]->transform(p->plans[0], input, buf);

-	float *p_buf0 = buf;
-	float *p_buf1 = buf + N - 2;
-	float *p_out = out;
+#ifndef HAVE_SSE
+    buf[N + 0] = buf[0];
+    buf[N + 1] = buf[1];
+#endif

-	size_t i;
 #ifdef __ARM_NEON__
-	for(i=0;i<N/2;i+=2) {
-	__asm__ __volatile__ ("vld1.32 {q8},  [%[pa], :128]!\n\t"
-												"vld1.32 {q9},  [%[pb], :128]!\n\t"
-												"vld1.32 {q10}, [%[buf0], :128]!\n\t"
-												"vld1.32 {q11}, [%[buf1], :64]\n\t"
-												"sub %[buf1], %[buf1], #16\n\t"
-
-												"vdup.32 d26, d16[1]\n\t"
-												"vdup.32 d27, d17[1]\n\t"
-												"vdup.32 d24, d16[0]\n\t"
-												"vdup.32 d25, d17[0]\n\t"
-												
-												"vdup.32 d30, d23[1]\n\t"
-												"vdup.32 d31, d22[1]\n\t"
-												"vdup.32 d28, d23[0]\n\t"
-												"vdup.32 d29, d22[0]\n\t"
-												
-												"vmul.f32 q13, q13, q10\n\t"
-												"vmul.f32 q15, q15, q9\n\t"
-												"vmul.f32 q12, q12, q10\n\t"
-												"vmul.f32 q14, q14, q9\n\t"
-												"vrev64.f32 q13, q13\n\t"
-												"vrev64.f32 q15, q15\n\t"
-
-												"vtrn.32 d26, d27\n\t"
-												"vtrn.32 d30, d31\n\t"
-												"vneg.f32 d26, d26\n\t"
-												"vneg.f32 d31, d31\n\t"
-												"vtrn.32 d26, d27\n\t"
-												"vtrn.32 d30, d31\n\t"
-												
-												"vadd.f32 q12, q12, q14\n\t"
-												"vadd.f32 q13, q13, q15\n\t"
-												"vadd.f32 q12, q12, q13\n\t"
-												"vst1.32 {q12}, [%[pout], :128]!\n\t"
-												: [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
-													[pout] "+r" (p_out)
-												:
-												: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-												);
+    for (i = 0; i < N; i += 4) {
+        __asm__ __volatile__ (
+            "vld1.32 {q8},  [%[pa]]!\n\t"
+            "vld1.32 {q9},  [%[pb]]!\n\t"
+            "vld1.32 {q10}, [%[buf0]]!\n\t"
+            "vld1.32 {q11}, [%[buf1]]\n\t"
+            "sub %[buf1], %[buf1], #16\n\t"
+
+            "vdup.32 d26, d16[1]\n\t"
+            "vdup.32 d27, d17[1]\n\t"
+            "vdup.32 d24, d16[0]\n\t"
+            "vdup.32 d25, d17[0]\n\t"
+
+            "vdup.32 d30, d23[1]\n\t"
+            "vdup.32 d31, d22[1]\n\t"
+            "vdup.32 d28, d23[0]\n\t"
+            "vdup.32 d29, d22[0]\n\t"
+
+            "vmul.f32 q13, q13, q10\n\t"
+            "vmul.f32 q15, q15, q9\n\t"
+            "vmul.f32 q12, q12, q10\n\t"
+            "vmul.f32 q14, q14, q9\n\t"
+            "vrev64.f32 q13, q13\n\t"
+            "vrev64.f32 q15, q15\n\t"
+
+            "vtrn.32 d26, d27\n\t"
+            "vtrn.32 d30, d31\n\t"
+            "vneg.f32 d26, d26\n\t"
+            "vneg.f32 d31, d31\n\t"
+            "vtrn.32 d26, d27\n\t"
+            "vtrn.32 d30, d31\n\t"
+
+            "vadd.f32 q12, q12, q14\n\t"
+            "vadd.f32 q13, q13, q15\n\t"
+            "vadd.f32 q12, q12, q13\n\t"
+            "vst1.32 {q12}, [%[pout]]!\n\t"
+            : [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), [pout] "+r" (p_out)
+            : [pa] "r" (A), [pb] "r" (B) 
+            : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+        );
+    }
+#elif HAVE_SSE3
+    if (FFTS_UNLIKELY(N <= 8)) {
+        __m128 t0 = _mm_load_ps(buf);
+        __m128 t1 = _mm_load_ps(buf + N - 4);
+        __m128 t2 = _mm_load_ps(A);
+        __m128 t3 = _mm_load_ps(B);
+
+        _mm_store_ps(out, _mm_add_ps(_mm_addsub_ps(
+            _mm_mul_ps(t0, _mm_moveldup_ps(t2)),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)),
+            _mm_movehdup_ps(t2))), _mm_addsub_ps(
+            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
+            _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3))));
+
+        if (N == 8) {
+            t2 = _mm_load_ps(A + 4);
+            t3 = _mm_load_ps(B + 4);
+
+            _mm_store_ps(out + 4, _mm_add_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t2)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t2))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(2,2,0,0)), t3))));
+        }
+    } else {
+        __m128 t0 = _mm_load_ps(buf);
+
+        for (i = 0; i < N; i += 16) {
+            __m128 t1 = _mm_load_ps(buf + i);
+            __m128 t2 = _mm_load_ps(buf + N - i - 4);
+            __m128 t3 = _mm_load_ps(A + i);
+            __m128 t4 = _mm_load_ps(B + i);
+
+            _mm_store_ps(out + i, _mm_add_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+            t0 = _mm_load_ps(buf + N - i - 8);
+            t1 = _mm_load_ps(buf + i + 4);
+            t3 = _mm_load_ps(A + i + 4);
+            t4 = _mm_load_ps(B + i + 4);
+
+            _mm_store_ps(out + i + 4, _mm_add_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+            t1 = _mm_load_ps(buf + i + 8);
+            t2 = _mm_load_ps(buf + N - i - 12);
+            t3 = _mm_load_ps(A + i + 8);
+            t4 = _mm_load_ps(B + i + 8);
+
+            _mm_store_ps(out + i + 8, _mm_add_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+            t0 = _mm_load_ps(buf + N - i - 16);
+            t1 = _mm_load_ps(buf + i + 12);
+            t3 = _mm_load_ps(A + i + 12);
+            t4 = _mm_load_ps(B + i + 12);
+
+            _mm_store_ps(out + i + 12, _mm_add_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
+        }
+    }
+#elif HAVE_SSE
+    if (FFTS_UNLIKELY(N <= 8)) {
+        __m128 c0 = _mm_load_ps((const float*) sign_mask_even);
+        __m128 t0 = _mm_load_ps(buf);
+        __m128 t1 = _mm_load_ps(buf + N - 4);
+        __m128 t2 = _mm_load_ps(A);
+        __m128 t3 = _mm_load_ps(B);
+
+        _mm_store_ps(out, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+            _mm_mul_ps(t0, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t0, _MM_SHUFFLE(2,3,0,1)),
+            _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(2,2,0,0)), t3)),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t1, _MM_SHUFFLE(3,3,1,1)),
+            _mm_shuffle_ps(_mm_xor_ps(t3, c0), _mm_xor_ps(t3, c0),
+            _MM_SHUFFLE(2,3,0,1)))));
+
+        if (N == 8) {
+            t2 = _mm_load_ps(A + 4);
+            t3 = _mm_load_ps(B + 4);
+
+            _mm_store_ps(out + 4, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(2,2,0,0)), t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(_mm_xor_ps(t3, c0), _mm_xor_ps(t3, c0),
+                _MM_SHUFFLE(2,3,0,1)))));
+        }
+    } else {
+        __m128 c0 = _mm_load_ps((const float*) sign_mask_even);
+        __m128 t0 = _mm_load_ps(buf);
+
+        for (i = 0; i < N; i += 16) {
+            __m128 t1 = _mm_load_ps(buf + i);
+            __m128 t2 = _mm_load_ps(buf + N - i - 4);
+            __m128 t3 = _mm_load_ps(A + i);
+            __m128 t4 = _mm_load_ps(B + i);
+
+            _mm_store_ps(out + i, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
+                _MM_SHUFFLE(2,3,0,1)))));
+
+            t0 = _mm_load_ps(buf + N - i - 8);
+            t1 = _mm_load_ps(buf + i + 4);
+            t3 = _mm_load_ps(A + i + 4);
+            t4 = _mm_load_ps(B + i + 4);
+
+            _mm_store_ps(out + i + 4, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
+                _MM_SHUFFLE(2,3,0,1)))));
+
+            t1 = _mm_load_ps(buf + i + 8);
+            t2 = _mm_load_ps(buf + N - i - 12);
+            t3 = _mm_load_ps(A + i + 8);
+            t4 = _mm_load_ps(B + i + 8);
+
+            _mm_store_ps(out + i + 8, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4)),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
+                _MM_SHUFFLE(2,3,0,1)))));
+
+            t0 = _mm_load_ps(buf + N - i - 16);
+            t1 = _mm_load_ps(buf + i + 12);
+            t3 = _mm_load_ps(A + i + 12);
+            t4 = _mm_load_ps(B + i + 12);
+
+            _mm_store_ps(out + i + 12, _mm_add_ps(_mm_add_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4)),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(_mm_xor_ps(t4, c0), _mm_xor_ps(t4, c0),
+                _MM_SHUFFLE(2,3,0,1)))));
+        }
+    }
 #else
-	for(i=0;i<N/2;i++) {
-		out[2*i]   = buf[2*i]*A[2*i] - buf[2*i+1]*A[2*i+1] + buf[N-2*i]*B[2*i] + buf[N-2*i+1]*B[2*i+1];
-		out[2*i+1] = buf[2*i+1]*A[2*i] + buf[2*i]*A[2*i+1] + buf[N-2*i]*B[2*i+1] - buf[N-2*i+1]*B[2*i];
-
-//	out[2*N-2*i] = out[2*i];
-//	out[2*N-2*i+1] = -out[2*i+1];
-
-#endif	
-	}
-	
-	out[N] = buf[0] - buf[1];
-	out[N+1] = 0.0f;
-	
+    for (i = 0; i < N/2; i++) {
+        out[2*i + 0] =
+            buf[    2*i + 0] * A[2*i + 0] - buf[    2*i + 1] * A[2*i + 1] +
+            buf[N - 2*i + 0] * B[2*i + 0] + buf[N - 2*i + 1] * B[2*i + 1];
+        out[2*i + 1] =
+            buf[    2*i + 1] * A[2*i + 0] + buf[    2*i + 0] * A[2*i + 1] +
+            buf[N - 2*i + 0] * B[2*i + 1] - buf[N - 2*i + 1] * B[2*i + 0];
+    }
+#endif
+
+    out[N + 0] = buf[0] - buf[1];
+    out[N + 1] = 0.0f;
 }

-void ffts_execute_1d_real_inv(ffts_plan_t *p, const void *vin, void *vout) {
-	float *out = (float *)vout;
-	float *in = (float *)vin;
-	float *buf = (float *)p->buf;
-	float *A = p->A;
-	float *B = p->B;
-	size_t N = p->N;
-	
-	float *p_buf0 = in;
-	float *p_buf1 = in + N - 2;
-	
-	float *p_out = buf;
-	
-	size_t i;
+/*
+ * Transform callback that ffts_init_1d_real() installs when sign >= 0.
+ *
+ * Combines the floats of `input` with the plan's A and B tables into p->buf
+ * (N floats) and then runs the sub-plan p->plans[0] from p->buf into
+ * `output`.
+ *
+ * The code below reads in[N] and in[N + 1], so `input` has to provide at
+ * least N + 2 floats.
+ *
+ * NOTE(review): the FFTS_ASSUME_ALIGNED_16/32 wrappers presumably promise the
+ * compiler 16-byte (input) and 32-byte (buf, A, B) alignment; the SSE paths
+ * use aligned loads/stores on all four pointers -- confirm against the macro
+ * definitions.
+ */
+static void
+ffts_execute_1d_real_inv(ffts_plan_t *p, const void *input, void *output)
+{
+    float *const FFTS_RESTRICT in =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_16(input);
+    float *const FFTS_RESTRICT buf =
+        (float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->buf);
+    const float *const FFTS_RESTRICT A =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->A);
+    const float *const FFTS_RESTRICT B =
+        (const float *const FFTS_RESTRICT) FFTS_ASSUME_ALIGNED_32(p->B);
+    const int N = (const int) p->N;
+    int i;
+
+#ifdef __ARM_NEON__
+    /* running cursors for the NEON loop: forward input, mirrored input
+     * (starting at the last four floats in[N - 2 .. N + 1]) and output */
+    float *p_buf0 = in;
+    float *p_buf1 = in + N - 2;
+    float *p_out = buf;
+#endif
+
+    /* N/2 is known to be positive; presumably a hint that lets the optimizer
+     * drop the zero-trip check of the loops below -- confirm FFTS_ASSUME */
+    FFTS_ASSUME(N/2 > 0);
+
 #ifdef __ARM_NEON__
-	for(i=0;i<N/2;i+=2) {
-	__asm__ __volatile__ ("vld1.32 {q8},  [%[pa], :128]!\n\t"
-												"vld1.32 {q9},  [%[pb], :128]!\n\t"
-												"vld1.32 {q10}, [%[buf0], :128]!\n\t"
-												"vld1.32 {q11}, [%[buf1], :64]\n\t"
-												"sub %[buf1], %[buf1], #16\n\t"
-
-												"vdup.32 d26, d16[1]\n\t"
-												"vdup.32 d27, d17[1]\n\t"
-												"vdup.32 d24, d16[0]\n\t"
-												"vdup.32 d25, d17[0]\n\t"
-												
-												"vdup.32 d30, d23[1]\n\t"
-												"vdup.32 d31, d22[1]\n\t"
-												"vdup.32 d28, d23[0]\n\t"
-												"vdup.32 d29, d22[0]\n\t"
-												
-												"vmul.f32 q13, q13, q10\n\t"
-												"vmul.f32 q15, q15, q9\n\t"
-												"vmul.f32 q12, q12, q10\n\t"
-												"vmul.f32 q14, q14, q9\n\t"
-												"vrev64.f32 q13, q13\n\t"
-												"vrev64.f32 q15, q15\n\t"
-
-												"vtrn.32 d26, d27\n\t"
-												"vtrn.32 d28, d29\n\t"
-												"vneg.f32 d27, d27\n\t"
-												"vneg.f32 d29, d29\n\t"
-												"vtrn.32 d26, d27\n\t"
-												"vtrn.32 d28, d29\n\t"
-												
-												"vadd.f32 q12, q12, q14\n\t"
-												"vsub.f32 q13, q13, q15\n\t"
-												"vadd.f32 q12, q12, q13\n\t"
-												"vst1.32 {q12}, [%[pout], :128]!\n\t"
-												: [pa] "+r" (A), [pb] "+r" (B), [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1),
-													[pout] "+r" (p_out)
-												:
-												: "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-												);
+    /*
+     * NEON path: every iteration loads 4 floats each from A, B and the
+     * forward input cursor, 4 floats from the mirrored input cursor (which
+     * then steps back 16 bytes), and stores 4 floats through p_out.
+     *
+     * The post-indexed loads/stores ("[reg]!") write the advanced address
+     * back into the register, so every pointer used that way must be a
+     * read-write ("+r") operand; an asm statement must not modify an
+     * input-only operand. A and B are const pointers, so local cursors are
+     * derived from them per iteration. i advances by 2 while one iteration
+     * covers 4 floats, hence the 2*i offset.
+     */
+    for (i = 0; i < N/2; i += 2) {
+        const float *p_a = A + 2*i;
+        const float *p_b = B + 2*i;
+
+        __asm__ __volatile__ (
+            "vld1.32 {q8},  [%[pa]]!\n\t"
+            "vld1.32 {q9},  [%[pb]]!\n\t"
+            "vld1.32 {q10}, [%[buf0]]!\n\t"
+            "vld1.32 {q11}, [%[buf1]]\n\t"
+            "sub %[buf1], %[buf1], #16\n\t"
+
+            "vdup.32 d26, d16[1]\n\t"
+            "vdup.32 d27, d17[1]\n\t"
+            "vdup.32 d24, d16[0]\n\t"
+            "vdup.32 d25, d17[0]\n\t"
+
+            "vdup.32 d30, d23[1]\n\t"
+            "vdup.32 d31, d22[1]\n\t"
+            "vdup.32 d28, d23[0]\n\t"
+            "vdup.32 d29, d22[0]\n\t"
+
+            "vmul.f32 q13, q13, q10\n\t"
+            "vmul.f32 q15, q15, q9\n\t"
+            "vmul.f32 q12, q12, q10\n\t"
+            "vmul.f32 q14, q14, q9\n\t"
+            "vrev64.f32 q13, q13\n\t"
+            "vrev64.f32 q15, q15\n\t"
+
+            "vtrn.32 d26, d27\n\t"
+            "vtrn.32 d28, d29\n\t"
+            "vneg.f32 d27, d27\n\t"
+            "vneg.f32 d29, d29\n\t"
+            "vtrn.32 d26, d27\n\t"
+            "vtrn.32 d28, d29\n\t"
+
+            "vadd.f32 q12, q12, q14\n\t"
+            "vsub.f32 q13, q13, q15\n\t"
+            "vadd.f32 q12, q12, q13\n\t"
+            "vst1.32 {q12}, [%[pout]]!\n\t"
+            : [pa] "+r" (p_a), [pb] "+r" (p_b),
+              [buf0] "+r" (p_buf0), [buf1] "+r" (p_buf1), [pout] "+r" (p_out)
+            :
+            : "memory", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+        );
+    }
+#elif HAVE_SSE3
+    /*
+     * SSE3 path. Each _mm_store_ps produces four floats of buf from:
+     *   - a forward vector of `in` (t1, or t2 in the N == 8 tail),
+     *   - two neighbouring mirrored vectors of `in`, merged lane-wise by
+     *     _mm_shuffle_ps,
+     *   - four floats each of A (t3) and B (t4).
+     *
+     * NOTE(review): the add/subtract pattern of _mm_addsub_ps differs from
+     * the signs written out in the scalar fallback; presumably the tables
+     * built with ffts_generate_table_1d_real_32f(p, sign, 1) compensate for
+     * that -- confirm against the table generator.
+     */
+    if (FFTS_UNLIKELY(N <= 8)) {
+        /* low two lanes = in[N], in[N + 1]; upper two lanes are zero */
+        __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
+        __m128 t1 = _mm_load_ps(in);
+        __m128 t2 = _mm_load_ps(in + N - 4);
+        __m128 t3 = _mm_load_ps(A);
+        __m128 t4 = _mm_load_ps(B);
+
+        /* buf[0..3] */
+        _mm_store_ps(buf, _mm_sub_ps(_mm_addsub_ps(
+            _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+            _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+            _mm_movehdup_ps(t3))), _mm_addsub_ps(
+            _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+            _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+        if (N == 8) {
+            t3 = _mm_load_ps(A + 4);
+            t4 = _mm_load_ps(B + 4);
+
+            /* buf[4..7]: with N == 8, t2 holds in[4..7] and serves as the
+             * forward vector, while t1 (in[0..3]) supplies mirrored lanes */
+            _mm_store_ps(buf + 4, _mm_sub_ps(_mm_addsub_ps(
+                _mm_mul_ps(t2, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,2,0,0)), t4))));
+        }
+    } else {
+        /* low two lanes = in[N], in[N + 1]; upper two lanes are zero */
+        __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
+
+        /* 16 floats per iteration, as four groups of four; t0 and t2 take
+         * turns holding the previous mirrored load so that every shuffle
+         * sees two adjacent mirrored vectors */
+        for (i = 0; i < N; i += 16) {
+            __m128 t1 = _mm_load_ps(in + i);
+            __m128 t2 = _mm_load_ps(in + N - i - 4);
+            __m128 t3 = _mm_load_ps(A + i);
+            __m128 t4 = _mm_load_ps(B + i);
+
+            _mm_store_ps(buf + i, _mm_sub_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+            t0 = _mm_load_ps(in + N - i - 8);
+            t1 = _mm_load_ps(in + i + 4);
+            t3 = _mm_load_ps(A + i + 4);
+            t4 = _mm_load_ps(B + i + 4);
+
+            _mm_store_ps(buf + i + 4, _mm_sub_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+            t1 = _mm_load_ps(in + i + 8);
+            t2 = _mm_load_ps(in + N - i - 12);
+            t3 = _mm_load_ps(A + i + 8);
+            t4 = _mm_load_ps(B + i + 8);
+
+            _mm_store_ps(buf + i + 8, _mm_sub_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)), t4))));
+
+            t0 = _mm_load_ps(in + N - i - 16);
+            t1 = _mm_load_ps(in + i + 12);
+            t3 = _mm_load_ps(A + i + 12);
+            t4 = _mm_load_ps(B + i + 12);
+
+            _mm_store_ps(buf + i + 12, _mm_sub_ps(_mm_addsub_ps(
+                _mm_mul_ps(t1, _mm_moveldup_ps(t3)),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_movehdup_ps(t3))), _mm_addsub_ps(
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)), t4))));
+        }
+    }
+#elif HAVE_SSE
+    /*
+     * Plain SSE path (no SSE3 addsub/movedup instructions). Each
+     * _mm_store_ps produces four floats of buf from a forward vector of
+     * `in`, two neighbouring mirrored vectors of `in` merged lane-wise by
+     * _mm_shuffle_ps, and four floats each of A (t3) and B (t4).
+     *
+     * c0 is XORed into A- and B-derived vectors to flip signs.
+     * NOTE(review): sign_mask_odd is defined outside this block; presumably
+     * it has only the sign bit of lanes 1 and 3 set, which would make this
+     * path match the signs of the scalar fallback -- confirm.
+     */
+    if (FFTS_UNLIKELY(N <= 8)) {
+        __m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
+        /* low two lanes = in[N], in[N + 1]; upper two lanes are zero */
+        __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
+        __m128 t1 = _mm_load_ps(in);
+        __m128 t2 = _mm_load_ps(in + N - 4);
+        __m128 t3 = _mm_load_ps(A);
+        __m128 t4 = _mm_load_ps(B);

+        /* buf[0..3] */
+        _mm_store_ps(buf, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+            _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+            _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+            _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+            _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+            _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)),
+            _mm_xor_ps(t4, c0))));

+        if (N == 8) {
+            t3 = _mm_load_ps(A + 4);
+            t4 = _mm_load_ps(B + 4);
+
+            /* buf[4..7]: with N == 8, t2 holds in[4..7] and serves as the
+             * forward vector, while t1 (in[0..3]) supplies mirrored lanes */
+            _mm_store_ps(buf + 4, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+                _mm_mul_ps(t2, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t2, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t1, _MM_SHUFFLE(2,2,0,0)),
+                _mm_xor_ps(t4, c0))));
+        }
+    } else {
+        __m128 c0 = _mm_load_ps((const float*) sign_mask_odd);
+        /* low two lanes = in[N], in[N + 1]; upper two lanes are zero */
+        __m128 t0 = _mm_loadl_pi(_mm_setzero_ps(), (const __m64*) &in[N]);
+
+        /* 16 floats per iteration, as four groups of four; t0 and t2 take
+         * turns holding the previous mirrored load so that every shuffle
+         * sees two adjacent mirrored vectors */
+        for (i = 0; i < N; i += 16) {
+            __m128 t1 = _mm_load_ps(in + i);
+            __m128 t2 = _mm_load_ps(in + N - i - 4);
+            __m128 t3 = _mm_load_ps(A + i);
+            __m128 t4 = _mm_load_ps(B + i);
+
+            _mm_store_ps(buf + i, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)),
+                _mm_xor_ps(t4, c0))));
+
+            t0 = _mm_load_ps(in + N - i - 8);
+            t1 = _mm_load_ps(in + i + 4);
+            t3 = _mm_load_ps(A + i + 4);
+            t4 = _mm_load_ps(B + i + 4);
+
+            _mm_store_ps(buf + i + 4, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)),
+                _mm_xor_ps(t4, c0))));
+
+            t1 = _mm_load_ps(in + i + 8);
+            t2 = _mm_load_ps(in + N - i - 12);
+            t3 = _mm_load_ps(A + i + 8);
+            t4 = _mm_load_ps(B + i + 8);
+
+            _mm_store_ps(buf + i + 8, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+                _mm_mul_ps(_mm_shuffle_ps(t0, t2, _MM_SHUFFLE(2,2,0,0)),
+                _mm_xor_ps(t4, c0))));
+
+            t0 = _mm_load_ps(in + N - i - 16);
+            t1 = _mm_load_ps(in + i + 12);
+            t3 = _mm_load_ps(A + i + 12);
+            t4 = _mm_load_ps(B + i + 12);
+
+            _mm_store_ps(buf + i + 12, _mm_add_ps(_mm_sub_ps(_mm_add_ps(
+                _mm_mul_ps(t1, _mm_shuffle_ps(t3, t3, _MM_SHUFFLE(2,2,0,0))),
+                _mm_mul_ps(_mm_shuffle_ps(t1, t1, _MM_SHUFFLE(2,3,0,1)),
+                _mm_xor_ps(_mm_shuffle_ps(t3, t3, _MM_SHUFFLE(3,3,1,1)), c0))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(3,3,1,1)),
+                _mm_shuffle_ps(t4, t4, _MM_SHUFFLE(2,3,0,1)))),
+                _mm_mul_ps(_mm_shuffle_ps(t2, t0, _MM_SHUFFLE(2,2,0,0)),
+                _mm_xor_ps(t4, c0))));
+        }
+    }
 #else
-	for(i=0;i<N/2;i++) {
-		buf[2*i]   = in[2*i]*A[2*i] + in[2*i+1]*A[2*i+1] + in[N-2*i]*B[2*i] - in[N-2*i+1]*B[2*i+1];
-		buf[2*i+1] = in[2*i+1]*A[2*i] - in[2*i]*A[2*i+1] - in[N-2*i]*B[2*i+1] - in[N-2*i+1]*B[2*i];
+    /*
+     * Portable fallback. For every pair index i, the output pair
+     * buf[2i], buf[2i + 1] is built from the input pair in[2i], in[2i + 1]
+     * weighted by A[2i], A[2i + 1], and from the mirrored input pair
+     * in[N - 2i], in[N - 2i + 1] weighted by B[2i], B[2i + 1].
+     * The i == 0 step reads in[N] and in[N + 1].
+     */
+    for (i = 0; i < N/2; i++) {
+        buf[2*i + 0] =
+            in[    2*i + 0] * A[2*i + 0] + in[    2*i + 1] * A[2*i + 1] +
+            in[N - 2*i + 0] * B[2*i + 0] - in[N - 2*i + 1] * B[2*i + 1];
+        buf[2*i + 1] =
+            in[    2*i + 1] * A[2*i + 0] - in[    2*i + 0] * A[2*i + 1] -
+            in[N - 2*i + 0] * B[2*i + 1] - in[N - 2*i + 1] * B[2*i + 0];
+    }
 #endif
-}
-	
-	p->plans[0]->transform(p->plans[0], buf, out);
-	
-}

-ffts_plan_t *ffts_init_1d_real(size_t N, int sign) {
-	ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
-
-	if(sign < 0) p->transform = &ffts_execute_1d_real;
-	else         p->transform = &ffts_execute_1d_real_inv;
-	
-	p->destroy = &ffts_free_1d_real;
-	p->N = N;
-	p->rank = 1;
-	p->plans = malloc(sizeof(ffts_plan_t **) * 1);
-
-	p->plans[0] = ffts_init_1d(N/2, sign); 
-
-	p->buf = valloc(sizeof(float) * 2 * ((N/2) + 1));
-
-	p->A = valloc(sizeof(float) * N);
-	p->B = valloc(sizeof(float) * N);
-
-  if(sign < 0) {
-		int i;
-		for (i = 0; i < N/2; i++) {
-			p->A[2 * i]     = 0.5 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
-			p->A[2 * i + 1] = 0.5 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
-			p->B[2 * i]     = 0.5 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
-			p->B[2 * i + 1] = 0.5 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
-		}
-	}else{
-		int i;
-		for (i = 0; i < N/2; i++) {
-			p->A[2 * i]     = 1.0 * (1.0 - sin (2.0f * PI / (double) (N) * (double) i));
-			p->A[2 * i + 1] = 1.0 * (-1.0 * cos (2.0f * PI / (double) (N) * (double) i));
-			p->B[2 * i]     = 1.0 * (1.0 + sin (2.0f * PI / (double) (N) * (double) i));
-			p->B[2 * i + 1] = 1.0 * (1.0 * cos (2.0f * PI / (double) (N) * (double) i));
-		}
-  }
-	
-	return p;
+    /* run the sub-plan on the prepared buffer; the result goes to output */
+    p->plans[0]->transform(p->plans[0], buf, output);
 }

+/*
+ * Create a plan for a 1D real transform of N floats.
+ *
+ * sign < 0 installs ffts_execute_1d_real as the transform callback, any
+ * other value installs ffts_execute_1d_real_inv. The plan owns one sub-plan
+ * obtained from ffts_init_1d(N/2, sign), an aligned work buffer of
+ * 2 * (N/2 + 1) floats and two aligned tables A and B of N floats each.
+ *
+ * Returns the new plan, or NULL when an allocation or the sub-plan creation
+ * fails; in that case everything acquired so far is handed to
+ * ffts_free_1d_real().
+ */
+FFTS_API ffts_plan_t*
+ffts_init_1d_real(size_t N, int sign)
+{
+    const size_t work_bytes  = 2 * ((N/2) + 1) * sizeof(float);
+    const size_t table_bytes = N * sizeof(float);
+
+    /* single zeroed allocation: the plan struct followed by the one-entry
+     * sub-plan pointer array */
+    ffts_plan_t *plan =
+        (ffts_plan_t*) calloc(1, sizeof(*plan) + sizeof(*plan->plans));
+
+    if (plan == NULL) {
+        return NULL;
+    }
+
+    plan->transform = (sign < 0) ? &ffts_execute_1d_real
+                                 : &ffts_execute_1d_real_inv;
+    plan->destroy   = &ffts_free_1d_real;
+    plan->N         = N;
+    plan->rank      = 1;
+    plan->plans     = (ffts_plan_t**) &plan[1];
+
+    /* sub-plan of half the length */
+    plan->plans[0] = ffts_init_1d(N/2, sign);
+    if (plan->plans[0] == NULL) {
+        goto fail;
+    }
+
+    plan->buf = ffts_aligned_malloc(work_bytes);
+    if (plan->buf == NULL) {
+        goto fail;
+    }
+
+    plan->A = (float*) ffts_aligned_malloc(table_bytes);
+    if (plan->A == NULL) {
+        goto fail;
+    }
+
+    plan->B = (float*) ffts_aligned_malloc(table_bytes);
+    if (plan->B == NULL) {
+        goto fail;
+    }
+
+    /* the last argument is 1 only for SSE3 builds */
+#if defined(HAVE_SSE3)
+    ffts_generate_table_1d_real_32f(plan, sign, 1);
+#else
+    ffts_generate_table_1d_real_32f(plan, sign, 0);
+#endif
+
+    return plan;

+fail:
+    ffts_free_1d_real(plan);
+    return NULL;
+}
--- a/lib/ffts/src/ffts_real.h
+++ b/lib/ffts/src/ffts_real.h
@ -1,53 +1,47 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-*/
-
-#ifndef __FFTS_REAL_H__
-#define __FFTS_REAL_H__
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#include <stdint.h>
-#include <stddef.h>
-#include <stdio.h>
+*/

-#include "ffts.h"
+#ifndef FFTS_REAL_H
+#define FFTS_REAL_H

-#ifdef HAVE_NEON 
-	#include <arm_neon.h>
-#endif
-#ifdef HAVE_SSE
-	#include <xmmintrin.h>
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
 #endif

-ffts_plan_t *ffts_init_1d_real(size_t N, int sign);
+#include "ffts.h"
+#include <stddef.h>

-#endif
+/*
+ * Create a plan for a 1D real transform of N floats.
+ *
+ * A negative sign selects one transform direction, any other value the
+ * opposite one. Returns NULL if the plan could not be created.
+ */
+ffts_plan_t*
+ffts_init_1d_real(size_t N, int sign);

+#endif /* FFTS_REAL_H */
--- a/lib/ffts/src/ffts_real_nd.c
+++ b/lib/ffts/src/ffts_real_nd.c
@ -1,177 +1,269 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */

 #include "ffts_real_nd.h"
+#include "ffts_real.h"
+#include "ffts_internal.h"
+#include "ffts_transpose.h"

-#ifdef __ARM_NEON__
-#include "neon.h"
-#endif
-
-void ffts_free_nd_real(ffts_plan_t *p) {
-
-	int i;
-	for(i=0;i<p->rank;i++) {
-		
-		ffts_plan_t *x = p->plans[i];
-
-		int k;
-		for(k=i+1;k<p->rank;k++) {
-			if(x == p->plans[k]) p->plans[k] = NULL;
-		}
-		
-		if(x)	ffts_free(x);
-	}
-
-	free(p->Ns);
-	free(p->Ms);
-	free(p->plans);
-	free(p->buf);
-	free(p->transpose_buf);
-	free(p);
-}
+/*
+ * Destroy callback installed by ffts_init_nd_real().
+ *
+ * Releases every distinct sub-plan stored in p->plans exactly once, then
+ * the plans array, the aligned work buffer, the Ns and Ms arrays and
+ * finally the plan itself. Every member is checked for NULL before it is
+ * released.
+ */
+static void
+ffts_free_nd_real(ffts_plan_t *p)
+{
+    if (p->plans) {
+        int i, j;
+
+        for (i = 0; i < p->rank; i++) {
+            ffts_plan_t *plan = p->plans[i];
+
+            if (!plan) {
+                continue;
+            }
+
+            /*
+             * Several slots may refer to one sub-plan. Pointer identity,
+             * not equal dimension sizes, decides whether two slots share a
+             * plan. Clear the later aliases before freeing so that a shared
+             * plan is freed once and no freed pointer is looked at again.
+             */
+            for (j = i + 1; j < p->rank; j++) {
+                if (p->plans[j] == plan) {
+                    p->plans[j] = NULL;
+                }
+            }
+
+            ffts_free(plan);
+        }

-void ffts_scalar_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
+        free(p->plans);
+    }

-	size_t i,j;
-	for(i=0;i<w;i+=1) {
-		for(j=0;j<h;j+=1) {
-			out[i*h + j] = in[j*w + i];
-		}
-	}
+    /* the work buffer must go back through the aligned deallocator */
+    if (p->buf) {
+        ffts_aligned_free(p->buf);
+    }

+    if (p->Ns) {
+        free(p->Ns);
+    }
+
+    if (p->Ms) {
+        free(p->Ms);
+    }
+
+    free(p);
 }

-void ffts_execute_nd_real(ffts_plan_t *p, const void *  in, void *  out) {
+/*
+ * Transform callback installed by ffts_init_nd_real() when sign < 0.
+ *
+ * Pass 0 runs p->plans[0] over Ns[0] rows: row r is read at an offset of
+ * r * Ms[0] 32-bit elements of `in` and written at an offset of
+ * r * (Ms[0] / 2 + 1) 64-bit elements of p->buf. The result is transposed
+ * into `out`. Every further dimension d runs p->plans[d] over Ns[d] rows of
+ * Ms[d] 64-bit elements from `out` into p->buf and transposes back into
+ * `out`.
+ */
+static void
+ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out)
+{
+    const size_t row_len   = p->Ms[0];
+    const size_t row_count = p->Ns[0];
+    const size_t half_len  = row_len / 2 + 1;
+
+    uint32_t *src     = (uint32_t*) in;
+    uint64_t *scratch = p->buf;
+    uint64_t *dst     = (uint64_t*) out;
+
+    ffts_plan_t *plan = p->plans[0];
+    size_t row;
+    int dim;
+
+    /* first dimension: rows of the input into the scratch buffer */
+    for (row = 0; row < row_count; row++) {
+        plan->transform(plan, src + (row * row_len),
+                        scratch + (row * half_len));
+    }

-	uint32_t *din = (uint32_t *)in;
-	uint64_t *buf = p->buf;
-	uint64_t *dout = (uint64_t *)out;
+    ffts_transpose(scratch, dst, half_len, row_count);

-	size_t i,j;
-	for(i=0;i<p->Ns[0];i++) {
-		p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * (p->Ms[0] / 2 + 1)));	
-	}
-	ffts_scalar_transpose(buf, dout, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf);	
+    /* remaining dimensions: output -> scratch, then transpose back */
+    for (dim = 1; dim < p->rank; dim++) {
+        const size_t len   = p->Ms[dim];
+        const size_t count = p->Ns[dim];

-	for(i=1;i<p->rank;i++) {
-		for(j=0;j<p->Ns[i];j++) { 
-			p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));	
-		}
-		ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);	
-	}
+        plan = p->plans[dim];
+
+        for (row = 0; row < count; row++) {
+            const size_t offset = row * len;
+
+            plan->transform(plan, dst + offset, scratch + offset);
+        }
+
+        ffts_transpose(scratch, dst, len, count);
+    }
 }

-void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *  in, void *  out) {
-
-	uint64_t *din = (uint64_t *)in;
-	uint64_t *buf = p->buf;
-	uint64_t *dout = (uint64_t *)out;
-	
-	float *bufr = (float *)(p->buf);
-	float *doutr = (float *)out;
-
-	size_t i,j;
-	ffts_scalar_transpose(din, buf, p->Ms[0], p->Ns[0], p->transpose_buf);	
-
-	for(i=0;i<p->Ms[0];i++) {
-		p->plans[0]->transform(p->plans[0], buf + (i * p->Ns[0]), dout + (i * p->Ns[0]));	
-	}
-	
-	ffts_scalar_transpose(dout, buf, p->Ns[0], p->Ms[0], p->transpose_buf);	
-	for(j=0;j<p->Ms[1];j++) { 
-  	p->plans[1]->transform(p->plans[1], buf + (j * (p->Ms[0])), &doutr[j * p->Ns[1]]);	
-  }
+/*
+ * Transform callback installed by ffts_init_nd_real() when sign >= 0.
+ *
+ * Uses two regions of p->buf: the start of the buffer and a second region
+ * that begins after as many 64-bit elements as the product of all Ns
+ * entries. The input is transposed into the first region, p->plans[0] is
+ * run over Ms[0] rows of Ns[0] elements into the second region, the result
+ * is transposed back into the first region, and p->plans[1] is then run
+ * over Ms[1] rows, writing Ns[1] floats per row to `out`.
+ *
+ * Only entries 0 and 1 of plans, Ms and Ns are used for the transforms.
+ */
+static void
+ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out)
+{
+    const size_t Ns0 = p->Ns[0];
+    const size_t Ns1 = p->Ns[1];
+    const size_t Ms0 = p->Ms[0];
+    const size_t Ms1 = p->Ms[1];
+
+    uint64_t *src   = (uint64_t*) in;
+    uint64_t *work  = p->buf;
+    uint64_t *stage;
+    float    *dst   = (float*) out;
+
+    ffts_plan_t *plan;
+    size_t total;
+    size_t row;
+    int dim;
+
+    /* number of elements spanned by all dimensions */
+    total = p->Ns[0];
+    for (dim = 1; dim < p->rank; dim++) {
+        total *= p->Ns[dim];
+    }
+
+    /* second scratch region starts right behind the first one */
+    stage = work + total;
+
+    ffts_transpose(src, work, Ms0, Ns0);
+
+    plan = p->plans[0];
+    for (row = 0; row < Ms0; row++) {
+        const size_t offset = row * Ns0;
+
+        plan->transform(plan, work + offset, stage + offset);
+    }
+
+    ffts_transpose(stage, work, Ns0, Ms0);
+
+    plan = p->plans[1];
+    for (row = 0; row < Ms1; row++) {
+        plan->transform(plan, work + (row * Ms0), dst + (row * Ns1));
+    }
 }

-ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) {
-	size_t vol = 1;
+/*
+ * Create a plan for a real-valued transform over `rank` dimensions.
+ *
+ *   rank  number of dimensions; must be at least 1
+ *   Ns    array of `rank` dimension lengths, each non-zero; the values are
+ *         copied, the caller keeps ownership of the array
+ *   sign  a negative value selects ffts_execute_nd_real as the transform,
+ *         any other value selects ffts_execute_nd_real_inv
+ *
+ * Returns the new plan (its destroy callback is ffts_free_nd_real), or NULL
+ * when the arguments are invalid, an allocation fails or a 1-D sub-plan
+ * cannot be created.
+ */
+FFTS_API ffts_plan_t*
+ffts_init_nd_real(int rank, size_t *Ns, int sign)
+{
+    int i;
+    size_t vol = 1;
+    size_t bufsize;
+    ffts_plan_t *p;
+
+    /* reject input that would index Ns[0] out of bounds or divide by zero
+       below; done before any allocation so no cleanup is required */
+    if (rank < 1 || !Ns) {
+        return NULL;
+    }
+
+    for (i = 0; i < rank; i++) {
+        if (!Ns[i]) {
+            return NULL;
+        }
+    }

-	ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
+    p = (ffts_plan_t*) calloc(1, sizeof(*p));
+    if (!p) {
+        return NULL;
+    }

-	if(sign < 0) p->transform = &ffts_execute_nd_real;
-	else         p->transform = &ffts_execute_nd_real_inv;
+    if (sign < 0) {
+        p->transform = &ffts_execute_nd_real;
+    } else {
+        p->transform = &ffts_execute_nd_real_inv;
+    }

-	p->destroy = &ffts_free_nd_real;
+    p->destroy = &ffts_free_nd_real;
+    p->rank    = rank;

-	p->rank = rank;
-	p->Ns = malloc(sizeof(size_t) * rank);
-	p->Ms = malloc(sizeof(size_t) * rank);
-	p->plans = malloc(sizeof(ffts_plan_t **) * rank);
-	int i;
-	for(i=0;i<rank;i++) {
-		p->Ns[i] = Ns[i];
-		vol *= Ns[i];	
-	}
-	p->buf = valloc(sizeof(float) * 2 * vol);
+    p->Ms = (size_t*) malloc(rank * sizeof(*p->Ms));
+    if (!p->Ms) {
+        goto cleanup;
+    }

-	for(i=0;i<rank;i++) {
-		p->Ms[i] = vol / p->Ns[i];
-		
-		p->plans[i] = NULL;
-		int k;
+    p->Ns = (size_t*) malloc(rank * sizeof(*p->Ns));
+    if (!p->Ns) {
+        goto cleanup;
+    }

-		if(sign < 0) {
-			for(k=1;k<i;k++) {
-				if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k];
-			}
-			if(!i)                p->plans[i] = ffts_init_1d_real(p->Ms[i], sign); 
-			else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); 
-		}else{
-  		for(k=0;k<i;k++) {
-  			if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k];
-  		}
-			if(i==rank-1)         p->plans[i] = ffts_init_1d_real(p->Ns[i], sign); 
-			else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign); 
-		}
-	}
-	if(sign < 0) {
-		for(i=1;i<rank;i++) {
-			p->Ns[i] = p->Ns[i] / 2 + 1;
-		}
-	}else{
-		for(i=0;i<rank-1;i++) {
-			p->Ms[i] = p->Ms[i] / 2 + 1;
-		}
-	}
-
-	p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
-	return p;
+    /* copy the dimensions and accumulate the total element count */
+    for (i = 0; i < rank; i++) {
+        p->Ns[i] = Ns[i];
+        vol *= Ns[i];
+    }
+
+    /* scratch size in floats; the non-negative sign case needs room for a
+       second region behind the first `vol` complex elements, which is where
+       ffts_execute_nd_real_inv places its intermediate output */
+    if (sign < 0) {
+        bufsize = 2 * vol;
+    } else {
+        bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol);
+    }
+
+    p->buf = ffts_aligned_malloc(bufsize * sizeof(float));
+    if (!p->buf) {
+        goto cleanup;
+    }
+
+    /* zero-initialised so that "no plan yet" can be tested below */
+    p->plans = (ffts_plan_t**) calloc(rank, sizeof(*p->plans));
+    if (!p->plans) {
+        goto cleanup;
+    }
+
+    for (i = 0; i < rank; i++) {
+        int k;
+
+        p->Ms[i] = vol / p->Ns[i];
+
+        if (sign < 0) {
+            if (!i) {
+                /* dimension 0 gets the real-valued 1-D plan */
+                p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
+            } else {
+                /* share an earlier 1-D plan of the same length; index 0 is
+                   skipped because it holds the real-valued plan */
+                for (k = 1; k < i; k++) {
+                    if (p->Ms[k] == p->Ms[i]) {
+                        p->plans[i] = p->plans[k];
+                        break;
+                    }
+                }
+
+                if (!p->plans[i]) {
+                    p->plans[i] = ffts_init_1d(p->Ms[i], sign);
+                }
+
+                /* applies to every dimension after the first, whether its
+                   plan was created or shared (as the replaced code did) */
+                p->Ns[i] = p->Ns[i] / 2 + 1;
+            }
+        } else {
+            if (i == rank - 1) {
+                /* the last dimension gets the real-valued 1-D plan */
+                p->plans[i] = ffts_init_1d_real(p->Ns[i], sign);
+            } else {
+                /* share an earlier 1-D plan of the same length */
+                for (k = 0; k < i; k++) {
+                    if (p->Ns[k] == p->Ns[i]) {
+                        p->plans[i] = p->plans[k];
+                        break;
+                    }
+                }
+
+                if (!p->plans[i]) {
+                    p->plans[i] = ffts_init_1d(p->Ns[i], sign);
+                }
+
+                /* applies to every dimension before the last, whether its
+                   plan was created or shared (as the replaced code did) */
+                p->Ms[i] = p->Ms[i] / 2 + 1;
+            }
+        }
+
+        if (!p->plans[i]) {
+            goto cleanup;
+        }
+    }
+
+    return p;
+
+cleanup:
+    ffts_free_nd_real(p);
+    return NULL;
 }

+/*
+ * Two-dimensional convenience wrapper: forwards to ffts_init_nd_real with
+ * rank 2 and the dimensions { N1, N2 }, returning whatever that call returns.
+ */
+FFTS_API ffts_plan_t*
+ffts_init_2d_real(size_t N1, size_t N2, int sign)
+{
+    size_t dims[2];

-ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) {
-	size_t Ns[2];
-	Ns[0] = N1;
-	Ns[1] = N2;
-	return ffts_init_nd_real(2, Ns, sign);
+    dims[1] = N2;
+    dims[0] = N1;
+
+    return ffts_init_nd_real(2, dims, sign);
 }
--- a/lib/ffts/src/ffts_real_nd.h
+++ b/lib/ffts/src/ffts_real_nd.h
@ -1,53 +1,50 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */

-#ifndef __FFTS_REAL_ND_H__
-#define __FFTS_REAL_ND_H__
+#ifndef FFTS_REAL_ND_H
+#define FFTS_REAL_ND_H

-#include <stdint.h>
-#include <stddef.h>
-#include <stdio.h>
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif

-#include "ffts_nd.h"
-#include "ffts_real.h"
 #include "ffts.h"
+#include <stddef.h>

-#ifdef HAVE_NEON 
-	#include <arm_neon.h>
-#endif
-#ifdef HAVE_SSE
-	#include <xmmintrin.h>
-#endif
+/*
+ * Create a plan for a real-valued transform over `rank` dimensions whose
+ * lengths are given in Ns. A negative sign selects ffts_execute_nd_real as
+ * the plan's transform, any other value ffts_execute_nd_real_inv.
+ * Returns NULL on failure.
+ */
+ffts_plan_t*
+ffts_init_nd_real(int rank, size_t *Ns, int sign);

-#endif
+/*
+ * Two-dimensional convenience wrapper: calls ffts_init_nd_real with rank 2
+ * and the dimensions { N1, N2 }.
+ */
+ffts_plan_t*
+ffts_init_2d_real(size_t N1, size_t N2, int sign);

+#endif /* FFTS_REAL_ND_H */
--- a/lib/ffts/src/ffts_small.c
+++ b/lib/ffts/src/ffts_small.c
@ -1,156 +0,0 @@
-/*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> 
- Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "ffts.h"
-#include "macros.h"
-
-#include <stdlib.h>
-
-#define DEBUG(x)
-
-#include "ffts_small.h"
-
- void firstpass_16_f(ffts_plan_t *  p, const void *  in, void *  out)
-{
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
-    V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
-    float *LUT8 = p->ws;
-
-    L_4_4(0, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
-    L_2_4(0, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
-    K_N(0, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
-    K_N(0, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
-    S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
-    K_N(0, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
-    S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
-}
-
- void firstpass_16_b(ffts_plan_t *  p, const void *  in, void *  out)
-{
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
-    V r0_1,r2_3,r4_5,r6_7,r8_9,r10_11,r12_13,r14_15;
-    float *LUT8 = p->ws;
-
-    L_4_4(1, din+0,din+16,din+8,din+24,&r0_1,&r2_3,&r8_9,&r10_11);
-    L_2_4(1, din+4,din+20,din+28,din+12,&r4_5,&r6_7,&r14_15,&r12_13);
-    K_N(1, VLD(LUT8),VLD(LUT8+4),&r0_1,&r2_3,&r4_5,&r6_7);
-    K_N(1, VLD(LUT8+8),VLD(LUT8+12),&r0_1,&r4_5,&r8_9,&r12_13);
-    S_4(r0_1,r4_5,r8_9,r12_13,dout+0,dout+8,dout+16,dout+24);
-    K_N(1, VLD(LUT8+16),VLD(LUT8+20),&r2_3,&r6_7,&r10_11,&r14_15);
-    S_4(r2_3,r6_7,r10_11,r14_15,dout+4,dout+12,dout+20,dout+28);
-}
-
-
- void firstpass_8_f(ffts_plan_t *p, const void *in, void *out)
-{
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
-    V r0_1, r2_3, r4_5, r6_7;
-    float *LUT8 = p->ws + p->ws_is[0];
-
-    L_4_2(0, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
-    K_N(0, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
-}
-
- void firstpass_8_b(ffts_plan_t *p, const void *in, void *out)
-{
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
-    V r0_1, r2_3, r4_5, r6_7;
-    float *LUT8 = p->ws + p->ws_is[0];
-
-    L_4_2(1, din, din+8, din+4, din+12, &r0_1, &r2_3, &r4_5, &r6_7);
-    K_N(1, VLD(LUT8), VLD(LUT8+4), &r0_1, &r2_3, &r4_5, &r6_7);
-    S_4(r0_1,r2_3,r4_5,r6_7,dout+0,dout+4,dout+8,dout+12);
-}
-
-
- void firstpass_4_f(ffts_plan_t *p, const void *in, void *out)
-{
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
-    cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
-    t0[0] = din[0]; t0[1] = din[1];
-    t1[0] = din[4]; t1[1] = din[5];
-    t2[0] = din[2]; t2[1] = din[3];
-    t3[0] = din[6]; t3[1] = din[7];
-        
-    t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
-    t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
-    t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
-    t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
-
-    dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
-    dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
-    dout[2] = t5[0] + t7[1]; dout[3] = t5[1] - t7[0];
-    dout[6] = t5[0] - t7[1]; dout[7] = t5[1] + t7[0];
-}
-
- void firstpass_4_b(ffts_plan_t *p, const void *in, void *out)
-{
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
-    cdata_t t0, t1, t2, t3, t4, t5, t6, t7;
-    t0[0] = din[0]; t0[1] = din[1];
-    t1[0] = din[4]; t1[1] = din[5];
-    t2[0] = din[2]; t2[1] = din[3];
-    t3[0] = din[6]; t3[1] = din[7];
-        
-    t4[0] = t0[0] + t1[0]; t4[1] = t0[1] + t1[1];
-    t5[0] = t0[0] - t1[0]; t5[1] = t0[1] - t1[1];
-    t6[0] = t2[0] + t3[0]; t6[1] = t2[1] + t3[1];
-    t7[0] = t2[0] - t3[0]; t7[1] = t2[1] - t3[1];
-
-    dout[0] = t4[0] + t6[0]; dout[1] = t4[1] + t6[1];
-    dout[4] = t4[0] - t6[0]; dout[5] = t4[1] - t6[1];
-    dout[2] = t5[0] - t7[1]; dout[3] = t5[1] + t7[0];
-    dout[6] = t5[0] + t7[1]; dout[7] = t5[1] - t7[0];
-}
-
- void firstpass_2(ffts_plan_t *p, const void *in, void *out)
-{
-    const data_t *din = (const data_t *)in;
-    data_t *dout = (data_t *)out;
-    cdata_t t0, t1, r0,r1;
-    t0[0] = din[0]; t0[1] = din[1];
-    t1[0] = din[2]; t1[1] = din[3];
-    r0[0] = t0[0] + t1[0];
-    r0[1] = t0[1] + t1[1];
-    r1[0] = t0[0] - t1[0];
-    r1[1] = t0[1] - t1[1];
-    dout[0] = r0[0]; dout[1] = r0[1];
-    dout[2] = r1[0]; dout[3] = r1[1];
-}
--- a/lib/ffts/src/ffts_small.h
+++ b/lib/ffts/src/ffts_small.h
@ -1,13 +0,0 @@
-#ifndef __FFTS_SMALL_H__
-#define __FFTS_SMALL_H__
-
-
-void firstpass_16_f(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_16_b(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_8_f(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_8_b(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_4_f(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_4_b(ffts_plan_t *  p, const void *  in, void *  out);
-void firstpass_2(ffts_plan_t *  p, const void *  in, void *  out);
-
-#endif
--- a/lib/ffts/src/ffts_static.c
+++ b/lib/ffts/src/ffts_static.c
--- a/lib/ffts/src/ffts_static.h
+++ b/lib/ffts/src/ffts_static.h
@ -1,46 +1,91 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */

-#ifndef __FFTS_STATIC_H__
-#define __FFTS_STATIC_H__
+#ifndef FFTS_STATIC_H
+#define FFTS_STATIC_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif

 #include "ffts.h"
-#include "neon.h"

-void ffts_static_rec_f(ffts_plan_t *p, float *data, size_t N) ;
-void ffts_static_transform_f(ffts_plan_t *p, const void *in, void *out);
+void
+ffts_small_2_32f(ffts_plan_t *p, const void *in, void *out);

-void ffts_static_rec_i(ffts_plan_t *p, float *data, size_t N) ;
-void ffts_static_transform_i(ffts_plan_t *p, const void *in, void *out);
+void
+ffts_small_2_64f(ffts_plan_t *p, const void *in, void *out);

-#endif
+void
+ffts_small_forward4_32f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_forward4_64f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_backward4_32f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_backward4_64f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_forward8_32f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_forward8_64f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_backward8_32f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_backward8_64f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_forward16_32f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_forward16_64f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_backward16_32f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_small_backward16_64f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_static_transform_f_32f(ffts_plan_t *p, const void *in, void *out);
+
+void
+ffts_static_transform_i_32f(ffts_plan_t *p, const void *in, void *out);
+
+#endif /* FFTS_STATIC_H */
--- a/lib/ffts/src/ffts_transpose.c
+++ b/lib/ffts/src/ffts_transpose.c
@ -0,0 +1,194 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_transpose.h"
+#include "ffts_internal.h"
+
+#ifdef HAVE_NEON
+#include "neon.h"
+#include <arm_neon.h>
+#elif HAVE_SSE2
+#include <emmintrin.h>
+#endif
+
+#define TSIZE 8
+
+/*
+ * Transpose a row-major matrix of 64-bit elements.
+ *
+ *   in   source matrix: h rows of w elements each
+ *   out  destination matrix: w rows of h elements each; a separate buffer,
+ *        in-place operation is not supported
+ *   w    number of elements per source row
+ *   h    number of source rows
+ *
+ * In the SSE2 and portable implementations below, after the call
+ * out[x*h + y] == in[y*w + x] for all 0 <= x < w and 0 <= y < h, and a
+ * non-positive w or h results in no work. The NEON build delegates to
+ * neon_transpose8, which is defined elsewhere.
+ */
+void
+ffts_transpose(uint64_t *in, uint64_t *out, int w, int h)
+{
+#ifdef HAVE_NEON
+#if 0
+    neon_transpose4(in, out, w, h);
+#else
+    neon_transpose8(in, out, w, h);
+#endif
+#elif HAVE_SSE2
+    uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE];
+    const int tw = w / TSIZE; /* number of full tiles per row    */
+    const int th = h / TSIZE; /* number of full tiles per column */
+    int tx, ty;
+    int x, y;
+
+    for (ty = 0; ty < th; ty++) {
+        for (tx = 0; tx < tw; tx++) {
+            uint64_t *ip0 = in + w*TSIZE*ty + tx*TSIZE;
+            uint64_t *op0 = tmp;
+
+            /* Transpose one TSIZE x TSIZE tile into tmp, two source columns
+               per pass. Rows of `in` start at element offsets k*w, which are
+               not 16-byte aligned when w is odd, hence the unaligned loads. */
+            for (y = 0; y < TSIZE; y += 2) {
+                __m128d q0 = _mm_loadu_pd((double*)(ip0 + 0*w));
+                __m128d q1 = _mm_loadu_pd((double*)(ip0 + 1*w));
+                __m128d q2 = _mm_loadu_pd((double*)(ip0 + 2*w));
+                __m128d q3 = _mm_loadu_pd((double*)(ip0 + 3*w));
+                __m128d q4 = _mm_loadu_pd((double*)(ip0 + 4*w));
+                __m128d q5 = _mm_loadu_pd((double*)(ip0 + 5*w));
+                __m128d q6 = _mm_loadu_pd((double*)(ip0 + 6*w));
+                __m128d q7 = _mm_loadu_pd((double*)(ip0 + 7*w));
+
+                /* even t: column y of a row pair, odd t: column y + 1 */
+                __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0));
+                __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1));
+                __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0));
+                __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1));
+                __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0));
+                __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1));
+                __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0));
+                __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1));
+
+                ip0 += 2;
+
+                /* tmp is 64-byte aligned, aligned stores are safe here */
+                _mm_store_pd((double*)(op0 + 0        ), t0);
+                _mm_store_pd((double*)(op0 + 0 + TSIZE), t1);
+                _mm_store_pd((double*)(op0 + 2        ), t2);
+                _mm_store_pd((double*)(op0 + 2 + TSIZE), t3);
+                _mm_store_pd((double*)(op0 + 4        ), t4);
+                _mm_store_pd((double*)(op0 + 4 + TSIZE), t5);
+                _mm_store_pd((double*)(op0 + 6        ), t6);
+                _mm_store_pd((double*)(op0 + 6 + TSIZE), t7);
+
+                op0 += 2*TSIZE;
+            }
+
+            /* Copy the transposed tile row by row to its place in `out`.
+               Rows of `out` start at element offsets k*h, which are not
+               16-byte aligned when h is odd, hence the unaligned stores. */
+            op0 = out + h*tx*TSIZE + ty*TSIZE;
+            ip0 = tmp;
+            for (y = 0; y < TSIZE; y += 1) {
+                __m128d q0 = _mm_load_pd((double*)(ip0 + 0));
+                __m128d q1 = _mm_load_pd((double*)(ip0 + 2));
+                __m128d q2 = _mm_load_pd((double*)(ip0 + 4));
+                __m128d q3 = _mm_load_pd((double*)(ip0 + 6));
+
+                _mm_storeu_pd((double*)(op0 + 0), q0);
+                _mm_storeu_pd((double*)(op0 + 2), q1);
+                _mm_storeu_pd((double*)(op0 + 4), q2);
+                _mm_storeu_pd((double*)(op0 + 6), q3);
+
+                op0 += h;
+                ip0 += TSIZE;
+            }
+        }
+    }
+
+    /* columns right of the last full tile, for the rows covered by tiles */
+    for (y = 0; y < th*TSIZE; y++) {
+        for (x = tw*TSIZE; x < w; x++) {
+            out[x*h + y] = in[y*w + x];
+        }
+    }
+
+    /* rows below the last full tile, all columns */
+    for (y = th*TSIZE; y < h; y++) {
+        for (x = 0; x < w; x++) {
+            out[x*h + y] = in[y*w + x];
+        }
+    }
+#else
+    const int bh = 8;
+    /* number of leading rows that can be handled eight at a time */
+    const int full_h = (h >= bh) ? h - (h % bh) : 0;
+    int i, j;
+
+    /* blocked part: move eight vertically adjacent source elements into
+       eight consecutive destination elements */
+    for (i = 0; i < full_h; i += bh) {
+        for (j = 0; j < w; j++) {
+            uint64_t const *ib = &in[w*i + j];
+            uint64_t *ob = &out[h*j + i];
+
+            uint64_t s_0_0 = ib[0*w + 0];
+            uint64_t s_1_0 = ib[1*w + 0];
+            uint64_t s_2_0 = ib[2*w + 0];
+            uint64_t s_3_0 = ib[3*w + 0];
+            uint64_t s_4_0 = ib[4*w + 0];
+            uint64_t s_5_0 = ib[5*w + 0];
+            uint64_t s_6_0 = ib[6*w + 0];
+            uint64_t s_7_0 = ib[7*w + 0];
+
+            ob[0*h + 0] = s_0_0;
+            ob[0*h + 1] = s_1_0;
+            ob[0*h + 2] = s_2_0;
+            ob[0*h + 3] = s_3_0;
+            ob[0*h + 4] = s_4_0;
+            ob[0*h + 5] = s_5_0;
+            ob[0*h + 6] = s_6_0;
+            ob[0*h + 7] = s_7_0;
+        }
+    }
+
+    /* remaining rows (fewer than eight), each element written exactly once */
+    for (j = 0; j < w; j++) {
+        for (i = full_h; i < h; i++) {
+            out[j*h + i] = in[i*w + j];
+        }
+    }
+#endif
+}
--- a/lib/ffts/src/ffts_transpose.h
+++ b/lib/ffts/src/ffts_transpose.h
@ -0,0 +1,46 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_TRANSPOSE_H
+#define FFTS_TRANSPOSE_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts_internal.h"
+
+/*
+ * Transpose a row-major matrix of 64-bit elements: `in` holds h rows of w
+ * elements, `out` receives w rows of h elements, i.e. the element at
+ * in[y*w + x] is stored to out[x*h + y].
+ *
+ * NOTE(review): the SIMD variants in ffts_transpose.c work on 8x8 tiles;
+ * confirm their handling of sizes that are not multiples of 8.
+ */
+void
+ffts_transpose(uint64_t *in, uint64_t *out, int w, int h);
+
+#endif /* FFTS_TRANSPOSE_H */
--- a/lib/ffts/src/ffts_trig.c
+++ b/lib/ffts/src/ffts_trig.c
@ -0,0 +1,628 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#include "ffts_trig.h"
+#include "ffts_dd.h"
+
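+/* 1/(2*cos(pow(2,-p)*pi)) stored as (hi, lo) pairs of doubles, one pair per
+ * row; each double is spelled as two 32-bit words, low word first */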
+static const FFTS_ALIGN(16) unsigned int half_secant[132] = {
+    0x00000000, 0x3fe00000, 0xc9be45de, 0x3be3bd3c,
+    0x00000000, 0x3fe00000, 0xc9be45de, 0x3c03bd3c,
+    0x00000000, 0x3fe00000, 0xc9be45de, 0x3c23bd3c,
+    0x00000000, 0x3fe00000, 0xc9be45de, 0x3c43bd3c,
+    0x00000000, 0x3fe00000, 0xc9be45de, 0x3c63bd3c,
+    0x00000000, 0x3fe00000, 0xc9be45df, 0x3c83bd3c,
+    0x00000001, 0x3fe00000, 0x4df22efd, 0x3c7de9e6,
+    0x00000005, 0x3fe00000, 0x906e8725, 0xbc60b0cd,
+    0x00000014, 0x3fe00000, 0x906e8357, 0xbc80b0cd,
+    0x0000004f, 0x3fe00000, 0x0dce83c9, 0xbc5619b2,
+    0x0000013c, 0x3fe00000, 0x0dc6e79a, 0xbc7619b2,
+    0x000004ef, 0x3fe00000, 0xe4af1240, 0x3c83cc9b,
+    0x000013bd, 0x3fe00000, 0x2d14c08a, 0x3c7e64df,
+    0x00004ef5, 0x3fe00000, 0x47a85465, 0xbc59b20b,
+    0x00013bd4, 0x3fe00000, 0xab79c897, 0xbc79b203,
+    0x0004ef4f, 0x3fe00000, 0x15019a96, 0x3c79386b,
+    0x0013bd3d, 0x3fe00000, 0x7d6dbf4b, 0xbc7b16b7,
+    0x004ef4f3, 0x3fe00000, 0xf30832e0, 0x3c741ee4,
+    0x013bd3cd, 0x3fe00000, 0xd3bcd4bb, 0xbc83f41e,
+    0x04ef4f34, 0x3fe00000, 0xdd75aebb, 0xbc82ef06,
+    0x13bd3cde, 0x3fe00000, 0xb2b41b3d, 0x3c52d979,
+    0x4ef4f46c, 0x3fe00000, 0x4f0fb458, 0xbc851db3,
+    0x3bd3e0e7, 0x3fe00001, 0x8a0ce3f0, 0x3c58dbab,
+    0xef507722, 0x3fe00004, 0x2a8ec295, 0x3c83e351,
+    0xbd5114f9, 0x3fe00013, 0xc4c0d92d, 0x3c8b3ca4,
+    0xf637de7d, 0x3fe0004e, 0xb74de729, 0x3c45974e,
+    0xe8190891, 0x3fe0013b, 0x26edf4da, 0xbc814c20,
+    0x9436640e, 0x3fe004f0, 0xe2b34b50, 0x3c8091ab,
+    0x9c61d971, 0x3fe013d1, 0x6ce01b8e, 0x3c7f7df7,
+    0xd17cba53, 0x3fe0503e, 0x74ad7633, 0xbc697609,
+    0x7bdb3895, 0x3fe1517a, 0x82f9091b, 0xbc8008d1,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000
+};
+
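+/* cos(pow(2,-p)*pi), sin(pow(2,-p)*pi); each group of eight words holds four
+ * doubles in the order cos hi, sin hi, cos lo, sin lo, each double spelled
+ * as two 32-bit words, low word first */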
+static const FFTS_ALIGN(16) unsigned int cos_sin_pi_table[264] = {
+    0x00000000, 0x3ff00000, 0x54442d18, 0x3df921fb,
+    0xc9be45de, 0xbbf3bd3c, 0xbb77974f, 0x3a91a390,
+    0x00000000, 0x3ff00000, 0x54442d18, 0x3e0921fb,
+    0xc9be45de, 0xbc13bd3c, 0x54a14928, 0x3aa19bd0,
+    0x00000000, 0x3ff00000, 0x54442d18, 0x3e1921fb,
+    0xc9be45de, 0xbc33bd3c, 0xb948108a, 0x3ab17cce,
+    0x00000000, 0x3ff00000, 0x54442d18, 0x3e2921fb,
+    0xc9be45de, 0xbc53bd3c, 0x4be32e14, 0x3ac100c8,
+    0x00000000, 0x3ff00000, 0x54442d18, 0x3e3921fb,
+    0xc9be45de, 0xbc73bd3c, 0x2c9f4879, 0x3ace215d,
+    0xffffffff, 0x3fefffff, 0x54442d18, 0x3e4921fb,
+    0x6c837443, 0x3c888586, 0x0005f376, 0x3acd411f,
+    0xfffffffe, 0x3fefffff, 0x54442d18, 0x3e5921fb,
+    0x4df22ef1, 0xbc8de9e6, 0x9937209e, 0xbaf7b153,
+    0xfffffff6, 0x3fefffff, 0x54442d16, 0x3e6921fb,
+    0x906e88aa, 0x3c70b0cd, 0xfe19968a, 0xbb03b7c0,
+    0xffffffd9, 0x3fefffff, 0x54442d0e, 0x3e7921fb,
+    0xdf22ed26, 0xbc8e9e64, 0x8d1b6ffb, 0xbaee8bb4,
+    0xffffff62, 0x3fefffff, 0x54442cef, 0x3e8921fb,
+    0x0dd18f0f, 0x3c6619b2, 0x7f2b20fb, 0xbb00e133,
+    0xfffffd88, 0x3fefffff, 0x54442c73, 0x3e9921fb,
+    0x0dd314b2, 0x3c8619b2, 0x619fdf6e, 0xbb174e98,
+    0xfffff621, 0x3fefffff, 0x54442a83, 0x3ea921fb,
+    0x3764acf5, 0x3c8866c8, 0xf5b2407f, 0xbb388215,
+    0xffffd886, 0x3fefffff, 0x544422c2, 0x3eb921fb,
+    0x20e7a944, 0xbc8e64df, 0x7b9b9f23, 0x3b5a0961,
+    0xffff6216, 0x3fefffff, 0x544403c1, 0x3ec921fb,
+    0x52ee25ea, 0x3c69b20e, 0x4df6a86a, 0xbb5999d9,
+    0xfffd8858, 0x3fefffff, 0x544387ba, 0x3ed921fb,
+    0xd8910ead, 0x3c89b20f, 0x0809d04d, 0x3b77d9db,
+    0xfff62162, 0x3fefffff, 0x544197a1, 0x3ee921fb,
+    0x438d3925, 0xbc8937a8, 0xa5d27f7a, 0xbb858b02,
+    0xffd88586, 0x3fefffff, 0x5439d73a, 0x3ef921fb,
+    0x94b3ddd2, 0x3c8b22e4, 0xf8a3b73d, 0xbb863c7f,
+    0xff62161a, 0x3fefffff, 0x541ad59e, 0x3f0921fb,
+    0x7ea469b2, 0xbc835c13, 0xb8cee262, 0x3bae9860,
+    0xfd885867, 0x3fefffff, 0x539ecf31, 0x3f1921fb,
+    0x23a32e63, 0xbc77d556, 0xfcd23a30, 0x3b96b111,
+    0xf621619c, 0x3fefffff, 0x51aeb57c, 0x3f2921fb,
+    0xbbbd8fe6, 0xbc87507d, 0x4916c435, 0xbbca6e1d,
+    0xd8858675, 0x3fefffff, 0x49ee4ea6, 0x3f3921fb,
+    0x54748eab, 0xbc879f0e, 0x744a453e, 0x3bde894d,
+    0x62161a34, 0x3fefffff, 0x2aecb360, 0x3f4921fb,
+    0xb1f9b9c4, 0xbc6136dc, 0x7e566b4c, 0x3be87615,
+    0x88586ee6, 0x3feffffd, 0xaee6472e, 0x3f5921fa,
+    0xf173ae5b, 0x3c81af64, 0x284a9df8, 0xbbfee52e,
+    0x21621d02, 0x3feffff6, 0xbecca4ba, 0x3f6921f8,
+    0xebc82813, 0xbc76acfc, 0x7bcab5b2, 0x3c02ba40,
+    0x858e8a92, 0x3fefffd8, 0xfe670071, 0x3f7921f0,
+    0x1883bcf7, 0x3c8359c7, 0xfe6b7a9b, 0x3bfab967,
+    0x169b92db, 0x3fefff62, 0xfcdec784, 0x3f8921d1,
+    0xc81fbd0d, 0x3c85dda3, 0xbe836d9d, 0x3c29878e,
+    0x6084cd0d, 0x3feffd88, 0xf7a3667e, 0x3f992155,
+    0x4556e4cb, 0xbc81354d, 0x091a0130, 0xbbfb1d63,
+    0xe3796d7e, 0x3feff621, 0xf10dd814, 0x3fa91f65,
+    0x2e24aa15, 0xbc6c57bc, 0x0d569a90, 0xbc2912bd,
+    0xa3d12526, 0x3fefd88d, 0xbc29b42c, 0x3fb917a6,
+    0x378811c7, 0xbc887df6, 0xd26ed688, 0xbc3e2718,
+    0xcff75cb0, 0x3fef6297, 0x3c69a60b, 0x3fc8f8b8,
+    0x2a361fd3, 0x3c756217, 0xb9ff8d82, 0xbc626d19,
+    0xcf328d46, 0x3fed906b, 0xa6aea963, 0x3fd87de2,
+    0x10231ac2, 0x3c7457e6, 0xd3d5a610, 0xbc672ced,
+    0x667f3bcd, 0x3fe6a09e, 0x667f3bcd, 0x3fe6a09e,
+    0x13b26456, 0xbc8bdd34, 0x13b26456, 0xbc8bdd34,
+    0x00000000, 0x00000000, 0x00000000, 0x3ff00000,
+    0x00000000, 0x00000000, 0x00000000, 0x00000000
+};
+
+/*
+ * Fill table[0 .. table_size-1] with single precision (cosine, -sine) pairs
+ * of equally spaced angles starting from angle zero. The size does not have
+ * to be a power of two.
+ *
+ * Entry i holds (cos, -sin) of the i-th angle and entry table_size - i holds
+ * the mirrored pair (sin, -cos), so the recurrence only walks the first half
+ * of the table. According to the constants below the angular step is
+ * M_PI_2 / table_size.
+ *
+ * Returns 0 on success and -1 when table is NULL or table_size is not
+ * positive.
+ */
+int
+ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, int table_size)
+{
+    double alpha, beta;
+    double c[2], s[2];
+    double x, z;
+    int i;
+
+    /* a negative size used to slip through the old "!table_size" test and
+     * still reported success after writing into the table */
+    if (!table || table_size <= 0) {
+        return -1;
+    }
+
+    /* the first */
+    table[0][0] =  1.0f;
+    table[0][1] = -0.0f;
+
+    if (FFTS_UNLIKELY(table_size == 1)) {
+        goto exit;
+    }
+
+    if (FFTS_UNLIKELY(table_size == 2)) {
+        /* skip over; only the mid point at index 1 is left to write */
+        i = 1;
+        goto mid_point;
+    }
+
+    /* polynomial approximations calculated using Sollya */
+    x = 1.0 / table_size;
+    z = x * x;
+
+    /* alpha = 2 * sin(M_PI_4 / m) * sin(M_PI_4 / m) */
+    alpha = x * (1.1107207345394952717884501203293686870741139540138 +
+        z * (-0.114191397993514079911985272577099412137126013186879 +
+        z * 3.52164670852685621720746817665316575239342815885835e-3));
+    alpha = alpha * alpha;
+
+    /* beta = sin(M_PI_2 / m) */
+    beta = x * (1.57079632679489455959753740899031981825828552246094 +
+        z * (-0.64596409735041482313988581154262647032737731933593 +
+        z * 7.9690915468332887416913479228242067620158195495605e-2));
+
+    /* cos(0) = 1.0, sin(0) = 0.0 */
+    c[0] = 1.0;
+    s[0] = 0.0;
+
+    /* generate sine and cosine tables with maximum error less than 1 ULP */
+    for (i = 1; i < (table_size + 1)/2; i++) {
+        /* advance the previous (cos, sin) pair by one angular step */
+        c[1] = c[0] - ((alpha * c[0]) + (beta * s[0]));
+        s[1] = s[0] - ((alpha * s[0]) - (beta * c[0]));
+
+        /* store the pair and its mirror image from the other table end */
+        table[i          + 0][0] = (float)  c[1];
+        table[i          + 0][1] = (float) -s[1];
+        table[table_size - i][0] = (float)  s[1];
+        table[table_size - i][1] = (float) -c[1];
+
+        c[0] = c[1];
+        s[0] = s[1];
+    }
+
+    /* an odd sized table has no separate mid point entry */
+    if (FFTS_UNLIKELY(table_size & 1)) {
+        goto exit;
+    }
+
+mid_point:
+    /* here i == table_size / 2 */
+    table[i][0] =  0.70710677f;
+    table[i][1] = -0.70710677f;
+
+exit:
+    return 0;
+}
+
+/* Oscar Buneman's method for generating a sequence of sines and cosines.
+*  Expired US Patent 4,878,187 A
+*
+*  D. Potts, G. Steidl, M. Tasche, Numerical stability of fast
+*  trigonometric transforms — a worst case study,
+*  J. Concrete Appl. Math. 1 (2003) 1–36
+*
+*  O. Buneman, Stable on–line creation of sines and cosines of
+*  successive angles, Proc. IEEE 75, 1434 – 1435 (1987).
+*/
+#if HAVE_SSE2
+/*
+ * SSE2 variant. Fills table[0 .. table_size-1] with single precision
+ * (cosine, -sine) pairs; entry table_size - i mirrors entry i as
+ * (sine, -cosine).
+ *
+ * Returns 0 on success, -1 if table is NULL, table_size is zero or
+ * table_size is not a power of two.
+ */
+int
+ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
+{
+    /* OR-ing with this constant sets the sign bit of the second element only */
+    static const __m128d sign_swap = { 0.0, -0.0 };
+    const __m128d *FFTS_RESTRICT ct;
+    const double *FFTS_RESTRICT hs;
+    __m128d FFTS_ALIGN(16) w[32];
+    __m128d FFTS_ALIGN(16) h[32];
+    int i, log_2, offset;
+
+    /* size must be a power of two */
+    if (!table || !table_size || (table_size & (table_size - 1))) {
+        return -1;
+    }
+
+    /* the first */
+    table[0][0] =  1.0f;
+    table[0][1] = -0.0f;
+
+    if (FFTS_UNLIKELY(table_size == 1)) {
+        goto exit;
+    }
+
+    if (FFTS_UNLIKELY(table_size == 2)) {
+        /* skip over */
+        i = 1;
+        goto mid_point;
+    }
+
+    /* calculate table offset */
+    FFTS_ASSUME(table_size/2 > 1);
+    log_2 = ffts_ctzl(table_size);
+    FFTS_ASSUME(log_2 > 1);
+    offset = 32 - log_2;
+    ct = (const __m128d*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+    hs = (const double*) &half_secant[4 * offset];
+
+    /* initialize from lookup table */
+    for (i = 0; i <= log_2; i++) {
+        /* every second __m128d: the first two doubles of each table row */
+        w[i] = ct[2*i];
+
+        /* duplicate the high part */
+        h[i] = _mm_set1_pd(hs[2*i]);
+    }
+
+    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+    for (i = 1; i < table_size/2; i++) {
+        /* calculate trailing zeros in index */
+        log_2 = ffts_ctzl(i);
+
+        /* note that storing is not 16 byte aligned */
+        _mm_storel_pi((__m64*) &table[i + 0],
+            _mm_cvtpd_ps(_mm_or_pd(w[log_2], sign_swap)));
+        /* mirrored entry: swap the two elements before forcing the sign */
+        _mm_storel_pi((__m64*) &table[table_size - i], _mm_cvtpd_ps(
+            _mm_or_pd(_mm_shuffle_pd(w[log_2], w[log_2], 1), sign_swap)));
+
+        /* skip and find next trailing zero */
+        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+        /* refresh the slot just consumed from two other slots, scaled by
+         * the half secant that belongs to this slot */
+        w[log_2] = _mm_mul_pd(h[log_2], _mm_add_pd(w[log_2 + 1], w[offset]));
+    }
+
+mid_point:
+    /* here i == table_size / 2 */
+    table[i][0] =  0.70710677f;
+    table[i][1] = -0.70710677f;
+
+exit:
+    return 0;
+}
+
+/*
+ * SSE2 variant. Fills table[0 .. table_size-1] with double precision
+ * (cosine, -sine) pairs; entry table_size - i mirrors entry i as
+ * (sine, -cosine). The recurrence is carried out on hi/lo pairs and only
+ * the hi parts are stored.
+ *
+ * Returns 0 on success, -1 if table is NULL, table_size is zero or
+ * table_size is not a power of two.
+ */
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
+{
+    /* OR-ing with this constant sets the sign bit of the second element only */
+    static const __m128d sign_swap = { 0.0, -0.0 };
+    const struct ffts_dd2_t *FFTS_RESTRICT ct;
+    const double *FFTS_RESTRICT hs;
+    struct ffts_dd2_t FFTS_ALIGN(16) w[32];
+    struct ffts_dd2_t FFTS_ALIGN(16) h[32];
+    struct ffts_dd2_t FFTS_ALIGN(16) sum;
+    int i, log_2, offset;
+
+    /* size must be a power of two */
+    if (!table || !table_size || (table_size & (table_size - 1))) {
+        return -1;
+    }
+
+    /* the first */
+    table[0][0] =  1.0;
+    table[0][1] = -0.0;
+
+    if (FFTS_UNLIKELY(table_size == 1)) {
+        goto exit;
+    }
+
+    if (FFTS_UNLIKELY(table_size == 2)) {
+        /* skip over */
+        i = 1;
+        goto mid_point;
+    }
+
+    /* calculate table offset */
+    FFTS_ASSUME(table_size/2 > 1);
+    log_2 = ffts_ctzl(table_size);
+    FFTS_ASSUME(log_2 > 1);
+    offset = 32 - log_2;
+    ct = (const struct ffts_dd2_t*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+    hs = (const double*) &half_secant[4 * offset];
+
+    /* initialize from lookup table */
+    for (i = 0; i <= log_2; i++) {
+        /* NOTE(review): assumes one ffts_dd2_t spans one full table row
+         * (eight 32-bit words) -- confirm against ffts_dd.h */
+        w[i] = ct[i];
+
+        /* duplicate the high and low parts */
+        h[i].hi = _mm_set1_pd(hs[2*i + 0]);
+        h[i].lo = _mm_set1_pd(hs[2*i + 1]);
+    }
+
+    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+    for (i = 1; i < table_size/2; i++) {
+        /* calculate trailing zeros in index */
+        log_2 = ffts_ctzl(i);
+
+        /* result of ffts_dd_mul_dd is normalized */
+        /* NOTE(review): _mm_store_pd is an aligned store, so table is
+         * presumably expected to be 16 byte aligned -- confirm with callers */
+        _mm_store_pd((double*) &table[i + 0],
+            _mm_or_pd(w[log_2].hi, sign_swap));
+        _mm_store_pd((double*) &table[table_size - i],
+            _mm_or_pd(_mm_shuffle_pd(w[log_2].hi, w[log_2].hi, 1), sign_swap));
+
+        /* skip and find next trailing zero */
+        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+        /* refresh the slot just consumed from two other slots, scaled by
+         * the half secant that belongs to this slot */
+        sum = ffts_dd2_add_dd2_unnormalized(&w[log_2 + 1], &w[offset]);
+        w[log_2] = ffts_dd2_mul_dd2(&h[log_2], &sum);
+    }
+
+mid_point:
+    /* here i == table_size / 2 */
+    table[i][0] =  0.707106781186547524;
+    table[i][1] = -0.707106781186547524;
+
+exit:
+    return 0;
+}
+#else
+/*
+ * Portable (non-SSE2) variant. Fills table[0 .. table_size-1] with single
+ * precision (cosine, -sine) pairs; entry table_size - i mirrors entry i as
+ * (sine, -cosine). The recurrence itself runs in double precision.
+ *
+ * Returns 0 on success, -1 if table is NULL, table_size is zero or
+ * table_size is not a power of two.
+ */
+int
+ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size)
+{
+    const ffts_cpx_64f *FFTS_RESTRICT ct;
+    const double *FFTS_RESTRICT hs;
+    ffts_cpx_64f FFTS_ALIGN(16) w[32];
+    int i, log_2, offset;
+
+    /* size must be a power of two */
+    if (!table || !table_size || (table_size & (table_size - 1))) {
+        return -1;
+    }
+
+    /* the first */
+    table[0][0] =  1.0f;
+    table[0][1] = -0.0f;
+
+    if (FFTS_UNLIKELY(table_size == 1)) {
+        goto exit;
+    }
+
+    if (FFTS_UNLIKELY(table_size == 2)) {
+        /* skip over */
+        i = 1;
+        goto mid_point;
+    }
+
+    /* calculate table offset */
+    FFTS_ASSUME(table_size/2 > 1);
+    log_2 = ffts_ctzl(table_size);
+    FFTS_ASSUME(log_2 > 1);
+    offset = 32 - log_2;
+    ct = (const ffts_cpx_64f*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+    hs = (const double*) &half_secant[4 * offset];
+
+    /* initialize from lookup table */
+    for (i = 0; i <= log_2; i++) {
+        /* every second pair: the first two doubles of each table row */
+        w[i][0] = ct[2*i][0];
+        w[i][1] = ct[2*i][1];
+    }
+
+    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+    for (i = 1; i < table_size/2; i++) {
+        /* calculate trailing zeros in index */
+        log_2 = ffts_ctzl(i);
+
+        /* store the pair and its mirror image from the other table end */
+        table[i          + 0][0] = (float)  w[log_2][0];
+        table[i          + 0][1] = (float) -w[log_2][1];
+        table[table_size - i][0] = (float)  w[log_2][1];
+        table[table_size - i][1] = (float) -w[log_2][0];
+
+        /* skip and find next trailing zero */
+        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+        /* refresh the slot just consumed; only the first double of each
+         * half secant pair is used here */
+        w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
+        w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+    }
+
+mid_point:
+    /* here i == table_size / 2 */
+    table[i][0] =  0.70710677f;
+    table[i][1] = -0.70710677f;
+
+exit:
+    return 0;
+}
+
+/*
+ * Portable (non-SSE2) variant. Fills table[0 .. table_size-1] with double
+ * precision (cosine, -sine) pairs; entry table_size - i mirrors entry i as
+ * (sine, -cosine). The recurrence is carried out on hi/lo pairs and only
+ * the hi parts are stored.
+ *
+ * Returns 0 on success, -1 if table is NULL, table_size is zero or
+ * table_size is not a power of two.
+ */
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size)
+{
+    const struct ffts_dd_t *FFTS_RESTRICT ct;
+    const struct ffts_dd_t *FFTS_RESTRICT hs;
+    /* w[k][0] tracks the cosine and w[k][1] the sine, each as a hi/lo pair */
+    struct ffts_dd_t FFTS_ALIGN(16) w[32][2];
+    int i, log_2, offset;
+
+    /* size must be a power of two */
+    if (!table || !table_size || (table_size & (table_size - 1))) {
+        return -1;
+    }
+
+    /* the first */
+    table[0][0] =  1.0;
+    table[0][1] = -0.0;
+
+    if (FFTS_UNLIKELY(table_size == 1)) {
+        goto exit;
+    }
+
+    if (FFTS_UNLIKELY(table_size == 2)) {
+        /* skip over */
+        i = 1;
+        goto mid_point;
+    }
+
+    /* calculate table offset */
+    FFTS_ASSUME(table_size/2 > 1);
+    log_2 = ffts_ctzl(table_size);
+    FFTS_ASSUME(log_2 > 1);
+    offset = 32 - log_2;
+    ct = (const struct ffts_dd_t*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+    hs = (const struct ffts_dd_t*) &half_secant[4 * offset];
+
+    /* initialize from lookup table */
+    for (i = 0; i <= log_2; i++) {
+        /* regroup the table row so that each value gets its own hi/lo pair;
+         * NOTE(review): relies on ffts_dd_t being two consecutive doubles
+         * (hi first) -- confirm against ffts_dd.h */
+        w[i][0].hi = ct[2*i + 0].hi;
+        w[i][0].lo = ct[2*i + 1].hi;
+        w[i][1].hi = ct[2*i + 0].lo;
+        w[i][1].lo = ct[2*i + 1].lo;
+    }
+
+    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+    for (i = 1; i < table_size/2; i++) {
+        /* calculate trailing zeros in index */
+        log_2 = ffts_ctzl(i);
+
+        /* result of ffts_dd_mul_dd is normalized */
+        table[i          + 0][0] =  w[log_2][0].hi;
+        table[i          + 0][1] = -w[log_2][1].hi;
+        table[table_size - i][0] =  w[log_2][1].hi;
+        table[table_size - i][1] = -w[log_2][0].hi;
+
+        /* skip and find next trailing zero */
+        offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+        /* refresh the slot just consumed from two other slots, scaled by
+         * the half secant that belongs to this slot */
+        w[log_2][0] = ffts_dd_mul_dd(hs[log_2],
+            ffts_dd_add_dd_unnormalized(w[log_2 + 1][0], w[offset][0]));
+        w[log_2][1] = ffts_dd_mul_dd(hs[log_2],
+            ffts_dd_add_dd_unnormalized(w[log_2 + 1][1], w[offset][1]));
+    }
+
+mid_point:
+    /* here i == table_size / 2 */
+    table[i][0] =  0.707106781186547524;
+    table[i][1] = -0.707106781186547524;
+
+exit:
+    return 0;
+}
+#endif
+
+/*
+ * Generate the single precision A and B coefficient tables of a plan.
+ *
+ * p      - plan; p->N is the length, p->A and p->B each receive N floats
+ *          written as consecutive pairs
+ * sign   - a negative value selects the branch that scales by 0.5
+ * invert - non-zero flips the sign of the even-indexed elements of B when
+ *          sign is negative, and of the odd-indexed elements of A otherwise
+ *
+ * Returns 0 on success, -1 if p is NULL.
+ *
+ * NOTE(review): N is assumed to be a power of two and at least 4; this is
+ * not checked here -- confirm that every caller guarantees it.
+ */
+int
+ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
+                                int sign,
+                                int invert)
+{
+    const ffts_cpx_64f *FFTS_RESTRICT ct;
+    const double *FFTS_RESTRICT hs;
+    ffts_cpx_64f FFTS_ALIGN(16) w[32];
+    int i, log_2, offset, N;
+    float *A, *B;
+
+    if (!p) {
+        return -1;
+    }
+
+    A = (float*) FFTS_ASSUME_ALIGNED_32(p->A);
+    B = (float*) FFTS_ASSUME_ALIGNED_32(p->B);
+    N = (int) p->N;
+
+    /* the first */
+    if (sign < 0) {
+        A[0] =  0.5f;
+        A[1] = -0.5f;
+        B[0] =  invert ? -0.5f : 0.5f;
+        B[1] =  0.5f;
+    } else {
+        /* peel off the first */
+        A[0] = 1.0f;
+        A[1] = invert ? 1.0f : -1.0f;
+        B[0] = 1.0f;
+        B[1] = 1.0f;
+    }
+
+    if (FFTS_UNLIKELY(N == 4)) {
+        /* nothing to generate, only the last pair at index 1 remains */
+        i = 1;
+        goto last;
+    }
+
+    /* calculate table offset */
+    FFTS_ASSUME(N / 4 > 1);
+    log_2 = ffts_ctzl(N);
+    FFTS_ASSUME(log_2 > 2);
+    offset = 34 - log_2;
+    ct = (const ffts_cpx_64f*)
+        FFTS_ASSUME_ALIGNED_32(&cos_sin_pi_table[8 * offset]);
+    hs = (const double*) &half_secant[4 * offset];
+
+    /* initialize from lookup table; with offset = 34 - log_2 the table only
+     * has rows for w[0] .. w[log_2 - 2] (the last of them being the final
+     * row of cos_sin_pi_table), reading further would run past the table */
+    for (i = 0; i <= log_2 - 2; i++) {
+        w[i][0] = ct[2*i][0];
+        w[i][1] = ct[2*i][1];
+    }
+
+    /* w[log_2 - 1] is still read by the recurrence below (when i == N/8),
+     * but only to prepare a value for an index >= N/4 that is never stored;
+     * give the remaining slots a defined value */
+    for (; i <= log_2; i++) {
+        w[i][0] = 0.0;
+        w[i][1] = 0.0;
+    }
+
+    /* generate sine and cosine tables with maximum error less than 0.5 ULP */
+    if (sign < 0) {
+        for (i = 1; i < N/4; i++) {
+            float t0, t1, t2;
+
+            /* calculate trailing zeros in index */
+            log_2 = ffts_ctzl(i);
+
+            t0 = (float) (0.5 * (1.0 - w[log_2][1]));
+            t1 = (float) (0.5 * w[log_2][0]);
+            t2 = (float) (0.5 * (1.0 + w[log_2][1]));
+
+            /* pair i and its mirror pair counted from the end of the table */
+            A[    2 * i + 0] =  t0;
+            A[N - 2 * i + 0] =  t0;
+            A[    2 * i + 1] = -t1;
+            A[N - 2 * i + 1] =  t1;
+
+            B[    2 * i + 0] =  invert ? -t2 : t2;
+            B[N - 2 * i + 0] =  invert ? -t2 : t2;
+            B[    2 * i + 1] =  t1;
+            B[N - 2 * i + 1] = -t1;
+
+            /* skip and find next trailing zero */
+            offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+            w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
+            w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+        }
+    } else {
+        for (i = 1; i < N/4; i++) {
+            float t0, t1, t2;
+
+            /* calculate trailing zeros in index */
+            log_2 = ffts_ctzl(i);
+
+            t0 = (float) (1.0 - w[log_2][1]);
+            t1 = (float) w[log_2][0];
+            t2 = (float) (1.0 + w[log_2][1]);
+
+            /* pair i and its mirror pair counted from the end of the table */
+            A[    2 * i + 0] = t0;
+            A[N - 2 * i + 0] = t0;
+            A[    2 * i + 1] = invert ?  t1 : -t1;
+            A[N - 2 * i + 1] = invert ? -t1 :  t1;
+
+            B[    2 * i + 0] =  t2;
+            B[N - 2 * i + 0] =  t2;
+            B[    2 * i + 1] =  t1;
+            B[N - 2 * i + 1] = -t1;
+
+            /* skip and find next trailing zero */
+            offset = (log_2 + 2 + ffts_ctzl(~i >> (log_2 + 2)));
+            w[log_2][0] = hs[2 * log_2] * (w[log_2 + 1][0] + w[offset][0]);
+            w[log_2][1] = hs[2 * log_2] * (w[log_2 + 1][1] + w[offset][1]);
+        }
+    }
+
+last:
+    /* here i == N / 4, so this writes the pair in the middle of the tables */
+    if (sign < 0) {
+        A[2 * i + 0] = 0.0f;
+        A[2 * i + 1] = 0.0f;
+        B[2 * i + 0] = invert ? -1.0f : 1.0f;
+        B[2 * i + 1] = 0.0f;
+    } else {
+        A[2 * i + 0] = 0.0f;
+        A[2 * i + 1] = 0.0f;
+        B[2 * i + 0] = 2.0f;
+        B[2 * i + 1] = 0.0f;
+    }
+
+    return 0;
+}
--- a/lib/ffts/src/ffts_trig.h
+++ b/lib/ffts/src/ffts_trig.h
@ -0,0 +1,56 @@
+/*
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+#ifndef FFTS_TRIG_H
+#define FFTS_TRIG_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include "ffts_internal.h"
+
+/* Fill 'table' with 'table_size' single precision (cosine, -sine) pairs;
+ * the size does not have to be a power of two. Returns 0 on success and -1
+ * on invalid arguments such as a NULL table or a zero size. */
+int
+ffts_generate_cosine_sine_32f(ffts_cpx_32f *const table, int table_size);
+
+/* Same as above, but 'table_size' must be a power of two; returns -1
+ * otherwise, and also for a NULL table or a zero size. */
+int
+ffts_generate_cosine_sine_pow2_32f(ffts_cpx_32f *const table, int table_size);
+
+/* Double precision counterpart of ffts_generate_cosine_sine_pow2_32f with
+ * the same argument checks and return values. */
+int
+ffts_generate_cosine_sine_pow2_64f(ffts_cpx_64f *const table, int table_size);
+
+/* Fill the A and B float tables of plan 'p' (length taken from p->N)
+ * according to 'sign' and 'invert'. Returns 0 on success, -1 if 'p' is NULL. */
+int
+ffts_generate_table_1d_real_32f(struct _ffts_plan_t *const p,
+                                int sign,
+                                int invert);
+
+#endif /* FFTS_TRIG_H */
--- a/lib/ffts/src/macros-alpha.h
+++ b/lib/ffts/src/macros-alpha.h
@ -1,206 +1,264 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> 
- Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-*/
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

-#ifndef __MACROS_ALPHA_H__
-#define __MACROS_ALPHA_H__
+*/

-#include <math.h>
+#ifndef FFTS_MACROS_ALPHA_H
+#define FFTS_MACROS_ALPHA_H

-#ifdef __alpha__
-#define restrict
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
 #endif

-typedef struct {float r1, i1, r2, i2;} V;
+#include "ffts_attributes.h"

-#define FFTS_MALLOC(d,a) malloc(d)
-#define FFTS_FREE(d) free(d)
+#ifdef HAVE_STRING_H
+#include <string.h>
+#endif

-#define VLIT4(f3,f2,f1,f0) ((V){f0,f1,f2,f3})
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif

-static inline V VADD(V x, V y)
+typedef union {
+    struct {
+        float r1;
+        float i1;
+        float r2;
+        float i2;
+    } r;
+    uint32_t u[4];
+} V4SF;
+
+#define FFTS_MALLOC(d,a) (malloc(d))
+#define FFTS_FREE(d) (free(d))
+
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_LIT4(float f3, float f2, float f1, float f0)
 {
-    V z;
-    z.r1 = x.r1 + y.r1;
-    z.i1 = x.i1 + y.i1;
-    z.r2 = x.r2 + y.r2;
-    z.i2 = x.i2 + y.i2;
+    V4SF z;
+
+    z.r.r1 = f0;
+    z.r.i1 = f1;
+    z.r.r2 = f2;
+    z.r.i2 = f3;
+
    return z;
 }

-
-static inline V VSUB(V x, V y)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_ADD(V4SF x, V4SF y)
 {
-    V z;
-    z.r1 = x.r1 - y.r1;
-    z.i1 = x.i1 - y.i1;
-    z.r2 = x.r2 - y.r2;
-    z.i2 = x.i2 - y.i2;
+    V4SF z;
+
+    z.r.r1 = x.r.r1 + y.r.r1;
+    z.r.i1 = x.r.i1 + y.r.i1;
+    z.r.r2 = x.r.r2 + y.r.r2;
+    z.r.i2 = x.r.i2 + y.r.i2;
+
    return z;
 }

-
-static inline V VMUL(V x, V y)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_SUB(V4SF x, V4SF y)
 {
-    V z;
-    z.r1 = x.r1 * y.r1;
-    z.i1 = x.i1 * y.i1;
-    z.r2 = x.r2 * y.r2;
-    z.i2 = x.i2 * y.i2;
+    V4SF z;
+
+    z.r.r1 = x.r.r1 - y.r.r1;
+    z.r.i1 = x.r.i1 - y.r.i1;
+    z.r.r2 = x.r.r2 - y.r.r2;
+    z.r.i2 = x.r.i2 - y.r.i2;
+
    return z;
 }

-static inline V VXOR(V x, V y)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_MUL(V4SF x, V4SF y)
 {
-    V r;
-    r.r1 = (uint32_t)x.r1 ^ (uint32_t)y.r1;
-    r.i1 = (uint32_t)x.i1 ^ (uint32_t)y.i1;
-    r.r2 = (uint32_t)x.r2 ^ (uint32_t)y.r2;
-    r.i2 = (uint32_t)x.i2 ^ (uint32_t)y.i2;
-    return r;
+    V4SF z;
+
+    z.r.r1 = x.r.r1 * y.r.r1;
+    z.r.i1 = x.r.i1 * y.r.i1;
+    z.r.r2 = x.r.r2 * y.r.r2;
+    z.r.i2 = x.r.i2 * y.r.i2;
+
+    return z;
 }

-static inline V VSWAPPAIRS(V x)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_XOR(V4SF x, V4SF y)
 {
-    V z;
-    z.r1 = x.i1;
-    z.i1 = x.r1;
-    z.r2 = x.i2;
-    z.i2 = x.r2;
+    V4SF z;
+
+    z.u[0] = x.u[0] ^ y.u[0];
+    z.u[1] = x.u[1] ^ y.u[1];
+    z.u[2] = x.u[2] ^ y.u[2];
+    z.u[3] = x.u[3] ^ y.u[3];
+
    return z;
 }

-
-static inline V VBLEND(V x, V y)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_SWAP_PAIRS(V4SF x)
 {
-    V z;
-    z.r1 = x.r1;
-    z.i1 = x.i1;
-    z.r2 = y.r2;
-    z.i2 = y.i2;
+    V4SF z;
+
+    z.r.r1 = x.r.i1;
+    z.r.i1 = x.r.r1;
+    z.r.r2 = x.r.i2;
+    z.r.i2 = x.r.r2;
+
    return z;
 }

-static inline V VUNPACKHI(V x, V y)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_BLEND(V4SF x, V4SF y)
 {
-    V z;
-    z.r1 = x.r2;
-    z.i1 = x.i2;
-    z.r2 = y.r2;
-    z.i2 = y.i2;
+    V4SF z;
+
+    z.r.r1 = x.r.r1;
+    z.r.i1 = x.r.i1;
+    z.r.r2 = y.r.r2;
+    z.r.i2 = y.r.i2;
+
    return z;
 }

-static inline V VUNPACKLO(V x, V y)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_UNPACK_HI(V4SF x, V4SF y)
 {
-    V z;
-    z.r1 = x.r1;
-    z.i1 = x.i1;
-    z.r2 = y.r1;
-    z.i2 = y.i1;
+    V4SF z;
+
+    z.r.r1 = x.r.r2;
+    z.r.i1 = x.r.i2;
+    z.r.r2 = y.r.r2;
+    z.r.i2 = y.r.i2;
+
    return z;
 }

-static inline V VDUPRE(V x)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_UNPACK_LO(V4SF x, V4SF y)
 {
-    V z;
-    z.r1 = x.r1;
-    z.i1 = x.r1;
-    z.r2 = x.r2;
-    z.i2 = x.r2;
+    V4SF z;
+
+    z.r.r1 = x.r.r1;
+    z.r.i1 = x.r.i1;
+    z.r.r2 = y.r.r1;
+    z.r.i2 = y.r.i1;
+
    return z;
 }

-static inline V VDUPIM(V x)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_DUPLICATE_RE(V4SF x)
 {
-    V z;
-    z.r1 = x.i1;
-    z.i1 = x.i1;
-    z.r2 = x.i2;
-    z.i2 = x.i2;
+    V4SF z;
+
+    z.r.r1 = x.r.r1;
+    z.r.i1 = x.r.r1;
+    z.r.r2 = x.r.r2;
+    z.r.i2 = x.r.r2;
+
    return z;
 }

-static inline V IMUL(V d, V re, V im)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_DUPLICATE_IM(V4SF x)
 {
-    re = VMUL(re, d);
-    im = VMUL(im, VSWAPPAIRS(d));
-    return VSUB(re, im);  
+    V4SF z;
+
+    z.r.r1 = x.r.i1;
+    z.r.i1 = x.r.i1;
+    z.r.r2 = x.r.i2;
+    z.r.i2 = x.r.i2;
+
+    return z;
 }

+static FFTS_ALWAYS_INLINE V4SF
+V4SF_IMUL(V4SF d, V4SF re, V4SF im)
+{
+    re = V4SF_MUL(re, d);
+    im = V4SF_MUL(im, V4SF_SWAP_PAIRS(d));
+    return V4SF_SUB(re, im);
+}

-static inline V IMULJ(V d, V re, V im)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
 {
-    re = VMUL(re, d);
-    im = VMUL(im, VSWAPPAIRS(d));
-    return VADD(re, im);
+    re = V4SF_MUL(re, d);
+    im = V4SF_MUL(im, V4SF_SWAP_PAIRS(d));
+    return V4SF_ADD(re, im);
 }

-static inline V MULI(int inv, V x)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_MULI(int inv, V4SF x)
 {
-    V z;
+    V4SF z;

    if (inv) {
-	z.r1 = -x.r1;
-	z.i1 = x.i1;
-	z.r2 = -x.r2;
-	z.i2 = x.i2;
-    }else{
-	z.r1 = x.r1;
-	z.i1 = -x.i1;
-	z.r2 = x.r2;
-	z.i2 = -x.i2;
+        z.r.r1 = -x.r.r1;
+        z.r.i1 =  x.r.i1;
+        z.r.r2 = -x.r.r2;
+        z.r.i2 =  x.r.i2;
+    } else {
+        z.r.r1 =  x.r.r1;
+        z.r.i1 = -x.r.i1;
+        z.r.r2 =  x.r.r2;
+        z.r.i2 = -x.r.i2;
    }
+
    return z;
 }

-
-static inline V IMULI(int inv, V x)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_IMULI(int inv, V4SF x)
 {
-    return VSWAPPAIRS(MULI(inv, x));
+    return V4SF_SWAP_PAIRS(V4SF_MULI(inv, x));
 }

-
-static inline V VLD(const void *s)
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_LD(const void *s)
 {
-    V *d = (V *)s;
-    return *d;
+    V4SF z;
+    memcpy(&z, s, sizeof(z));
+    return z;
 }

-
-static inline void VST(void *d, V s)
+static FFTS_ALWAYS_INLINE void
+V4SF_ST(void *d, V4SF s)
 {
-    V *r = (V *)d;
+    V4SF *r = (V4SF*) d;
    *r = s;
 }

-#endif
+#endif /* FFTS_MACROS_ALPHA_H */
--- a/lib/ffts/src/macros-altivec.h
+++ b/lib/ffts/src/macros-altivec.h
@ -135,3 +135,4 @@ static inline void VST(void *d, V s)
    *r = s;
 }
 #endif
+// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
--- a/lib/ffts/src/macros-neon.h
+++ b/lib/ffts/src/macros-neon.h
@ -1,96 +1,119 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */
-#ifndef __MACROS_NEON_H__
-#define __MACROS_NEON_H__

-#include "neon.h"
+#ifndef FFTS_MACROS_NEON_H
+#define FFTS_MACROS_NEON_H
+
 #include <arm_neon.h>

-typedef float32x4_t V;
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif

-typedef float32x4x2_t VS;
+#define FFTS_MALLOC(d,a) (valloc(d))
+#define FFTS_FREE(d) (free(d))

-#define ADD vaddq_f32
-#define SUB vsubq_f32
-#define MUL vmulq_f32
-#define VADD vaddq_f32
-#define VSUB vsubq_f32
-#define VMUL vmulq_f32
-#define VXOR(x,y) (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y))))
-#define VST vst1q_f32
-#define VLD vld1q_f32 
-#define VST2 vst2q_f32
-#define VLD2 vld2q_f32 
+typedef float32x4_t   V4SF;
+typedef float32x4x2_t V4SF2;

-#define VSWAPPAIRS(x) (vrev64q_f32(x))
+#define V4SF_ADD vaddq_f32
+#define V4SF_SUB vsubq_f32
+#define V4SF_MUL vmulq_f32

-#define VUNPACKHI(a,b) (vcombine_f32(vget_high_f32(a), vget_high_f32(b)))
-#define VUNPACKLO(a,b) (vcombine_f32(vget_low_f32(a), vget_low_f32(b)))
+#define V4SF_XOR(x,y) \
+    (vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(x), vreinterpretq_u32_f32(y))))

-#define VBLEND(x,y) (vcombine_f32(vget_low_f32(x), vget_high_f32(y)))
+#define V4SF_ST vst1q_f32
+#define V4SF_LD vld1q_f32

-__INLINE V VLIT4(data_t f3, data_t f2, data_t f1, data_t f0) {
-    data_t __attribute__ ((aligned(16))) d[4] = {f0, f1, f2, f3};
-    return VLD(d);
-}
+#define V4SF_SWAP_PAIRS(x) \
+    (vrev64q_f32(x))

-#define VDUPRE(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0))
-#define VDUPIM(r) vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1))
+#define V4SF_UNPACK_HI(a,b) \
+    (vcombine_f32(vget_high_f32(a), vget_high_f32(b)))

-#define FFTS_MALLOC(d,a) (valloc(d))
-#define FFTS_FREE(d) (free(d))
+#define V4SF_UNPACK_LO(a,b) \
+    (vcombine_f32(vget_low_f32(a), vget_low_f32(b)))

-__INLINE void STORESPR(data_t * addr,  VS p) {
+#define V4SF_BLEND(x,y) \
+    (vcombine_f32(vget_low_f32(x), vget_high_f32(y)))

-	vst1q_f32(addr, p.val[0]);
-	vst1q_f32(addr + 4, p.val[1]);
-	
+/*
+ * Build a vector from four scalars.
+ *
+ * The arguments are listed highest element first: f0 is placed at d[0] and
+ * f3 at d[3] of a 16-byte aligned temporary, which is then loaded with
+ * V4SF_LD.
+ */
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_LIT4(float f3, float f2, float f1, float f0)
+{
+    float FFTS_ALIGN(16) d[4] = {f0, f1, f2, f3};
+    return V4SF_LD(d);
 }

-__INLINE V IMULI(int inv, V a) {
-	if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
-	else    return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
+#define V4SF_DUPLICATE_RE(r) \
+    vcombine_f32(vdup_lane_f32(vget_low_f32(r),0), vdup_lane_f32(vget_high_f32(r),0))
+
+#define V4SF_DUPLICATE_IM(r) \
+    vcombine_f32(vdup_lane_f32(vget_low_f32(r),1), vdup_lane_f32(vget_high_f32(r),1))
+
+/*
+ * XOR a with a mask that holds -0.0f in alternating elements (flipping the
+ * sign bit of exactly those elements), then swap the two floats of each
+ * pair.
+ *
+ * With inv set the mask covers elements 0 and 2, otherwise elements 1 and 3.
+ * NOTE(review): for interleaved (re, im) data this presumably amounts to a
+ * multiplication by -i (inv) or +i (otherwise) -- confirm against callers.
+ */
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_IMULI(int inv, V4SF a)
+{
+    if (inv) {
+        return V4SF_SWAP_PAIRS(V4SF_XOR(a, V4SF_LIT4(0.0f, -0.0f, 0.0f, -0.0f)));
+    } else {
+        return V4SF_SWAP_PAIRS(V4SF_XOR(a, V4SF_LIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
+    }
 }

-__INLINE V IMUL(V d, V re, V im) {
-  re = VMUL(re, d);                   
-  im = VMUL(im, VSWAPPAIRS(d));
-  return VSUB(re, im);  
+/*
+ * Element-wise re * d minus im * swap_pairs(d).
+ */
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_IMUL(V4SF d, V4SF re, V4SF im)
+{
+    const V4SF direct  = V4SF_MUL(re, d);
+    const V4SF crossed = V4SF_MUL(im, V4SF_SWAP_PAIRS(d));
+
+    return V4SF_SUB(direct, crossed);
 }

-__INLINE V IMULJ(V d, V re, V im) {
-  re = VMUL(re, d);                   
-  im = VMUL(im, VSWAPPAIRS(d));
-  return VADD(re, im);  
+/*
+ * Element-wise re * d plus im * swap_pairs(d).
+ */
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
+{
+    const V4SF direct  = V4SF_MUL(re, d);
+    const V4SF crossed = V4SF_MUL(im, V4SF_SWAP_PAIRS(d));
+
+    return V4SF_ADD(direct, crossed);
 }

-#endif
+#define V4SF2_ST vst2q_f32
+#define V4SF2_LD vld2q_f32
+
+/*
+ * Store both halves of the vector pair p back to back: p.val[0] goes to
+ * addr[0..3] and p.val[1] to addr[4..7]. Unlike V4SF2_ST (vst2q_f32) this
+ * uses two plain vst1q_f32 stores.
+ */
+static FFTS_ALWAYS_INLINE void
+V4SF2_STORE_SPR(float *addr, V4SF2 p)
+{
+    vst1q_f32(addr, p.val[0]);
+    vst1q_f32(addr + 4, p.val[1]);
+}
+
+#endif /* FFTS_MACROS_NEON_H */
--- a/lib/ffts/src/macros-sse.h
+++ b/lib/ffts/src/macros-sse.h
@ -1,84 +1,100 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */

-#ifndef __SSE_FLOAT_H__
-#define __SSE_FLOAT_H__
+#ifndef FFTS_MACROS_SSE_H
+#define FFTS_MACROS_SSE_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif

 #include <xmmintrin.h>

-//#define VL 4
+#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
+#define FFTS_FREE(d) (_mm_free(d))

-typedef __m128 V;
+typedef __m128 V4SF;

-#define VADD _mm_add_ps
-#define VSUB _mm_sub_ps
-#define VMUL _mm_mul_ps
-//#define VLIT4 _mm_set_ps
-#define VXOR _mm_xor_ps
-#define VST _mm_store_ps
-#define VLD _mm_load_ps
+#define V4SF_ADD  _mm_add_ps
+#define V4SF_SUB  _mm_sub_ps
+#define V4SF_MUL  _mm_mul_ps
+#define V4SF_LIT4 _mm_set_ps
+#define V4SF_XOR  _mm_xor_ps
+#define V4SF_ST   _mm_store_ps
+#define V4SF_LD   _mm_load_ps

-#define VSWAPPAIRS(x) (_mm_shuffle_ps(x,x,_MM_SHUFFLE(2,3,0,1)))
+#define V4SF_SWAP_PAIRS(x) \
+    (_mm_shuffle_ps(x, x, _MM_SHUFFLE(2,3,0,1)))

-#define VUNPACKHI(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,3,2)))
-#define VUNPACKLO(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(1,0,1,0)))
+#define V4SF_UNPACK_HI(x,y) \
+    (_mm_shuffle_ps(x, y, _MM_SHUFFLE(3,2,3,2)))

-#define VBLEND(x,y) (_mm_shuffle_ps(x,y,_MM_SHUFFLE(3,2,1,0)))
+#define V4SF_UNPACK_LO(x,y) \
+    (_mm_movelh_ps(x, y))

-#define VLIT4 _mm_set_ps
+#define V4SF_BLEND(x, y) \
+    (_mm_shuffle_ps(x, y, _MM_SHUFFLE(3,2,1,0)))

-#define VDUPRE(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(2,2,0,0)))
-#define VDUPIM(r) (_mm_shuffle_ps(r,r,_MM_SHUFFLE(3,3,1,1)))
+#define V4SF_DUPLICATE_RE(r) \
+    (_mm_shuffle_ps(r, r, _MM_SHUFFLE(2,2,0,0)))

-#define FFTS_MALLOC(d,a) (_mm_malloc(d,a))
-#define FFTS_FREE(d) (_mm_free(d))
+#define V4SF_DUPLICATE_IM(r) \
+    (_mm_shuffle_ps(r, r, _MM_SHUFFLE(3,3,1,1)))

-__INLINE V IMULI(int inv, V a) {
-	if(inv) return VSWAPPAIRS(VXOR(a, VLIT4(0.0f, -0.0f, 0.0f, -0.0f)));
-	else    return VSWAPPAIRS(VXOR(a, VLIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
+/*
+ * XOR a with a mask that holds -0.0f in alternating elements (flipping the
+ * sign bit of exactly those elements), then swap the two floats of each
+ * pair.
+ *
+ * V4SF_LIT4 is _mm_set_ps, whose arguments run from element 3 down to
+ * element 0, so with inv set the mask covers elements 0 and 2, otherwise
+ * elements 1 and 3.
+ * NOTE(review): for interleaved (re, im) data this presumably amounts to a
+ * multiplication by -i (inv) or +i (otherwise) -- confirm against callers.
+ */
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_IMULI(int inv, V4SF a)
+{
+    if (inv) {
+        return V4SF_SWAP_PAIRS(V4SF_XOR(a, V4SF_LIT4(0.0f, -0.0f, 0.0f, -0.0f)));
+    } else {
+        return V4SF_SWAP_PAIRS(V4SF_XOR(a, V4SF_LIT4(-0.0f, 0.0f, -0.0f, 0.0f)));
+    }
 }

-
-__INLINE V IMUL(V d, V re, V im) {
-  re = VMUL(re, d);                   
-  im = VMUL(im, VSWAPPAIRS(d));
-  return VSUB(re, im);  
+/*
+ * Element-wise re * d minus im * swap_pairs(d).
+ */
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_IMUL(V4SF d, V4SF re, V4SF im)
+{
+    const V4SF direct  = V4SF_MUL(re, d);
+    const V4SF crossed = V4SF_MUL(im, V4SF_SWAP_PAIRS(d));
+
+    return V4SF_SUB(direct, crossed);
 }

-__INLINE V IMULJ(V d, V re, V im) {
-  re = VMUL(re, d);                   
-  im = VMUL(im, VSWAPPAIRS(d));
-  return VADD(re, im);  
+/*
+ * Element-wise re * d plus im * swap_pairs(d).
+ */
+static FFTS_ALWAYS_INLINE V4SF
+V4SF_IMULJ(V4SF d, V4SF re, V4SF im)
+{
+    const V4SF direct  = V4SF_MUL(re, d);
+    const V4SF crossed = V4SF_MUL(im, V4SF_SWAP_PAIRS(d));
+
+    return V4SF_ADD(direct, crossed);
 }

-#endif
+#endif /* FFTS_MACROS_SSE_H */
--- a/lib/ffts/src/macros.h
+++ b/lib/ffts/src/macros.h
@ -1,161 +1,204 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz> 
- Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2013, Michael J. Cree <mcree@orcon.net.nz>
+Copyright (c) 2012, 2013, Anthony M. Blake <amb@anthonix.com>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */

-#ifndef __MACROS_H__
-#define __MACROS_H__
+#ifndef FFTS_MACROS_H
+#define FFTS_MACROS_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif

 #ifdef HAVE_NEON
 #include "macros-neon.h"
+#elif HAVE_SSE
+#include "macros-sse.h"
+#elif __powerpc__
+#include "macros-altivec.h"
 #else
-#ifdef __alpha__
 #include "macros-alpha.h"
-#else
-#ifdef __powerpc__
-#include "macros-altivec.h"
-#endif
 #endif

-#endif
+/*
+ * In-place exchange of vector halves: on return *a holds
+ * V4SF_UNPACK_LO(a, b) and *b holds V4SF_UNPACK_HI(a, b), both computed
+ * from the values the two vectors had on entry.
+ */
+static FFTS_INLINE void
+V4SF_TX2(V4SF *a, V4SF *b)
+{
+    const V4SF first  = *a;
+    const V4SF second = *b;
+
+    *a = V4SF_UNPACK_LO(first, second);
+    *b = V4SF_UNPACK_HI(first, second);
+}

+static FFTS_INLINE void
+V4SF_K_N(int inv,
+         V4SF re,
+         V4SF im,
+         V4SF *r0,
+         V4SF *r1,
+         V4SF *r2,
+         V4SF *r3)
+{
+    V4SF uk, uk2, zk_p, zk_n, zk, zk_d;

-#ifdef HAVE_VFP
-#include "macros-alpha.h"
-#endif
-#ifdef HAVE_SSE
-	#include "macros-sse.h"
-#endif
+    uk  = *r0;
+    uk2 = *r1;

-static inline void TX2(V *a, V *b)
-{
-    V TX2_t0 = VUNPACKLO(*a, *b);
-    V TX2_t1 = VUNPACKHI(*a, *b);
-    *a = TX2_t0; *b = TX2_t1; 
+    zk_p = V4SF_IMUL(*r2, re, im);
+    zk_n = V4SF_IMULJ(*r3, re, im);
+
+    zk   = V4SF_ADD(zk_p, zk_n);
+    zk_d = V4SF_IMULI(inv, V4SF_SUB(zk_p, zk_n));
+
+    *r2 = V4SF_SUB(uk, zk);
+    *r0 = V4SF_ADD(uk, zk);
+    *r3 = V4SF_ADD(uk2, zk_d);
+    *r1 = V4SF_SUB(uk2, zk_d);
 }

-static inline void K_N(int inv, V re, V im, V *r0, V *r1, V *r2, V *r3)
+static FFTS_INLINE void
+V4SF_L_2_4(int inv,
+           const float *FFTS_RESTRICT i0,
+           const float *FFTS_RESTRICT i1,
+           const float *FFTS_RESTRICT i2,
+           const float *FFTS_RESTRICT i3,
+           V4SF *r0,
+           V4SF *r1,
+           V4SF *r2,
+           V4SF *r3)
 {
-    V uk, uk2, zk_p, zk_n, zk, zk_d;
-    uk   = *r0; uk2  = *r1;
-    zk_p = IMUL(*r2, re, im);
-    zk_n = IMULJ(*r3, re, im);
-
-    zk   = VADD(zk_p, zk_n);
-    zk_d = IMULI(inv, VSUB(zk_p, zk_n));
-  
-    *r2 = VSUB(uk, zk);
-    *r0 = VADD(uk, zk);
-    *r3 = VADD(uk2, zk_d);
-    *r1 = VSUB(uk2, zk_d);
-}
+    V4SF t0, t1, t2, t3, t4, t5, t6, t7;

+    t0 = V4SF_LD(i0);
+    t1 = V4SF_LD(i1);
+    t2 = V4SF_LD(i2);
+    t3 = V4SF_LD(i3);

-static inline void S_4(V r0, V r1, V r2, V r3, 
-		       data_t * restrict o0, data_t * restrict o1,
-		       data_t * restrict o2, data_t * restrict o3)
-{
-    VST(o0, r0); VST(o1, r1); VST(o2, r2); VST(o3, r3);
-}
+    t4 = V4SF_ADD(t0, t1);
+    t5 = V4SF_SUB(t0, t1);
+    t6 = V4SF_ADD(t2, t3);
+    t7 = V4SF_SUB(t2, t3);

+    *r0 = V4SF_UNPACK_LO(t4, t5);
+    *r1 = V4SF_UNPACK_LO(t6, t7);

-static inline void L_2_4(int inv, 
-			 const data_t * restrict i0, const data_t * restrict i1,
-			 const data_t * restrict i2, const data_t * restrict i3,
-			 V *r0, V *r1, V *r2, V *r3)
-{
-    V t0, t1, t2, t3, t4, t5, t6, t7;
-
-    t0 = VLD(i0);    t1 = VLD(i1);    t2 = VLD(i2);    t3 = VLD(i3);    
-    t4 = VADD(t0, t1);
-    t5 = VSUB(t0, t1);
-    t6 = VADD(t2, t3);
-    t7 = VSUB(t2, t3);
-    *r0 = VUNPACKLO(t4, t5);
-    *r1 = VUNPACKLO(t6, t7);
-    t5 = IMULI(inv, t5);
-    t0 = VADD(t6, t4);
-    t2 = VSUB(t6, t4);
-    t1 = VSUB(t7, t5);
-    t3 = VADD(t7, t5);
-    *r3 = VUNPACKHI(t0, t1);
-    *r2 = VUNPACKHI(t2, t3);
-}
+    t5 = V4SF_IMULI(inv, t5);

+    t0 = V4SF_ADD(t6, t4);
+    t2 = V4SF_SUB(t6, t4);
+    t1 = V4SF_SUB(t7, t5);
+    t3 = V4SF_ADD(t7, t5);

-static inline void L_4_4(int inv,  
-			 const data_t * restrict i0, const data_t * restrict i1,
-			 const data_t * restrict i2, const data_t * restrict i3,
-			 V *r0, V *r1, V *r2, V *r3)
-{
-    V t0, t1, t2, t3, t4, t5, t6, t7;
- 
-    t0 = VLD(i0);    t1 = VLD(i1);    t2 = VLD(i2);    t3 = VLD(i3);   
-    t4 = VADD(t0, t1);
-    t5 = VSUB(t0, t1);
-    t6 = VADD(t2, t3);
-    t7 = IMULI(inv, VSUB(t2, t3));
-    t0 = VADD(t4, t6);
-    t2 = VSUB(t4, t6);
-    t1 = VSUB(t5, t7);
-    t3 = VADD(t5, t7);
-    TX2(&t0, &t1);
-    TX2(&t2, &t3);
-    *r0 = t0; *r2 = t1; *r1 = t2; *r3 = t3; 
+    *r3 = V4SF_UNPACK_HI(t0, t1);
+    *r2 = V4SF_UNPACK_HI(t2, t3);
 }

+static FFTS_INLINE void
+V4SF_L_4_4(int inv,
+           const float *FFTS_RESTRICT i0,
+           const float *FFTS_RESTRICT i1,
+           const float *FFTS_RESTRICT i2,
+           const float *FFTS_RESTRICT i3,
+           V4SF *r0,
+           V4SF *r1,
+           V4SF *r2,
+           V4SF *r3)
+{
+    V4SF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4SF_LD(i0);
+    t1 = V4SF_LD(i1);
+    t2 = V4SF_LD(i2);
+    t3 = V4SF_LD(i3);

+    t4 = V4SF_ADD(t0, t1);
+    t5 = V4SF_SUB(t0, t1);
+    t6 = V4SF_ADD(t2, t3);
+
+    t7 = V4SF_IMULI(inv, V4SF_SUB(t2, t3));
+
+    t0 = V4SF_ADD(t4, t6);
+    t2 = V4SF_SUB(t4, t6);
+    t1 = V4SF_SUB(t5, t7);
+    t3 = V4SF_ADD(t5, t7);
+
+    V4SF_TX2(&t0, &t1);
+    V4SF_TX2(&t2, &t3);
+
+    *r0 = t0;
+    *r2 = t1;
+    *r1 = t2;
+    *r3 = t3;
+}

-static inline void L_4_2(int inv,  
-			 const data_t * restrict i0, const data_t * restrict i1,
-			 const data_t * restrict i2, const data_t * restrict i3,
-			 V *r0, V *r1, V *r2, V *r3)
+static FFTS_INLINE void
+V4SF_L_4_2(int inv,
+           const float *FFTS_RESTRICT i0,
+           const float *FFTS_RESTRICT i1,
+           const float *FFTS_RESTRICT i2,
+           const float *FFTS_RESTRICT i3,
+           V4SF *r0,
+           V4SF *r1,
+           V4SF *r2,
+           V4SF *r3)
 {
-    V t0, t1, t2, t3, t4, t5, t6, t7;
-
-    t0 = VLD(i0);    t1 = VLD(i1);    t6 = VLD(i2);    t7 = VLD(i3);
-    t2 = VBLEND(t6, t7);
-    t3 = VBLEND(t7, t6);
-    t4 = VADD(t0, t1);
-    t5 = VSUB(t0, t1);
-    t6 = VADD(t2, t3);
-    t7 = VSUB(t2, t3);
-    *r2 = VUNPACKHI(t4, t5);
-    *r3 = VUNPACKHI(t6, t7); 
-    t7 = IMULI(inv, t7);
-    t0 = VADD(t4, t6);
-    t2 = VSUB(t4, t6);
-    t1 = VSUB(t5, t7);
-    t3 = VADD(t5, t7);
-    *r0 = VUNPACKLO(t0, t1);
-    *r1 = VUNPACKLO(t2, t3);
+    V4SF t0, t1, t2, t3, t4, t5, t6, t7;
+
+    t0 = V4SF_LD(i0);
+    t1 = V4SF_LD(i1);
+    t6 = V4SF_LD(i2);
+    t7 = V4SF_LD(i3);
+
+    t2 = V4SF_BLEND(t6, t7);
+    t3 = V4SF_BLEND(t7, t6);
+
+    t4 = V4SF_ADD(t0, t1);
+    t5 = V4SF_SUB(t0, t1);
+    t6 = V4SF_ADD(t2, t3);
+    t7 = V4SF_SUB(t2, t3);
+
+    *r2 = V4SF_UNPACK_HI(t4, t5);
+    *r3 = V4SF_UNPACK_HI(t6, t7);
+
+    t7 = V4SF_IMULI(inv, t7);
+
+    t0 = V4SF_ADD(t4, t6);
+    t2 = V4SF_SUB(t4, t6);
+    t1 = V4SF_SUB(t5, t7);
+    t3 = V4SF_ADD(t5, t7);
+
+    *r0 = V4SF_UNPACK_LO(t0, t1);
+    *r1 = V4SF_UNPACK_LO(t2, t3);
 }
-#endif
+
+/*
+ * Store the four vectors r0..r3 to the addresses o0..o3, in that order.
+ *
+ * The stores are grouped into a single do/while (0) statement so that all
+ * four stay together when the macro is the body of an unbraced if or loop.
+ * The trailing semicolon is kept so that call sites written without their
+ * own semicolon continue to compile.
+ */
+#define V4SF_S_4(r0, r1, r2, r3, o0, o1, o2, o3) \
+    do { \
+        V4SF_ST(o0, r0); \
+        V4SF_ST(o1, r1); \
+        V4SF_ST(o2, r2); \
+        V4SF_ST(o3, r3); \
+    } while (0);
+
+#endif /* FFTS_MACROS_H */
--- a/lib/ffts/src/neon.h
+++ b/lib/ffts/src/neon.h
@ -1,38 +1,38 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.

- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato 
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */

-#ifndef __NEON_H__
-#define __NEON_H__
+#ifndef FFTS_NEON_H
+#define FFTS_NEON_H

 #include "ffts.h"

@ -45,21 +45,19 @@ void neon_eo();
 void neon_oe();
 void neon_end();

-void neon_transpose(uint64_t *in, uint64_t *out, int w, int h); 
-void neon_transpose_to_buf(uint64_t *in, uint64_t *out, int w); 
-
-//typedef struct _ffts_plan_t ffts_plan_t;
+void neon_transpose4(uint64_t *in, uint64_t *out, int w, int h); 
+void neon_transpose8(uint64_t *in, uint64_t *out, int w, int h); 

-void neon_static_e_f(ffts_plan_t * , const void * , void * );
-void neon_static_o_f(ffts_plan_t * , const void * , void * );
-void neon_static_x4_f(float *, size_t, float *);
-void neon_static_x8_f(float *, size_t, float *);
-void neon_static_x8_t_f(float *, size_t, float *);
+void neon_static_e_f(ffts_plan_t*, const void*, void*);
+void neon_static_o_f(ffts_plan_t*, const void*, void*);
+void neon_static_x4_f(float*, const float*);
+void neon_static_x8_f(float*, size_t, const float*);
+void neon_static_x8_t_f(float*, size_t, const float*);

-void neon_static_e_i(ffts_plan_t * , const void * , void * );
-void neon_static_o_i(ffts_plan_t * , const void * , void * );
-void neon_static_x4_i(float *, size_t, float *);
-void neon_static_x8_i(float *, size_t, float *);
-void neon_static_x8_t_i(float *, size_t, float *);
+void neon_static_e_i(ffts_plan_t*, const void*, void*);
+void neon_static_o_i(ffts_plan_t*, const void*, void*);
+void neon_static_x4_i(float*, const float*);
+void neon_static_x8_i(float*, size_t, const float*);
+void neon_static_x8_t_i(float*, size_t, const float*);

-#endif
+#endif /* FFTS_NEON_H */
--- a/lib/ffts/src/neon.s
+++ b/lib/ffts/src/neon.s
--- a/lib/ffts/src/neon_float.h
+++ b/lib/ffts/src/neon_float.h
--- a/lib/ffts/src/neon_static.s
+++ b/lib/ffts/src/neon_static.s
--- a/lib/ffts/src/neon_static_f.s
+++ b/lib/ffts/src/neon_static_f.s
@ -1,956 +0,0 @@
-/*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-@ neon_static_e_f
-@ neon.h declares this as void neon_static_e_f(ffts_plan_t*, const void*, void*);
-@ under the ARM procedure call standard that puts the plan in r0, the input
-@ in r1 and the output in r2 on entry.
-@ Layout of the routine:
-@   1. _neon_ee_loop   - p->i0 passes
-@   2. one straight-line block using constants from plan offset 12
-@   3. _neon_oo_loop   - p->i1 passes (skipped when p->i1 == 0)
-@   4. _neon_ee_loop2  - p->i1 passes (skipped when p->i1 == 0)
-@ Every pass reads 16 bytes from each of eight input pointers and writes to two
-@ output positions taken from a table of 32-bit values walked through r12.
-	.align 4
-#ifdef __APPLE__
-	.globl	_neon_static_e_f
-_neon_static_e_f:
-#else
-	.globl	neon_static_e_f
-neon_static_e_f:
-#endif
-@ Save the callee-saved core registers and d8-d15, all of which are used below.
-  push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
-  vstmdb	sp!, {d8-d15}
-  ldr lr, [r0, #40]  @ this is p->N
-@ Build eight input pointers, each lr bytes after the previous one (lr is added
-@ unscaled). In address order: r3, r7, r5, r10, r4, r8, r6, r9.
-  add	r3, r1, #0	
-  add	r7, r1, lr 
-  add	r5, r7, lr 
-  add	r10, r5, lr
-  add	r4, r10, lr	
-  add	r8, r4, lr
-  add	r6, r8, lr
-  add	r9, r6, lr
-@ r12 = first word of the plan; it is dereferenced below as a table of 32-bit
-@ values (presumably output offsets - the field is not named in this file).
-  ldr	r12, [r0]
-@ From here on r1 = plan and r0 = output base.
-  add	r1, r0, #0
-  add	r0, r2, #0	
-  ldr	r2, [r1, #16]   @ this is p->ee_ws
-  ldr	r11, [r1, #28]  @ this is p->i0	
-	
-@ q8 (d16/d17) holds four constant floats from p->ee_ws for the whole loop.
-	vld1.32	{d16, d17}, [r2, :128]
-@ NOTE(review): this loop is entered without a zero check, unlike the two
-@ loops guarded by cmp/beq further down; p->i0 == 0 would wrap the counter.
-_neon_ee_loop:
-@ vld2.32 de-interleaves: the low D register of each pair receives elements
-@ 0 and 2, the high D register elements 1 and 3.
-	vld2.32 {q15}, [r10, :128]!
-	vld2.32 {q13}, [r8, :128]!
-	vld2.32 {q14}, [r7, :128]!
-	vld2.32 {q9},  [r4, :128]!
-	vld2.32 {q10}, [r3, :128]!
-	vld2.32 {q11}, [r6, :128]!
-	vld2.32 {q12}, [r5, :128]!
-	vsub.f32	q1, q14, q13
-	vld2.32 {q0}, [r9, :128]!
-	subs	r11, r11, #1
-	vsub.f32	q2, q0, q15
-	vadd.f32	q0, q0, q15
-	vmul.f32	d10, d2, d17
-	vmul.f32	d11, d3, d16
-	vmul.f32	d12, d3, d17
-	vmul.f32	d6, d4, d17  
-	vmul.f32	d7, d5, d16  
-	vmul.f32	d8, d4, d16  
-	vmul.f32	d9, d5, d17  
-	vmul.f32	d13, d2, d16
-	vsub.f32	d7, d7, d6
-	vadd.f32	d11, d11, d10
-	vsub.f32	q1, q12, q11
-	vsub.f32	q2, q10, q9
-	vadd.f32	d6, d9, d8
-	vadd.f32	q4, q14, q13
-	vadd.f32	q11, q12, q11
-	vadd.f32	q12, q10, q9
-	vsub.f32	d10, d13, d12
-	vsub.f32	q7, q4, q0
-	vsub.f32	q9, q12, q11
-	vsub.f32	q13, q5, q3
-	vsub.f32	d29, d5, d2  @
-	vadd.f32	q5, q5, q3
-	vadd.f32	q10, q4, q0
-	vadd.f32	q11, q12, q11
-	vadd.f32	d31, d5, d2  @
-	vadd.f32	d28, d4, d3  @
-	vsub.f32	d30, d4, d3  @
-	vsub.f32	d5, d19, d14  @
-	vsub.f32	d7, d31, d26  @
-	vadd.f32	q1, q14, q5
-	vadd.f32	q0, q11, q10
-	vadd.f32	d6, d30, d27  @
-	vadd.f32	d4, d18, d15  @
-	vadd.f32	d13, d19, d14  @
-	vsub.f32	d12, d18, d15  @
-	vadd.f32	d15, d31, d26  @
-@ Fetch the next two table entries; each output address is r0 + 4 * entry.
-	ldr r2, [r12], #4
-	vtrn.32	q1, q3
-	ldr lr, [r12], #4
-	vtrn.32	q0, q2
-	add r2, r0, r2, lsl #2
-	vsub.f32	q4, q11, q10
-	add lr, r0, lr, lsl #2
-	vsub.f32	q5, q14, q5
-	vsub.f32	d14, d30, d27 @
-@ 64 bytes are written at each of the two output positions (2 x 32-byte vst2).
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_ee_loop
-
-@ Straight-line block: one 16-byte load from each of the eight pointers.
-@ r11 = plan field at byte offset 12 (not named in this file); four floats
-@ are loaded from it into d20/d21 further down.
-	ldr	r11, [r1, #12]
-	vld2.32 {q9}, [r5, :128]! @tag2
-	vld2.32 {q13}, [r3, :128]! @tag0
-	vld2.32 {q12}, [r4, :128]! @tag1
-	vld2.32 {q0}, [r7, :128]! @tag4
-	vsub.f32	q11, q13, q12
-	vld2.32 {q8}, [r6, :128]! @tag3
-	vadd.f32	q12, q13, q12
-	vsub.f32	q10, q9, q8
-	vadd.f32	q8, q9, q8
-	vadd.f32	q9, q12, q8
-	vsub.f32	d9, d23, d20  @
-	vadd.f32	d11, d23, d20  @
-	vsub.f32	q8, q12, q8
-	vadd.f32	d8, d22, d21  @
-	vsub.f32	d10, d22, d21  @
-	ldr r2, [r12], #4
-	vld1.32	{d20, d21}, [r11, :128]
-	ldr lr, [r12], #4
-	vtrn.32	q9, q4
-	add r2, r0, r2, lsl #2
-	vtrn.32	q8, q5
-	add lr, r0, lr, lsl #2
-	vswp d9,d10
-@ The lr position receives two 32-byte stores; the r2 position receives one
-@ 64-byte vstmia at the end of this block.
-	vst1.32 {d8,d9,d10,d11}, [lr, :128]!
-	vld2.32 {q13}, [r10, :128]! @tag7
-	vld2.32 {q15}, [r9, :128]! @tag6
-	vld2.32 {q11}, [r8, :128]! @tag5
-	vsub.f32	q14, q15, q13
-	vsub.f32	q12, q0, q11
-	vadd.f32	q11, q0, q11
-	vadd.f32	q13, q15, q13
-	vsub.f32	d13, d29, d24  @
-	vadd.f32	q15, q13, q11
-	vadd.f32	d12, d28, d25  @
-	vadd.f32	d15, d29, d24  @
-	vsub.f32	d14, d28, d25  @
-	vtrn.32	q15, q6
-	vsub.f32	q15, q13, q11
-	vtrn.32	q15, q7
-	vswp d13, d14
-	vst1.32 {d12,d13,d14,d15}, [lr, :128]!
-	vtrn.32	q13, q14
-	vtrn.32	q11, q12
-	vmul.f32	d24, d26, d21
-	vmul.f32	d28, d27, d20
-	vmul.f32	d25, d26, d20
-	vmul.f32	d26, d27, d21
-	vmul.f32	d27, d22, d21
-	vmul.f32	d30, d23, d20
-	vmul.f32	d29, d23, d21
-	vmul.f32	d22, d22, d20
-	vsub.f32	d21, d28, d24
-	vadd.f32	d20, d26, d25
-	vadd.f32	d25, d30, d27
-	vsub.f32	d24, d22, d29
-	vadd.f32	q11, q12, q10
-	vsub.f32	q10, q12, q10
-	vadd.f32	q0, q9, q11
-	vsub.f32	q2, q9, q11
-	vsub.f32	d3, d17, d20  @
-	vadd.f32	d7, d17, d20  @
-	vadd.f32	d2, d16, d21  @
-	vsub.f32	d6, d16, d21  @
-	vswp d1, d2
-	vswp d5, d6
-	vstmia r2!, {q0-q3}
-
-@ Exchange r7 <-> r9 and r8 <-> r10, using r2 as scratch.
-  add	r2, r7, #0	
-  add	r7, r9, #0	
-  add	r9, r2, #0	
-  add	r2, r8, #0	
-  add	r8, r10, #0	
-  add	r10, r2, #0	
-  ldr	r11, [r1, #32]  @ this is p->i1	
-	cmp r11, #0
-	beq _neon_oo_loop_exit
-@ "oo" loop: add/sub and transpose only; no vmul appears in this loop body.
-_neon_oo_loop:
-	vld2.32 {q8}, [r6, :128]!
-	vld2.32 {q9}, [r5, :128]!
-	vld2.32 {q10}, [r4, :128]!
-	vld2.32 {q13}, [r3, :128]!
-	vadd.f32	q11, q9, q8
-	vsub.f32	q8, q9, q8
-	vsub.f32	q9, q13, q10
-	vadd.f32	q12, q13, q10
-	subs	r11, r11, #1
-	vld2.32 {q10}, [r7, :128]!
-	vld2.32 {q13}, [r9, :128]!
-	vsub.f32	q2, q12, q11
-	vadd.f32	d7, d19, d16  @
-	vsub.f32	d3, d19, d16  @
-	vsub.f32	d6, d18, d17  @
-	vadd.f32	d2, d18, d17  @
-	vld2.32 {q9}, [r8, :128]!
-	vld2.32 {q8}, [r10, :128]!
-	vadd.f32	q0, q12, q11
-	vadd.f32	q11, q13, q8
-	vadd.f32	q12, q10, q9
-	vsub.f32	q8, q13, q8
-	vsub.f32	q9, q10, q9
-	vsub.f32	q6, q12, q11
-	vadd.f32	q4, q12, q11
-	vtrn.32	q0, q2
-	ldr r2, [r12], #4
-	vadd.f32	d15, d19, d16  @
-	ldr lr, [r12], #4
-	vsub.f32	d11, d19, d16  @
-	vsub.f32	d14, d18, d17  @
-	vadd.f32	d10, d18, d17  @
-	add r2, r0, r2, lsl #2
-	vtrn.32	q1, q3
-	add lr, r0, lr, lsl #2
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_oo_loop
-_neon_oo_loop_exit:
-
-
-@ Exchange r3 <-> r7, r4 <-> r8, r5 <-> r9, r6 <-> r10 (r2 as scratch),
-@ then exchange r9 <-> r10.
-  add	r2, r3, #0
-  add	r3, r7, #0
-  add	r7, r2, #0
-  add	r2, r4, #0
-  add	r4, r8, #0
-  add	r8, r2, #0
-  add	r2, r5, #0
-  add	r5, r9, #0
-  add	r9, r2, #0
-  add	r2, r6, #0
-  add	r6, r10, #0
-  add	r10, r2, #0
-  add	r2, r9, #0	
-  add	r9, r10, #0	
-  add	r10, r2, #0	
-@ r2 = plan offset 16 again (commented as p->ee_ws at the top of the routine).
-  ldr	r2, [r1, #16]
-  ldr	r11, [r1, #32]  @ this is p->i1	
-	cmp r11, #0
-	beq _neon_ee_loop2_exit
-
-@ Reload the constants into q8 (overwritten since the first loop) and run the
-@ same instruction sequence as _neon_ee_loop for p->i1 passes.
-	vld1.32	{d16, d17}, [r2, :128]
-_neon_ee_loop2:
-	vld2.32 {q15}, [r10, :128]!
-	vld2.32 {q13}, [r8, :128]!
-	vld2.32 {q14}, [r7, :128]!
-	vld2.32 {q9},  [r4, :128]!
-	vld2.32 {q10}, [r3, :128]!
-	vld2.32 {q11}, [r6, :128]!
-	vld2.32 {q12}, [r5, :128]!
-	vsub.f32	q1, q14, q13
-	vld2.32 {q0}, [r9, :128]!
-	subs	r11, r11, #1
-	vsub.f32	q2, q0, q15
-	vadd.f32	q0, q0, q15
-	vmul.f32	d10, d2, d17
-	vmul.f32	d11, d3, d16
-	vmul.f32	d12, d3, d17
-	vmul.f32	d6, d4, d17  
-	vmul.f32	d7, d5, d16  
-	vmul.f32	d8, d4, d16  
-	vmul.f32	d9, d5, d17  
-	vmul.f32	d13, d2, d16
-	vsub.f32	d7, d7, d6
-	vadd.f32	d11, d11, d10
-	vsub.f32	q1, q12, q11
-	vsub.f32	q2, q10, q9
-	vadd.f32	d6, d9, d8
-	vadd.f32	q4, q14, q13
-	vadd.f32	q11, q12, q11
-	vadd.f32	q12, q10, q9
-	vsub.f32	d10, d13, d12
-	vsub.f32	q7, q4, q0
-	vsub.f32	q9, q12, q11
-	vsub.f32	q13, q5, q3
-	vsub.f32	d29, d5, d2  @
-	vadd.f32	q5, q5, q3
-	vadd.f32	q10, q4, q0
-	vadd.f32	q11, q12, q11
-	vadd.f32	d31, d5, d2  @
-	vadd.f32	d28, d4, d3  @
-	vsub.f32	d30, d4, d3  @
-	vsub.f32	d5, d19, d14  @
-	vsub.f32	d7, d31, d26  @
-	vadd.f32	q1, q14, q5
-	vadd.f32	q0, q11, q10
-	vadd.f32	d6, d30, d27  @
-	vadd.f32	d4, d18, d15  @
-	vadd.f32	d13, d19, d14  @
-	vsub.f32	d12, d18, d15  @
-	vadd.f32	d15, d31, d26  @
-	ldr r2, [r12], #4
-	vtrn.32	q1, q3
-	ldr lr, [r12], #4
-	vtrn.32	q0, q2
-	add r2, r0, r2, lsl #2
-	vsub.f32	q4, q11, q10
-	add lr, r0, lr, lsl #2
-	vsub.f32	q5, q14, q5
-	vsub.f32	d14, d30, d27 @
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_ee_loop2
-_neon_ee_loop2_exit:
-
-@ Restore d8-d15 and the core registers; popping into pc returns to the caller.
-	vldmia	sp!, {d8-d15}
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-
-
-
-@ neon_static_o_f
-@ neon.h declares this as void neon_static_o_f(ffts_plan_t*, const void*, void*);
-@ under the ARM procedure call standard that puts the plan in r0, the input
-@ in r1 and the output in r2 on entry.
-@ Layout of the routine:
-@   1. _neon_ee_o_loop   - p->i0 passes
-@   2. _neon_oo_o_loop   - p->i1 passes (skipped when p->i1 == 0)
-@   3. one straight-line block using constants from plan offset 8
-@   4. _neon_ee_o_loop2  - p->i1 passes (skipped when p->i1 == 0)
-	.align 4
-#ifdef __APPLE__
-	.globl	_neon_static_o_f
-_neon_static_o_f:
-#else
-	.globl	neon_static_o_f
-neon_static_o_f:
-#endif
-@ Save the callee-saved core registers and d8-d15, all of which are used below.
-  push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
-  vstmdb	sp!, {d8-d15}
-  ldr lr, [r0, #40]  @ this is p->N
-@ Build eight input pointers, each lr bytes after the previous one (lr is added
-@ unscaled). In address order: r3, r7, r5, r10, r4, r8, r6, r9.
-  add	r3, r1, #0	
-  add	r7, r1, lr 
-  add	r5, r7, lr 
-  add	r10, r5, lr
-  add	r4, r10, lr	
-  add	r8, r4, lr
-  add	r6, r8, lr
-  add	r9, r6, lr
-@ r12 = first word of the plan; it is dereferenced below as a table of 32-bit
-@ values (presumably output offsets - the field is not named in this file).
-  ldr	r12, [r0]
-@ From here on r1 = plan and r0 = output base.
-  add	r1, r0, #0
-  add	r0, r2, #0	
-  ldr	r2, [r1, #16]   @ this is p->ee_ws
-  ldr	r11, [r1, #28]  @ this is p->i0	
-	
-@ q8 (d16/d17) holds four constant floats from p->ee_ws for the whole loop.
-	vld1.32	{d16, d17}, [r2, :128]
-@ NOTE(review): this loop is entered without a zero check, unlike the two
-@ loops guarded by cmp/beq further down; p->i0 == 0 would wrap the counter.
-_neon_ee_o_loop:
-	vld2.32 {q15}, [r10, :128]!
-	vld2.32 {q13}, [r8, :128]!
-	vld2.32 {q14}, [r7, :128]!
-	vld2.32 {q9},  [r4, :128]!
-	vld2.32 {q10}, [r3, :128]!
-	vld2.32 {q11}, [r6, :128]!
-	vld2.32 {q12}, [r5, :128]!
-	vsub.f32	q1, q14, q13
-	vld2.32 {q0}, [r9, :128]!
-	subs	r11, r11, #1
-	vsub.f32	q2, q0, q15
-	vadd.f32	q0, q0, q15
-	vmul.f32	d10, d2, d17
-	vmul.f32	d11, d3, d16
-	vmul.f32	d12, d3, d17
-	vmul.f32	d6, d4, d17  
-	vmul.f32	d7, d5, d16  
-	vmul.f32	d8, d4, d16  
-	vmul.f32	d9, d5, d17  
-	vmul.f32	d13, d2, d16
-	vsub.f32	d7, d7, d6
-	vadd.f32	d11, d11, d10
-	vsub.f32	q1, q12, q11
-	vsub.f32	q2, q10, q9
-	vadd.f32	d6, d9, d8
-	vadd.f32	q4, q14, q13
-	vadd.f32	q11, q12, q11
-	vadd.f32	q12, q10, q9
-	vsub.f32	d10, d13, d12
-	vsub.f32	q7, q4, q0
-	vsub.f32	q9, q12, q11
-	vsub.f32	q13, q5, q3
-	vsub.f32	d29, d5, d2  @
-	vadd.f32	q5, q5, q3
-	vadd.f32	q10, q4, q0
-	vadd.f32	q11, q12, q11
-	vadd.f32	d31, d5, d2  @
-	vadd.f32	d28, d4, d3  @
-	vsub.f32	d30, d4, d3  @
-	vsub.f32	d5, d19, d14  @
-	vsub.f32	d7, d31, d26  @
-	vadd.f32	q1, q14, q5
-	vadd.f32	q0, q11, q10
-	vadd.f32	d6, d30, d27  @
-	vadd.f32	d4, d18, d15  @
-	vadd.f32	d13, d19, d14  @
-	vsub.f32	d12, d18, d15  @
-	vadd.f32	d15, d31, d26  @
-@ Fetch the next two table entries; each output address is r0 + 4 * entry.
-	ldr r2, [r12], #4
-	vtrn.32	q1, q3
-	ldr lr, [r12], #4
-	vtrn.32	q0, q2
-	add r2, r0, r2, lsl #2
-	vsub.f32	q4, q11, q10
-	add lr, r0, lr, lsl #2
-	vsub.f32	q5, q14, q5
-	vsub.f32	d14, d30, d27 @
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_ee_o_loop
-
-@ Exchange r7 <-> r9 and r8 <-> r10, using r2 as scratch.
-  add	r2, r7, #0	
-  add	r7, r9, #0	
-  add	r9, r2, #0	
-  add	r2, r8, #0	
-  add	r8, r10, #0	
-  add	r10, r2, #0	
-  ldr	r11, [r1, #32]  @ this is p->i1	
-	cmp r11, #0
-	beq _neon_oo_o_loop_exit
-@ "oo" loop: add/sub and transpose only; no vmul appears in this loop body.
-_neon_oo_o_loop:
-	vld2.32 {q8}, [r6, :128]!
-	vld2.32 {q9}, [r5, :128]!
-	vld2.32 {q10}, [r4, :128]!
-	vld2.32 {q13}, [r3, :128]!
-	vadd.f32	q11, q9, q8
-	vsub.f32	q8, q9, q8
-	vsub.f32	q9, q13, q10
-	vadd.f32	q12, q13, q10
-	subs	r11, r11, #1
-	vld2.32 {q10}, [r7, :128]!
-	vld2.32 {q13}, [r9, :128]!
-	vsub.f32	q2, q12, q11
-	vadd.f32	d7, d19, d16  @
-	vsub.f32	d3, d19, d16  @
-	vsub.f32	d6, d18, d17  @
-	vadd.f32	d2, d18, d17  @
-	vld2.32 {q9}, [r8, :128]!
-	vld2.32 {q8}, [r10, :128]!
-	vadd.f32	q0, q12, q11
-	vadd.f32	q11, q13, q8
-	vadd.f32	q12, q10, q9
-	vsub.f32	q8, q13, q8
-	vsub.f32	q9, q10, q9
-	vsub.f32	q6, q12, q11
-	vadd.f32	q4, q12, q11
-	vtrn.32	q0, q2
-	ldr r2, [r12], #4
-	vadd.f32	d15, d19, d16  @
-	ldr lr, [r12], #4
-	vsub.f32	d11, d19, d16  @
-	vsub.f32	d14, d18, d17  @
-	vadd.f32	d10, d18, d17  @
-	add r2, r0, r2, lsl #2
-	vtrn.32	q1, q3
-	add lr, r0, lr, lsl #2
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_oo_o_loop
-_neon_oo_o_loop_exit:
-
-@ Straight-line block: one 16-byte load from each of the eight pointers.
-@ r11 = plan field at byte offset 8 (not named in this file); four floats are
-@ loaded from it into d24/d25 further down.
-@ r5 and r6 are read with vld1 (no de-interleave) and rearranged by the
-@ vorr/vtrn sequence; the other six pointers are read with vld2.
-	ldr	r11, [r1, #8]
-	vld1.32 {q8}, [r5, :128]!
-	vld1.32 {q10}, [r6, :128]!
-	vld2.32 {q11}, [r4, :128]!
-	vld2.32 {q13}, [r3, :128]!
-	vld2.32 {q15}, [r10, :128]!
-	vorr	d25, d17, d17
-	vorr	d24, d20, d20
-	vorr	d20, d16, d16
-	vsub.f32	q9, q13, q11
-	vadd.f32	q11, q13, q11
-	ldr r2, [r12], #4
-	vtrn.32	d24, d25
-	ldr lr, [r12], #4
-	vtrn.32	d20, d21
-	add r2, r0, r2, lsl #2
-	vsub.f32	q8, q10, q12
-	add lr, r0, lr, lsl #2
-	vadd.f32	q10, q10, q12
-	vadd.f32	q0, q11, q10
-	vsub.f32	d25, d19, d16  @
-	vadd.f32	d27, d19, d16  @
-	vsub.f32	q1, q11, q10
-	vadd.f32	d24, d18, d17  @
-	vsub.f32	d26, d18, d17  @
-	vtrn.32	q0, q12
-	vtrn.32	q1, q13
-	vld1.32	{d24, d25}, [r11, :128]
-	vswp d1, d2
-@ The r2 position receives two 32-byte stores; the lr position receives one
-@ 64-byte vstmia at the end of this block.
-	vst1.32 {q0, q1}, [r2, :128]!
-	vld2.32 {q0}, [r9, :128]!
-	vadd.f32	q1, q0, q15
-	vld2.32 {q13}, [r8, :128]!
-	vld2.32 {q14}, [r7, :128]!
-	vsub.f32	q15, q0, q15
-	vsub.f32	q0, q14, q13
-	vadd.f32	q3, q14, q13
-	vadd.f32	q2, q3, q1
-	vsub.f32	d29, d1, d30  @
-	vadd.f32	d27, d1, d30  @
-	vsub.f32	q3, q3, q1
-	vadd.f32	d28, d0, d31  @
-	vsub.f32	d26, d0, d31  @
-	vtrn.32	q2, q14
-	vtrn.32	q3, q13
-	vswp d5, d6
-	vst1.32 {q2, q3}, [r2, :128]!
-	vtrn.32	q11, q9
-	vtrn.32	q10, q8
-	vmul.f32	d20, d18, d25
-	vmul.f32	d22, d19, d24
-	vmul.f32	d21, d19, d25
-	vmul.f32	d18, d18, d24
-	vmul.f32	d19, d16, d25
-	vmul.f32	d30, d17, d24
-	vmul.f32	d23, d16, d24
-	vmul.f32	d24, d17, d25
-	vadd.f32	d17, d22, d20
-	vsub.f32	d16, d18, d21
-	vsub.f32	d21, d30, d19
-	vadd.f32	d20, d24, d23
-	vadd.f32	q9, q8, q10
-	vsub.f32	q8, q8, q10
-	vadd.f32	q4, q14, q9
-	vsub.f32	q6, q14, q9
-	vsub.f32	d11, d27, d16  @
-	vadd.f32	d15, d27, d16  @
-	vadd.f32	d10, d26, d17  @
-	vsub.f32	d14, d26, d17  @
-	vswp d9, d10
-	vswp d13, d14
-	vstmia lr!, {q4-q7}
-
-
-@ Exchange r3 <-> r7, r4 <-> r8, r5 <-> r9, r6 <-> r10 (r2 as scratch),
-@ then exchange r9 <-> r10.
-  add	r2, r3, #0
-  add	r3, r7, #0
-  add	r7, r2, #0
-  add	r2, r4, #0
-  add	r4, r8, #0
-  add	r8, r2, #0
-  add	r2, r5, #0
-  add	r5, r9, #0
-  add	r9, r2, #0
-  add	r2, r6, #0
-  add	r6, r10, #0
-  add	r10, r2, #0
-  add	r2, r9, #0	
-  add	r9, r10, #0	
-  add	r10, r2, #0	
-@ r2 = plan offset 16 again (commented as p->ee_ws at the top of the routine).
-  ldr	r2, [r1, #16]
-  ldr	r11, [r1, #32]  @ this is p->i1	
-	cmp r11, #0
-	beq _neon_ee_o_loop2_exit
-
-@ Reload the constants into q8 (overwritten since the first loop) and run the
-@ same instruction sequence as _neon_ee_o_loop for p->i1 passes.
-	vld1.32	{d16, d17}, [r2, :128]
-_neon_ee_o_loop2:
-	vld2.32 {q15}, [r10, :128]!
-	vld2.32 {q13}, [r8, :128]!
-	vld2.32 {q14}, [r7, :128]!
-	vld2.32 {q9},  [r4, :128]!
-	vld2.32 {q10}, [r3, :128]!
-	vld2.32 {q11}, [r6, :128]!
-	vld2.32 {q12}, [r5, :128]!
-	vsub.f32	q1, q14, q13
-	vld2.32 {q0}, [r9, :128]!
-	subs	r11, r11, #1
-	vsub.f32	q2, q0, q15
-	vadd.f32	q0, q0, q15
-	vmul.f32	d10, d2, d17
-	vmul.f32	d11, d3, d16
-	vmul.f32	d12, d3, d17
-	vmul.f32	d6, d4, d17  
-	vmul.f32	d7, d5, d16  
-	vmul.f32	d8, d4, d16  
-	vmul.f32	d9, d5, d17  
-	vmul.f32	d13, d2, d16
-	vsub.f32	d7, d7, d6
-	vadd.f32	d11, d11, d10
-	vsub.f32	q1, q12, q11
-	vsub.f32	q2, q10, q9
-	vadd.f32	d6, d9, d8
-	vadd.f32	q4, q14, q13
-	vadd.f32	q11, q12, q11
-	vadd.f32	q12, q10, q9
-	vsub.f32	d10, d13, d12
-	vsub.f32	q7, q4, q0
-	vsub.f32	q9, q12, q11
-	vsub.f32	q13, q5, q3
-	vsub.f32	d29, d5, d2  @
-	vadd.f32	q5, q5, q3
-	vadd.f32	q10, q4, q0
-	vadd.f32	q11, q12, q11
-	vadd.f32	d31, d5, d2  @
-	vadd.f32	d28, d4, d3  @
-	vsub.f32	d30, d4, d3  @
-	vsub.f32	d5, d19, d14  @
-	vsub.f32	d7, d31, d26  @
-	vadd.f32	q1, q14, q5
-	vadd.f32	q0, q11, q10
-	vadd.f32	d6, d30, d27  @
-	vadd.f32	d4, d18, d15  @
-	vadd.f32	d13, d19, d14  @
-	vsub.f32	d12, d18, d15  @
-	vadd.f32	d15, d31, d26  @
-	ldr r2, [r12], #4
-	vtrn.32	q1, q3
-	ldr lr, [r12], #4
-	vtrn.32	q0, q2
-	add r2, r0, r2, lsl #2
-	vsub.f32	q4, q11, q10
-	add lr, r0, lr, lsl #2
-	vsub.f32	q5, q14, q5
-	vsub.f32	d14, d30, d27 @
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_ee_o_loop2
-_neon_ee_o_loop2_exit:
-
-@ Restore d8-d15 and the core registers; popping into pc returns to the caller.
-	vldmia	sp!, {d8-d15}
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-@ neon_static_x4_f
-@ This implementation reads three arguments (r0, r1, r2), matching the removed
-@ neon.h prototype void neon_static_x4_f(float *, size_t, float *); the
-@ replacement prototype in neon.h takes only (float*, const float*).
-@   r0 = data, updated in place
-@   r1 = size value used to derive the row spacing (rows are r1*2 bytes apart)
-@   r2 = 32 bytes of constants, loaded into q2/q3
-@ Processes a single 32-byte chunk from each of four rows; there is no loop.
-	.align	4
-#ifdef __APPLE__
-	.globl	_neon_static_x4_f
-_neon_static_x4_f:
-#else
-	.globl	neon_static_x4_f
-neon_static_x4_f:
-#endif
-@	add r3, r0, #0
-  push	{r4, r5, r6, lr}
-  vstmdb	sp!, {d8-d15}
-
-@ Row pointers: r0, r4 = r0 + 2*r1, r5 = r0 + 4*r1, r6 = r4 + 4*r1 (= r0 + 6*r1).
-	vld1.32 {q8,q9}, [r0, :128]
-	add r4, r0, r1, lsl #1
-	vld1.32 {q10,q11}, [r4, :128]
-	add r5, r0, r1, lsl #2
-	vld1.32 {q12,q13}, [r5, :128]
-	add r6, r4, r1, lsl #2
-	vld1.32 {q14,q15}, [r6, :128]
-	vld1.32 {q2,q3}, [r2, :128]
-	
-@ The eight products below are combined as
-@   q0  = q12*q2 - q13*q3      q13 = q13*q2 + q12*q3
-@   q12 = q15*q3 + q14*q2      q1  = q15*q2 - q14*q3
-@ which has the shape of a complex multiply of (q12,q13) by (q2,q3) and of
-@ (q14,q15) by the conjugate of (q2,q3) - presumably re/im pairs; confirm.
-	vmul.f32	q0, q13, q3
-	vmul.f32	q5, q12, q2
-	vmul.f32	q1, q14, q2
-	vmul.f32	q4, q14, q3
-	vmul.f32	q14, q12, q3
-	vmul.f32	q13, q13, q2
-	vmul.f32	q12, q15, q3
-	vmul.f32	q2, q15, q2
-	vsub.f32	q0, q5, q0
-	vadd.f32	q13, q13, q14
-	vadd.f32	q12, q12, q1
-	vsub.f32	q1, q2, q4
-@ Sums and differences of the two products, then combined with rows r0 and r4.
-	vadd.f32	q15, q0, q12
-	vsub.f32	q12, q0, q12
-	vadd.f32	q14, q13, q1
-	vsub.f32	q13, q13, q1
-	vadd.f32	q0, q8, q15
-	vadd.f32	q1, q9, q14
-	vadd.f32	q2, q10, q13  @
-	vsub.f32	q4, q8, q15
-	vsub.f32	q3, q11, q12  @
-@ Results are written back over the four rows that were loaded above.
-	vst1.32 {q0,q1}, [r0, :128]
-	vsub.f32	q5, q9, q14
-	vsub.f32	q6, q10, q13  @
-	vadd.f32	q7, q11, q12  @
-	vst1.32 {q2,q3}, [r4, :128]
-	vst1.32 {q4,q5}, [r5, :128]
-	vst1.32 {q6,q7}, [r6, :128]
-@ Restore d8-d15 and the core registers; popping into pc returns to the caller.
-	vldmia	sp!, {d8-d15}
-	pop	{r4, r5, r6, pc}
-
-
-
-@ neon_static_x8_f
-@ neon.h declares a (float*, size_t, float*) style signature for this symbol,
-@ so on entry r0 = data (updated in place), r1 = size value, r2 = constant table.
-@ Eight row pointers spaced r1 bytes apart are walked in step; each loop pass
-@ handles 32 bytes per row and consumes three 32-byte chunks of the table.
-@ The loop runs r1 >> 5 passes.
-	.align 4
-#ifdef __APPLE__
-	.globl	_neon_static_x8_f
-_neon_static_x8_f:
-#else
-	.globl	neon_static_x8_f
-neon_static_x8_f:
-#endif
-  push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
-  vstmdb	sp!, {d8-d15}
-	mov r11, #0
-	add r3, r0, #0           @ data0
-	add r5, r0, r1, lsl #1   @ data2
-	add r4, r0, r1           @ data1
-	add r7, r5, r1, lsl #1   @ data4
-	add r6, r5, r1           @ data3
-	add r9, r7, r1, lsl #1   @ data6
-	add r8, r7, r1           @ data5
-	add r10, r9, r1          @ data7
-	add r12, r2, #0          @ LUT
-
-@ r11 = -(r1 >> 5): a negative pass counter that adds/bne counts up to zero.
-@ NOTE(review): the loop body runs before the counter is tested, so r1 < 32
-@ would start at 0 and not terminate as intended - presumably never passed.
-	sub r11, r11, r1, lsr #5
-neon_x8_loop: 
-@ First table chunk: combined with rows data2 (r5) and data3 (r6).
-  vld1.32 {q2,q3}, [r12, :128]!
-  vld1.32 {q14,q15}, [r6, :128]
-  vld1.32 {q10,q11}, [r5, :128]
-  adds	r11, r11, #1
-  vmul.f32	q12, q15, q2
-  vmul.f32	q8, q14, q3
-  vmul.f32	q13, q14, q2
-  vmul.f32	q9, q10, q3
-  vmul.f32	q1, q10, q2
-  vmul.f32	q0, q11, q2
-  vmul.f32	q14, q11, q3
-  vmul.f32	q15, q15, q3
-@ Second table chunk: combined with rows data4 (r7) and data6 (r9) below.
-  vld1.32 {q2,q3}, [r12, :128]!
-  vsub.f32	q10, q12, q8
-  vadd.f32	q11, q0, q9
-  vadd.f32	q8, q15, q13
-  vld1.32 {q12,q13}, [r4, :128]
-  vsub.f32	q9, q1, q14
-  vsub.f32	q15, q11, q10
-  vsub.f32	q14, q9, q8
-  vadd.f32	q4, q12, q15  @
-  vsub.f32	q6, q12, q15  @ 
-  vsub.f32	q5, q13, q14  @
-  vadd.f32	q7, q13, q14  @
-  vld1.32 {q14,q15}, [r9, :128]
-  vld1.32 {q12,q13}, [r7, :128]
-  vmul.f32	q1, q14, q2
-  vmul.f32	q0, q14, q3
-@ Intermediate results go to data1 (r4) and data3 (r6) without writeback;
-@ both rows are re-read later in this pass before their final stores.
-  vst1.32 {q4,q5}, [r4, :128]
-  vmul.f32	q14, q15, q3
-  vmul.f32	q4, q15, q2
-  vadd.f32	q15, q9, q8
-  vst1.32 {q6,q7}, [r6, :128]
-  vmul.f32	q8, q12, q3
-  vmul.f32	q5, q13, q3
-  vmul.f32	q12, q12, q2
-  vmul.f32	q9, q13, q2
-  vadd.f32	q14, q14, q1
-  vsub.f32	q13, q4, q0
-  vadd.f32	q0, q9, q8
-  vld1.32 {q8,q9}, [r3, :128]
-  vadd.f32	q1, q11, q10
-  vsub.f32	q12, q12, q5
-  vadd.f32	q11, q8, q15
-  vsub.f32	q8, q8, q15
-  vadd.f32	q2, q12, q14
-  vsub.f32	q10, q0, q13
-  vadd.f32	q15, q0, q13
-  vadd.f32	q13, q9, q1
-  vsub.f32	q9, q9, q1
-  vsub.f32	q12, q12, q14
-  vadd.f32	q0, q11, q2
-  vadd.f32	q1, q13, q15
-  vsub.f32	q4, q11, q2
-  vadd.f32	q2, q8, q10  @
-  vsub.f32	q3, q9, q12  @
-@ Final stores for data0/2/4/6 use post-increment, advancing each row 32 bytes.
-  vst1.32 {q0,q1}, [r3, :128]!
-  vsub.f32	q5, q13, q15
-  vld1.32 {q14,q15}, [r10, :128]
-  vadd.f32	q7, q9, q12  @
-  vld1.32 {q12,q13}, [r8, :128]
-  vst1.32 {q2,q3}, [r5, :128]!
-@ Third table chunk: combined with rows data5 (r8) and data7 (r10).
-  vld1.32 {q2,q3}, [r12, :128]!
-  vsub.f32	q6, q8, q10  @
-  vmul.f32	q8, q14, q2
-  vst1.32 {q4,q5}, [r7, :128]!
-  vmul.f32	q10, q15, q3
-  vmul.f32	q9, q13, q3
-  vmul.f32	q11, q12, q2
-  vmul.f32	q14, q14, q3
-  vst1.32 {q6,q7}, [r9, :128]!
-  vmul.f32	q15, q15, q2
-  vmul.f32	q12, q12, q3
-  vmul.f32	q13, q13, q2
-  vadd.f32	q10, q10, q8
-  vsub.f32	q11, q11, q9
-  vld1.32 {q8,q9}, [r4, :128]
-  vsub.f32	q14, q15, q14
-  vadd.f32	q15, q13, q12
-  vadd.f32	q13, q11, q10
-  vadd.f32	q12, q15, q14
-  vsub.f32	q15, q15, q14
-  vsub.f32	q14, q11, q10
-  vld1.32 {q10,q11}, [r6, :128]
-  vadd.f32	q0, q8, q13
-  vadd.f32	q1, q9, q12
-  vadd.f32	q2, q10, q15  @
-  vsub.f32	q3, q11, q14  @
-  vsub.f32	q4, q8, q13
-@ Final stores for data1/3/5/7, again advancing each row 32 bytes.
-  vst1.32 {q0,q1}, [r4, :128]!
-  vsub.f32	q5, q9, q12
-  vsub.f32	q6, q10, q15  @
-  vst1.32 {q2,q3}, [r6, :128]!
-  vadd.f32	q7, q11, q14  @
-  vst1.32 {q4,q5}, [r8, :128]!
-  vst1.32 {q6,q7}, [r10, :128]!
-@ Flags still come from the adds near the top of the loop body; the NEON
-@ instructions in between do not set the core condition flags.
-	bne neon_x8_loop	
-
-@ Restore d8-d15 and the core registers; popping into pc returns to the caller.
-	vldmia	sp!, {d8-d15}
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-@ neon_static_x8_t_f
-@ neon.h declares a (float*, size_t, float*) style signature for this symbol,
-@ so on entry r0 = data (updated in place), r1 = size value, r2 = constant table.
-@ The arithmetic is the same instruction sequence as neon_static_x8_f; the
-@ difference is that every final store uses vst2.32, which interleaves the two
-@ Q registers element by element on the way out, instead of vst1.32.
-@ The loop runs r1 >> 5 passes.
-	.align 4
-#ifdef __APPLE__
-	.globl	_neon_static_x8_t_f
-_neon_static_x8_t_f:
-#else
-	.globl	neon_static_x8_t_f
-neon_static_x8_t_f:
-#endif
-  push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
-  vstmdb	sp!, {d8-d15}
-	mov r11, #0
-	add r3, r0, #0           @ data0
-	add r5, r0, r1, lsl #1   @ data2
-	add r4, r0, r1           @ data1
-	add r7, r5, r1, lsl #1   @ data4
-	add r6, r5, r1           @ data3
-	add r9, r7, r1, lsl #1   @ data6
-	add r8, r7, r1           @ data5
-	add r10, r9, r1          @ data7
-	add r12, r2, #0          @ LUT
-
-@ r11 = -(r1 >> 5): a negative pass counter that adds/bne counts up to zero.
-@ NOTE(review): the loop body runs before the counter is tested, so r1 < 32
-@ would start at 0 and not terminate as intended - presumably never passed.
-	sub r11, r11, r1, lsr #5
-neon_x8_t_loop: 
-@ First table chunk: combined with rows data2 (r5) and data3 (r6).
-  vld1.32 {q2,q3}, [r12, :128]!
-  vld1.32 {q14,q15}, [r6, :128]
-  vld1.32 {q10,q11}, [r5, :128]
-  adds	r11, r11, #1
-  vmul.f32	q12, q15, q2
-  vmul.f32	q8, q14, q3
-  vmul.f32	q13, q14, q2
-  vmul.f32	q9, q10, q3
-  vmul.f32	q1, q10, q2
-  vmul.f32	q0, q11, q2
-  vmul.f32	q14, q11, q3
-  vmul.f32	q15, q15, q3
-@ Second table chunk: combined with rows data4 (r7) and data6 (r9) below.
-  vld1.32 {q2,q3}, [r12, :128]!
-  vsub.f32	q10, q12, q8
-  vadd.f32	q11, q0, q9
-  vadd.f32	q8, q15, q13
-  vld1.32 {q12,q13}, [r4, :128]
-  vsub.f32	q9, q1, q14
-  vsub.f32	q15, q11, q10
-  vsub.f32	q14, q9, q8
-  vadd.f32	q4, q12, q15  @
-  vsub.f32	q6, q12, q15  @
-  vsub.f32	q5, q13, q14  @
-  vadd.f32	q7, q13, q14  @
-  vld1.32 {q14,q15}, [r9, :128]
-  vld1.32 {q12,q13}, [r7, :128]
-  vmul.f32	q1, q14, q2
-  vmul.f32	q0, q14, q3
-@ Intermediate results go to data1 (r4) and data3 (r6) with plain vst1 and no
-@ writeback; both rows are re-read later in this pass before their final stores.
-  vst1.32 {q4,q5}, [r4, :128]
-  vmul.f32	q14, q15, q3
-  vmul.f32	q4, q15, q2
-  vadd.f32	q15, q9, q8
-  vst1.32 {q6,q7}, [r6, :128]
-  vmul.f32	q8, q12, q3
-  vmul.f32	q5, q13, q3
-  vmul.f32	q12, q12, q2
-  vmul.f32	q9, q13, q2
-  vadd.f32	q14, q14, q1
-  vsub.f32	q13, q4, q0
-  vadd.f32	q0, q9, q8
-  vld1.32 {q8,q9}, [r3, :128]
-  vadd.f32	q1, q11, q10
-  vsub.f32	q12, q12, q5
-  vadd.f32	q11, q8, q15
-  vsub.f32	q8, q8, q15
-  vadd.f32	q2, q12, q14
-  vsub.f32	q10, q0, q13
-  vadd.f32	q15, q0, q13
-  vadd.f32	q13, q9, q1
-  vsub.f32	q9, q9, q1
-  vsub.f32	q12, q12, q14
-  vadd.f32	q0, q11, q2
-  vadd.f32	q1, q13, q15
-  vsub.f32	q4, q11, q2
-  vadd.f32	q2, q8, q10  @
-  vsub.f32	q3, q9, q12  @
-@ Final stores for data0/2/4/6: interleaving vst2 with 32-byte post-increment.
-  vst2.32 {q0,q1}, [r3, :128]!
-  vsub.f32	q5, q13, q15
-  vld1.32 {q14,q15}, [r10, :128]
-  vadd.f32	q7, q9, q12  @
-  vld1.32 {q12,q13}, [r8, :128]
-  vst2.32 {q2,q3}, [r5, :128]!
-@ Third table chunk: combined with rows data5 (r8) and data7 (r10).
-  vld1.32 {q2,q3}, [r12, :128]!
-  vsub.f32	q6, q8, q10  @
-  vmul.f32	q8, q14, q2
-  vst2.32 {q4,q5}, [r7, :128]!
-  vmul.f32	q10, q15, q3
-  vmul.f32	q9, q13, q3
-  vmul.f32	q11, q12, q2
-  vmul.f32	q14, q14, q3
-  vst2.32 {q6,q7}, [r9, :128]!
-  vmul.f32	q15, q15, q2
-  vmul.f32	q12, q12, q3
-  vmul.f32	q13, q13, q2
-  vadd.f32	q10, q10, q8
-  vsub.f32	q11, q11, q9
-  vld1.32 {q8,q9}, [r4, :128]
-  vsub.f32	q14, q15, q14
-  vadd.f32	q15, q13, q12
-  vadd.f32	q13, q11, q10
-  vadd.f32	q12, q15, q14
-  vsub.f32	q15, q15, q14
-  vsub.f32	q14, q11, q10
-  vld1.32 {q10,q11}, [r6, :128]
-  vadd.f32	q0, q8, q13
-  vadd.f32	q1, q9, q12
-  vadd.f32	q2, q10, q15  @
-  vsub.f32	q3, q11, q14  @
-  vsub.f32	q4, q8, q13
-@ Final stores for data1/3/5/7: interleaving vst2 with 32-byte post-increment.
-  vst2.32 {q0,q1}, [r4, :128]!
-  vsub.f32	q5, q9, q12
-  vsub.f32	q6, q10, q15  @
-  vst2.32 {q2,q3}, [r6, :128]!
-  vadd.f32	q7, q11, q14  @
-  vst2.32 {q4,q5}, [r8, :128]!
-  vst2.32 {q6,q7}, [r10, :128]!
-@ Flags still come from the adds near the top of the loop body; the NEON
-@ instructions in between do not set the core condition flags.
-	bne neon_x8_t_loop	
-	
-@ Restore d8-d15 and the core registers; popping into pc returns to the caller.
-	vldmia	sp!, {d8-d15}
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-
--- a/lib/ffts/src/neon_static_i.s
+++ b/lib/ffts/src/neon_static_i.s
@ -1,955 +0,0 @@
-/*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-@ neon_static_e_i
-@ neon.h declares this as void neon_static_e_i(ffts_plan_t*, const void*, void*);
-@ under the ARM procedure call standard that puts the plan in r0, the input
-@ in r1 and the output in r2 on entry.
-@ Same structure as neon_static_e_f in neon_static_f.s; the instructions that
-@ end in a bare "@" are the ones whose vadd/vsub sense is reversed relative to
-@ that routine (presumably "_i" is the inverse-transform variant - confirm).
-@ Layout of the routine:
-@   1. _neon_ee_loop   - p->i0 passes
-@   2. one straight-line block using constants from plan offset 12
-@   3. _neon_oo_loop   - p->i1 passes (skipped when p->i1 == 0)
-@   4. _neon_ee_loop2  - p->i1 passes (skipped when p->i1 == 0)
-	.align 4
-#ifdef __APPLE__
-	.globl	_neon_static_e_i
-_neon_static_e_i:
-#else
-	.globl	neon_static_e_i
-neon_static_e_i:
-#endif
-@ Save the callee-saved core registers and d8-d15, all of which are used below.
-  push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
-  vstmdb	sp!, {d8-d15}
-  ldr lr, [r0, #40]  @ this is p->N
-@ Build eight input pointers, each lr bytes after the previous one (lr is added
-@ unscaled). In address order: r3, r7, r5, r10, r4, r8, r6, r9.
-  add	r3, r1, #0	
-  add	r7, r1, lr 
-  add	r5, r7, lr 
-  add	r10, r5, lr
-  add	r4, r10, lr	
-  add	r8, r4, lr
-  add	r6, r8, lr
-  add	r9, r6, lr
-@ r12 = first word of the plan; it is dereferenced below as a table of 32-bit
-@ values (presumably output offsets - the field is not named in this file).
-  ldr	r12, [r0]
-@ From here on r1 = plan and r0 = output base.
-  add	r1, r0, #0
-  add	r0, r2, #0	
-  ldr	r2, [r1, #16]   @ this is p->ee_ws
-  ldr	r11, [r1, #28]  @ this is p->i0	
-	
-@ q8 (d16/d17) holds four constant floats from p->ee_ws for the whole loop.
-	vld1.32	{d16, d17}, [r2, :128]
-@ NOTE(review): this loop is entered without a zero check, unlike the two
-@ loops guarded by cmp/beq further down; p->i0 == 0 would wrap the counter.
-_neon_ee_loop:
-	vld2.32 {q15}, [r10, :128]!
-	vld2.32 {q13}, [r8, :128]!
-	vld2.32 {q14}, [r7, :128]!
-	vld2.32 {q9},  [r4, :128]!
-	vld2.32 {q10}, [r3, :128]!
-	vld2.32 {q11}, [r6, :128]!
-	vld2.32 {q12}, [r5, :128]!
-	vsub.f32	q1, q14, q13
-	vld2.32 {q0}, [r9, :128]!
-	subs	r11, r11, #1
-	vsub.f32	q2, q0, q15
-	vadd.f32	q0, q0, q15
-	vmul.f32	d10, d2, d17
-	vmul.f32	d11, d3, d16
-	vmul.f32	d12, d3, d17
-	vmul.f32	d6, d4, d17  
-	vmul.f32	d7, d5, d16  
-	vmul.f32	d8, d4, d16  
-	vmul.f32	d9, d5, d17  
-	vmul.f32	d13, d2, d16
-	vsub.f32	d7, d7, d6
-	vadd.f32	d11, d11, d10
-	vsub.f32	q1, q12, q11
-	vsub.f32	q2, q10, q9
-	vadd.f32	d6, d9, d8
-	vadd.f32	q4, q14, q13
-	vadd.f32	q11, q12, q11
-	vadd.f32	q12, q10, q9
-	vsub.f32	d10, d13, d12
-	vsub.f32	q7, q4, q0
-	vsub.f32	q9, q12, q11
-	vsub.f32	q13, q5, q3
-	vadd.f32	d29, d5, d2  @
-	vadd.f32	q5, q5, q3
-	vadd.f32	q10, q4, q0
-	vadd.f32	q11, q12, q11
-	vsub.f32	d31, d5, d2  @
-	vsub.f32	d28, d4, d3  @
-	vadd.f32	d30, d4, d3  @
-	vadd.f32	d5, d19, d14  @
-	vadd.f32	d7, d31, d26  @
-	vadd.f32	q1, q14, q5
-	vadd.f32	q0, q11, q10
-	vsub.f32	d6, d30, d27  @
-	vsub.f32	d4, d18, d15  @
-	vsub.f32	d13, d19, d14  @
-	vadd.f32	d12, d18, d15  @
-	vsub.f32	d15, d31, d26  @
-@ Fetch the next two table entries; each output address is r0 + 4 * entry.
-	ldr r2, [r12], #4
-	vtrn.32	q1, q3
-	ldr lr, [r12], #4
-	vtrn.32	q0, q2
-	add r2, r0, r2, lsl #2
-	vsub.f32	q4, q11, q10
-	add lr, r0, lr, lsl #2
-	vsub.f32	q5, q14, q5
-	vadd.f32	d14, d30, d27 @
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_ee_loop
-
-@ Straight-line block: one 16-byte load from each of the eight pointers.
-@ r11 = plan field at byte offset 12 (not named in this file); four floats
-@ are loaded from it into d20/d21 further down.
-	ldr	r11, [r1, #12]
-	vld2.32 {q9}, [r5, :128]! @tag2
-	vld2.32 {q13}, [r3, :128]! @tag0
-	vld2.32 {q12}, [r4, :128]! @tag1
-	vld2.32 {q0}, [r7, :128]! @tag4
-	vsub.f32	q11, q13, q12
-	vld2.32 {q8}, [r6, :128]! @tag3
-	vadd.f32	q12, q13, q12
-	vsub.f32	q10, q9, q8
-	vadd.f32	q8, q9, q8
-	vadd.f32	q9, q12, q8
-	vadd.f32	d9, d23, d20  @
-	vsub.f32	d11, d23, d20  @
-	vsub.f32	q8, q12, q8
-	vsub.f32	d8, d22, d21  @
-	vadd.f32	d10, d22, d21  @
-	ldr r2, [r12], #4
-	vld1.32	{d20, d21}, [r11, :128]
-	ldr lr, [r12], #4
-	vtrn.32	q9, q4
-	add r2, r0, r2, lsl #2
-	vtrn.32	q8, q5
-	add lr, r0, lr, lsl #2
-	vswp d9,d10
-@ The lr position receives two 32-byte stores; the r2 position receives one
-@ 64-byte vstmia at the end of this block.
-	vst1.32 {d8,d9,d10,d11}, [lr, :128]!
-	vld2.32 {q13}, [r10, :128]! @tag7
-	vld2.32 {q15}, [r9, :128]! @tag6
-	vld2.32 {q11}, [r8, :128]! @tag5
-	vsub.f32	q14, q15, q13
-	vsub.f32	q12, q0, q11
-	vadd.f32	q11, q0, q11
-	vadd.f32	q13, q15, q13
-	vadd.f32	d13, d29, d24  @
-	vadd.f32	q15, q13, q11
-	vsub.f32	d12, d28, d25  @
-	vsub.f32	d15, d29, d24  @
-	vadd.f32	d14, d28, d25  @
-	vtrn.32	q15, q6
-	vsub.f32	q15, q13, q11
-	vtrn.32	q15, q7
-	vswp d13, d14
-	vst1.32 {d12,d13,d14,d15}, [lr, :128]!
-	vtrn.32	q13, q14
-	vtrn.32	q11, q12
-	vmul.f32	d24, d26, d21
-	vmul.f32	d28, d27, d20
-	vmul.f32	d25, d26, d20
-	vmul.f32	d26, d27, d21
-	vmul.f32	d27, d22, d21
-	vmul.f32	d30, d23, d20
-	vmul.f32	d29, d23, d21
-	vmul.f32	d22, d22, d20
-	vsub.f32	d21, d28, d24
-	vadd.f32	d20, d26, d25
-	vadd.f32	d25, d30, d27
-	vsub.f32	d24, d22, d29
-	vadd.f32	q11, q12, q10
-	vsub.f32	q10, q12, q10
-	vadd.f32	q0, q9, q11
-	vsub.f32	q2, q9, q11
-	vadd.f32	d3, d17, d20  @
-	vsub.f32	d7, d17, d20  @
-	vsub.f32	d2, d16, d21  @
-	vadd.f32	d6, d16, d21  @
-	vswp d1, d2
-	vswp d5, d6
-	vstmia r2!, {q0-q3}
-
-@ Exchange r7 <-> r9 and r8 <-> r10, using r2 as scratch.
-  add	r2, r7, #0	
-  add	r7, r9, #0	
-  add	r9, r2, #0	
-  add	r2, r8, #0	
-  add	r8, r10, #0	
-  add	r10, r2, #0	
-  ldr	r11, [r1, #32]  @ this is p->i1	
-	cmp r11, #0
-	beq _neon_oo_loop_exit
-@ "oo" loop: add/sub and transpose only; no vmul appears in this loop body.
-_neon_oo_loop:
-	vld2.32 {q8}, [r6, :128]!
-	vld2.32 {q9}, [r5, :128]!
-	vld2.32 {q10}, [r4, :128]!
-	vld2.32 {q13}, [r3, :128]!
-	vadd.f32	q11, q9, q8
-	vsub.f32	q8, q9, q8
-	vsub.f32	q9, q13, q10
-	vadd.f32	q12, q13, q10
-	subs	r11, r11, #1
-	vld2.32 {q10}, [r7, :128]!
-	vld2.32 {q13}, [r9, :128]!
-	vsub.f32	q2, q12, q11
-	vsub.f32	d7, d19, d16  @
-	vadd.f32	d3, d19, d16  @
-	vadd.f32	d6, d18, d17  @
-	vsub.f32	d2, d18, d17  @
-	vld2.32 {q9}, [r8, :128]!
-	vld2.32 {q8}, [r10, :128]!
-	vadd.f32	q0, q12, q11
-	vadd.f32	q11, q13, q8
-	vadd.f32	q12, q10, q9
-	vsub.f32	q8, q13, q8
-	vsub.f32	q9, q10, q9
-	vsub.f32	q6, q12, q11
-	vadd.f32	q4, q12, q11
-	vtrn.32	q0, q2
-	ldr r2, [r12], #4
-	vsub.f32	d15, d19, d16  @
-	ldr lr, [r12], #4
-	vadd.f32	d11, d19, d16  @
-	vadd.f32	d14, d18, d17  @
-	vsub.f32	d10, d18, d17  @
-	add r2, r0, r2, lsl #2
-	vtrn.32	q1, q3
-	add lr, r0, lr, lsl #2
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_oo_loop
-_neon_oo_loop_exit:
-
-@ Exchange r3 <-> r7, r4 <-> r8, r5 <-> r9, r6 <-> r10 (r2 as scratch),
-@ then exchange r9 <-> r10.
-  add	r2, r3, #0
-  add	r3, r7, #0
-  add	r7, r2, #0
-  add	r2, r4, #0
-  add	r4, r8, #0
-  add	r8, r2, #0
-  add	r2, r5, #0
-  add	r5, r9, #0
-  add	r9, r2, #0
-  add	r2, r6, #0
-  add	r6, r10, #0
-  add	r10, r2, #0
-  add	r2, r9, #0	
-  add	r9, r10, #0	
-  add	r10, r2, #0	
-@ r2 = plan offset 16 again (commented as p->ee_ws at the top of the routine).
-  ldr	r2, [r1, #16]
-  ldr	r11, [r1, #32]  @ this is p->i1	
-	cmp r11, #0
-	beq _neon_ee_loop2_exit
-
-@ Reload the constants into q8 (overwritten since the first loop) and run the
-@ same instruction sequence as _neon_ee_loop for p->i1 passes.
-	vld1.32	{d16, d17}, [r2, :128]
-_neon_ee_loop2:
-	vld2.32 {q15}, [r10, :128]!
-	vld2.32 {q13}, [r8, :128]!
-	vld2.32 {q14}, [r7, :128]!
-	vld2.32 {q9},  [r4, :128]!
-	vld2.32 {q10}, [r3, :128]!
-	vld2.32 {q11}, [r6, :128]!
-	vld2.32 {q12}, [r5, :128]!
-	vsub.f32	q1, q14, q13
-	vld2.32 {q0}, [r9, :128]!
-	subs	r11, r11, #1
-	vsub.f32	q2, q0, q15
-	vadd.f32	q0, q0, q15
-	vmul.f32	d10, d2, d17
-	vmul.f32	d11, d3, d16
-	vmul.f32	d12, d3, d17
-	vmul.f32	d6, d4, d17  
-	vmul.f32	d7, d5, d16  
-	vmul.f32	d8, d4, d16  
-	vmul.f32	d9, d5, d17  
-	vmul.f32	d13, d2, d16
-	vsub.f32	d7, d7, d6
-	vadd.f32	d11, d11, d10
-	vsub.f32	q1, q12, q11
-	vsub.f32	q2, q10, q9
-	vadd.f32	d6, d9, d8
-	vadd.f32	q4, q14, q13
-	vadd.f32	q11, q12, q11
-	vadd.f32	q12, q10, q9
-	vsub.f32	d10, d13, d12
-	vsub.f32	q7, q4, q0
-	vsub.f32	q9, q12, q11
-	vsub.f32	q13, q5, q3
-	vadd.f32	d29, d5, d2  @
-	vadd.f32	q5, q5, q3
-	vadd.f32	q10, q4, q0
-	vadd.f32	q11, q12, q11
-	vsub.f32	d31, d5, d2  @
-	vsub.f32	d28, d4, d3  @
-	vadd.f32	d30, d4, d3  @
-	vadd.f32	d5, d19, d14  @
-	vadd.f32	d7, d31, d26  @
-	vadd.f32	q1, q14, q5
-	vadd.f32	q0, q11, q10
-	vsub.f32	d6, d30, d27  @
-	vsub.f32	d4, d18, d15  @
-	vsub.f32	d13, d19, d14  @
-	vadd.f32	d12, d18, d15  @
-	vsub.f32	d15, d31, d26  @
-	ldr r2, [r12], #4
-	vtrn.32	q1, q3
-	ldr lr, [r12], #4
-	vtrn.32	q0, q2
-	add r2, r0, r2, lsl #2
-	vsub.f32	q4, q11, q10
-	add lr, r0, lr, lsl #2
-	vsub.f32	q5, q14, q5
-	vadd.f32	d14, d30, d27 @
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_ee_loop2
-_neon_ee_loop2_exit:
-
-@ Restore d8-d15 and the core registers; popping into pc returns to the caller.
-	vldmia	sp!, {d8-d15}
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-
-
-
-	.align 4
-#ifdef __APPLE__
-	.globl	_neon_static_o_i
-_neon_static_o_i:
-#else
-	.globl	neon_static_o_i
-neon_static_o_i:
-#endif
-  push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
-  vstmdb	sp!, {d8-d15}
-  ldr lr, [r0, #40]  @ this is p->N
-  add	r3, r1, #0	
-  add	r7, r1, lr 
-  add	r5, r7, lr 
-  add	r10, r5, lr
-  add	r4, r10, lr	
-  add	r8, r4, lr
-  add	r6, r8, lr
-  add	r9, r6, lr
-  ldr	r12, [r0]
-  add	r1, r0, #0
-  add	r0, r2, #0	
-  ldr	r2, [r1, #16]   @ this is p->ee_ws
-  ldr	r11, [r1, #28]  @ this is p->i0	
-	
-	vld1.32	{d16, d17}, [r2, :128]
-_neon_ee_o_loop:
-	vld2.32 {q15}, [r10, :128]!
-	vld2.32 {q13}, [r8, :128]!
-	vld2.32 {q14}, [r7, :128]!
-	vld2.32 {q9},  [r4, :128]!
-	vld2.32 {q10}, [r3, :128]!
-	vld2.32 {q11}, [r6, :128]!
-	vld2.32 {q12}, [r5, :128]!
-	vsub.f32	q1, q14, q13
-	vld2.32 {q0}, [r9, :128]!
-	subs	r11, r11, #1
-	vsub.f32	q2, q0, q15
-	vadd.f32	q0, q0, q15
-	vmul.f32	d10, d2, d17
-	vmul.f32	d11, d3, d16
-	vmul.f32	d12, d3, d17
-	vmul.f32	d6, d4, d17  
-	vmul.f32	d7, d5, d16  
-	vmul.f32	d8, d4, d16  
-	vmul.f32	d9, d5, d17  
-	vmul.f32	d13, d2, d16
-	vsub.f32	d7, d7, d6
-	vadd.f32	d11, d11, d10
-	vsub.f32	q1, q12, q11
-	vsub.f32	q2, q10, q9
-	vadd.f32	d6, d9, d8
-	vadd.f32	q4, q14, q13
-	vadd.f32	q11, q12, q11
-	vadd.f32	q12, q10, q9
-	vsub.f32	d10, d13, d12
-	vsub.f32	q7, q4, q0
-	vsub.f32	q9, q12, q11
-	vsub.f32	q13, q5, q3
-	vadd.f32	d29, d5, d2  @
-	vadd.f32	q5, q5, q3
-	vadd.f32	q10, q4, q0
-	vadd.f32	q11, q12, q11
-	vsub.f32	d31, d5, d2  @
-	vsub.f32	d28, d4, d3  @
-	vadd.f32	d30, d4, d3  @
-	vadd.f32	d5, d19, d14  @
-	vadd.f32	d7, d31, d26  @
-	vadd.f32	q1, q14, q5
-	vadd.f32	q0, q11, q10
-	vsub.f32	d6, d30, d27  @
-	vsub.f32	d4, d18, d15  @
-	vsub.f32	d13, d19, d14  @
-	vadd.f32	d12, d18, d15  @
-	vsub.f32	d15, d31, d26  @
-	ldr r2, [r12], #4
-	vtrn.32	q1, q3
-	ldr lr, [r12], #4
-	vtrn.32	q0, q2
-	add r2, r0, r2, lsl #2
-	vsub.f32	q4, q11, q10
-	add lr, r0, lr, lsl #2
-	vsub.f32	q5, q14, q5
-	vadd.f32	d14, d30, d27 @
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_ee_o_loop
-
-  add	r2, r7, #0	
-  add	r7, r9, #0	
-  add	r9, r2, #0	
-  add	r2, r8, #0	
-  add	r8, r10, #0	
-  add	r10, r2, #0	
-  ldr	r11, [r1, #32]  @ this is p->i1	
-	cmp r11, #0
-	beq _neon_oo_o_loop_exit
-_neon_oo_o_loop:
-	vld2.32 {q8}, [r6, :128]!
-	vld2.32 {q9}, [r5, :128]!
-	vld2.32 {q10}, [r4, :128]!
-	vld2.32 {q13}, [r3, :128]!
-	vadd.f32	q11, q9, q8
-	vsub.f32	q8, q9, q8
-	vsub.f32	q9, q13, q10
-	vadd.f32	q12, q13, q10
-	subs	r11, r11, #1
-	vld2.32 {q10}, [r7, :128]!
-	vld2.32 {q13}, [r9, :128]!
-	vsub.f32	q2, q12, q11
-	vsub.f32	d7, d19, d16  @
-	vadd.f32	d3, d19, d16  @
-	vadd.f32	d6, d18, d17  @
-	vsub.f32	d2, d18, d17  @
-	vld2.32 {q9}, [r8, :128]!
-	vld2.32 {q8}, [r10, :128]!
-	vadd.f32	q0, q12, q11
-	vadd.f32	q11, q13, q8
-	vadd.f32	q12, q10, q9
-	vsub.f32	q8, q13, q8
-	vsub.f32	q9, q10, q9
-	vsub.f32	q6, q12, q11
-	vadd.f32	q4, q12, q11
-	vtrn.32	q0, q2
-	ldr r2, [r12], #4
-	vsub.f32	d15, d19, d16  @
-	ldr lr, [r12], #4
-	vadd.f32	d11, d19, d16  @
-	vadd.f32	d14, d18, d17  @
-	vsub.f32	d10, d18, d17  @
-	add r2, r0, r2, lsl #2
-	vtrn.32	q1, q3
-	add lr, r0, lr, lsl #2
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_oo_o_loop
-_neon_oo_o_loop_exit:
-
-	ldr	r11, [r1, #8]
-	vld1.32 {q8}, [r5, :128]!
-	vld1.32 {q10}, [r6, :128]!
-	vld2.32 {q11}, [r4, :128]!
-	vld2.32 {q13}, [r3, :128]!
-	vld2.32 {q15}, [r10, :128]!
-	vorr	d25, d17, d17
-	vorr	d24, d20, d20
-	vorr	d20, d16, d16
-	vsub.f32	q9, q13, q11
-	vadd.f32	q11, q13, q11
-	ldr r2, [r12], #4
-	vtrn.32	d24, d25
-	ldr lr, [r12], #4
-	vtrn.32	d20, d21
-	add r2, r0, r2, lsl #2
-	vsub.f32	q8, q10, q12
-	add lr, r0, lr, lsl #2
-	vadd.f32	q10, q10, q12
-	vadd.f32	q0, q11, q10
-	vadd.f32	d25, d19, d16  @
-	vsub.f32	d27, d19, d16  @
-	vsub.f32	q1, q11, q10
-	vsub.f32	d24, d18, d17  @
-	vadd.f32	d26, d18, d17  @
-	vtrn.32	q0, q12
-	vtrn.32	q1, q13
-	vld1.32	{d24, d25}, [r11, :128]
-	vswp d1, d2
-	vst1.32 {q0, q1}, [r2, :128]!
-	vld2.32 {q0}, [r9, :128]!
-	vadd.f32	q1, q0, q15
-	vld2.32 {q13}, [r8, :128]!
-	vld2.32 {q14}, [r7, :128]!
-	vsub.f32	q15, q0, q15
-	vsub.f32	q0, q14, q13
-	vadd.f32	q3, q14, q13
-	vadd.f32	q2, q3, q1
-	vadd.f32	d29, d1, d30  @
-	vsub.f32	d27, d1, d30  @
-	vsub.f32	q3, q3, q1
-	vsub.f32	d28, d0, d31  @
-	vadd.f32	d26, d0, d31  @
-	vtrn.32	q2, q14
-	vtrn.32	q3, q13
-	vswp d5, d6
-	vst1.32 {q2, q3}, [r2, :128]!
-	vtrn.32	q11, q9
-	vtrn.32	q10, q8
-	vmul.f32	d20, d18, d25
-	vmul.f32	d22, d19, d24
-	vmul.f32	d21, d19, d25
-	vmul.f32	d18, d18, d24
-	vmul.f32	d19, d16, d25
-	vmul.f32	d30, d17, d24
-	vmul.f32	d23, d16, d24
-	vmul.f32	d24, d17, d25
-	vadd.f32	d17, d22, d20
-	vsub.f32	d16, d18, d21
-	vsub.f32	d21, d30, d19
-	vadd.f32	d20, d24, d23
-	vadd.f32	q9, q8, q10
-	vsub.f32	q8, q8, q10
-	vadd.f32	q4, q14, q9
-	vsub.f32	q6, q14, q9
-	vadd.f32	d11, d27, d16  @
-	vsub.f32	d15, d27, d16  @
-	vsub.f32	d10, d26, d17  @
-	vadd.f32	d14, d26, d17  @
-	vswp d9, d10
-	vswp d13, d14
-	vstmia lr!, {q4-q7}
-
-
-  add	r2, r3, #0
-  add	r3, r7, #0
-  add	r7, r2, #0
-  add	r2, r4, #0
-  add	r4, r8, #0
-  add	r8, r2, #0
-  add	r2, r5, #0
-  add	r5, r9, #0
-  add	r9, r2, #0
-  add	r2, r6, #0
-  add	r6, r10, #0
-  add	r10, r2, #0
-  add	r2, r9, #0	
-  add	r9, r10, #0	
-  add	r10, r2, #0	
-  ldr	r2, [r1, #16]
-  ldr	r11, [r1, #32]  @ this is p->i1	
-	cmp r11, #0
-	beq _neon_ee_o_loop2_exit
-
-	vld1.32	{d16, d17}, [r2, :128]
-_neon_ee_o_loop2:
-	vld2.32 {q15}, [r10, :128]!
-	vld2.32 {q13}, [r8, :128]!
-	vld2.32 {q14}, [r7, :128]!
-	vld2.32 {q9},  [r4, :128]!
-	vld2.32 {q10}, [r3, :128]!
-	vld2.32 {q11}, [r6, :128]!
-	vld2.32 {q12}, [r5, :128]!
-	vsub.f32	q1, q14, q13
-	vld2.32 {q0}, [r9, :128]!
-	subs	r11, r11, #1
-	vsub.f32	q2, q0, q15
-	vadd.f32	q0, q0, q15
-	vmul.f32	d10, d2, d17
-	vmul.f32	d11, d3, d16
-	vmul.f32	d12, d3, d17
-	vmul.f32	d6, d4, d17  
-	vmul.f32	d7, d5, d16  
-	vmul.f32	d8, d4, d16  
-	vmul.f32	d9, d5, d17  
-	vmul.f32	d13, d2, d16
-	vsub.f32	d7, d7, d6
-	vadd.f32	d11, d11, d10
-	vsub.f32	q1, q12, q11
-	vsub.f32	q2, q10, q9
-	vadd.f32	d6, d9, d8
-	vadd.f32	q4, q14, q13
-	vadd.f32	q11, q12, q11
-	vadd.f32	q12, q10, q9
-	vsub.f32	d10, d13, d12
-	vsub.f32	q7, q4, q0
-	vsub.f32	q9, q12, q11
-	vsub.f32	q13, q5, q3
-	vadd.f32	d29, d5, d2  @
-	vadd.f32	q5, q5, q3
-	vadd.f32	q10, q4, q0
-	vadd.f32	q11, q12, q11
-	vsub.f32	d31, d5, d2  @
-	vsub.f32	d28, d4, d3  @
-	vadd.f32	d30, d4, d3  @
-	vadd.f32	d5, d19, d14  @
-	vadd.f32	d7, d31, d26  @
-	vadd.f32	q1, q14, q5
-	vadd.f32	q0, q11, q10
-	vsub.f32	d6, d30, d27  @
-	vsub.f32	d4, d18, d15  @
-	vsub.f32	d13, d19, d14  @
-	vadd.f32	d12, d18, d15  @
-	vsub.f32	d15, d31, d26  @
-	ldr r2, [r12], #4
-	vtrn.32	q1, q3
-	ldr lr, [r12], #4
-	vtrn.32	q0, q2
-	add r2, r0, r2, lsl #2
-	vsub.f32	q4, q11, q10
-	add lr, r0, lr, lsl #2
-	vsub.f32	q5, q14, q5
-	vadd.f32	d14, d30, d27 @
-	vst2.32 {q0,q1}, [r2, :128]!
-	vst2.32 {q2,q3}, [lr, :128]!
-	vtrn.32	q4, q6
-	vtrn.32	q5, q7
-	vst2.32 {q4,q5}, [r2, :128]!
-	vst2.32 {q6,q7}, [lr, :128]!
-	bne _neon_ee_o_loop2
-_neon_ee_o_loop2_exit:
-
-	vldmia	sp!, {d8-d15}
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-	.align	4
-#ifdef __APPLE__
-	.globl	_neon_static_x4_i
-_neon_static_x4_i:
-#else
-	.globl	neon_static_x4_i
-neon_static_x4_i:
-#endif
-@	add r3, r0, #0
-  push	{r4, r5, r6, lr}
-  vstmdb	sp!, {d8-d15}
-
-	vld1.32 {q8,q9}, [r0, :128]
-	add r4, r0, r1, lsl #1
-	vld1.32 {q10,q11}, [r4, :128]
-	add r5, r0, r1, lsl #2
-	vld1.32 {q12,q13}, [r5, :128]
-	add r6, r4, r1, lsl #2
-	vld1.32 {q14,q15}, [r6, :128]
-	vld1.32 {q2,q3}, [r2, :128]
-	
-	vmul.f32	q0, q13, q3
-	vmul.f32	q5, q12, q2
-	vmul.f32	q1, q14, q2
-	vmul.f32	q4, q14, q3
-	vmul.f32	q14, q12, q3
-	vmul.f32	q13, q13, q2
-	vmul.f32	q12, q15, q3
-	vmul.f32	q2, q15, q2
-	vsub.f32	q0, q5, q0
-	vadd.f32	q13, q13, q14
-	vadd.f32	q12, q12, q1
-	vsub.f32	q1, q2, q4
-	vadd.f32	q15, q0, q12
-	vsub.f32	q12, q0, q12
-	vadd.f32	q14, q13, q1
-	vsub.f32	q13, q13, q1
-	vadd.f32	q0, q8, q15
-	vadd.f32	q1, q9, q14
-	vsub.f32	q2, q10, q13  @
-	vsub.f32	q4, q8, q15
-	vadd.f32	q3, q11, q12  @
-	vst1.32 {q0,q1}, [r0, :128]
-	vsub.f32	q5, q9, q14
-	vadd.f32	q6, q10, q13  @
-	vsub.f32	q7, q11, q12  @
-	vst1.32 {q2,q3}, [r4, :128]
-	vst1.32 {q4,q5}, [r5, :128]
-	vst1.32 {q6,q7}, [r6, :128]
-	vldmia	sp!, {d8-d15}
-	pop	{r4, r5, r6, pc}
-
-
-
-	.align 4
-#ifdef __APPLE__
-	.globl	_neon_static_x8_i
-_neon_static_x8_i:
-#else
-	.globl	neon_static_x8_i
-neon_static_x8_i:
-#endif
-  push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
-  vstmdb	sp!, {d8-d15}
-	mov r11, #0
-	add r3, r0, #0           @ data0
-	add r5, r0, r1, lsl #1   @ data2
-	add r4, r0, r1           @ data1
-	add r7, r5, r1, lsl #1   @ data4
-	add r6, r5, r1           @ data3
-	add r9, r7, r1, lsl #1   @ data6
-	add r8, r7, r1           @ data5
-	add r10, r9, r1          @ data7
-	add r12, r2, #0          @ LUT
-
-	sub r11, r11, r1, lsr #5
-neon_x8_loop: 
-  vld1.32 {q2,q3}, [r12, :128]!
-  vld1.32 {q14,q15}, [r6, :128]
-  vld1.32 {q10,q11}, [r5, :128]
-  adds	r11, r11, #1
-  vmul.f32	q12, q15, q2
-  vmul.f32	q8, q14, q3
-  vmul.f32	q13, q14, q2
-  vmul.f32	q9, q10, q3
-  vmul.f32	q1, q10, q2
-  vmul.f32	q0, q11, q2
-  vmul.f32	q14, q11, q3
-  vmul.f32	q15, q15, q3
-  vld1.32 {q2,q3}, [r12, :128]!
-  vsub.f32	q10, q12, q8
-  vadd.f32	q11, q0, q9
-  vadd.f32	q8, q15, q13
-  vld1.32 {q12,q13}, [r4, :128]
-  vsub.f32	q9, q1, q14
-  vsub.f32	q15, q11, q10
-  vsub.f32	q14, q9, q8
-  vsub.f32	q4, q12, q15  @
-  vadd.f32	q6, q12, q15  @ 
-  vadd.f32	q5, q13, q14  @
-  vsub.f32	q7, q13, q14  @
-  vld1.32 {q14,q15}, [r9, :128]
-  vld1.32 {q12,q13}, [r7, :128]
-  vmul.f32	q1, q14, q2
-  vmul.f32	q0, q14, q3
-  vst1.32 {q4,q5}, [r4, :128]
-  vmul.f32	q14, q15, q3
-  vmul.f32	q4, q15, q2
-  vadd.f32	q15, q9, q8
-  vst1.32 {q6,q7}, [r6, :128]
-  vmul.f32	q8, q12, q3
-  vmul.f32	q5, q13, q3
-  vmul.f32	q12, q12, q2
-  vmul.f32	q9, q13, q2
-  vadd.f32	q14, q14, q1
-  vsub.f32	q13, q4, q0
-  vadd.f32	q0, q9, q8
-  vld1.32 {q8,q9}, [r3, :128]
-  vadd.f32	q1, q11, q10
-  vsub.f32	q12, q12, q5
-  vadd.f32	q11, q8, q15
-  vsub.f32	q8, q8, q15
-  vadd.f32	q2, q12, q14
-  vsub.f32	q10, q0, q13
-  vadd.f32	q15, q0, q13
-  vadd.f32	q13, q9, q1
-  vsub.f32	q9, q9, q1
-  vsub.f32	q12, q12, q14
-  vadd.f32	q0, q11, q2
-  vadd.f32	q1, q13, q15
-  vsub.f32	q4, q11, q2
-  vsub.f32	q2, q8, q10  @
-  vadd.f32	q3, q9, q12  @
-  vst1.32 {q0,q1}, [r3, :128]!
-  vsub.f32	q5, q13, q15
-  vld1.32 {q14,q15}, [r10, :128]
-  vsub.f32	q7, q9, q12  @
-  vld1.32 {q12,q13}, [r8, :128]
-  vst1.32 {q2,q3}, [r5, :128]!
-  vld1.32 {q2,q3}, [r12, :128]!
-  vadd.f32	q6, q8, q10  @
-  vmul.f32	q8, q14, q2
-  vst1.32 {q4,q5}, [r7, :128]!
-  vmul.f32	q10, q15, q3
-  vmul.f32	q9, q13, q3
-  vmul.f32	q11, q12, q2
-  vmul.f32	q14, q14, q3
-  vst1.32 {q6,q7}, [r9, :128]!
-  vmul.f32	q15, q15, q2
-  vmul.f32	q12, q12, q3
-  vmul.f32	q13, q13, q2
-  vadd.f32	q10, q10, q8
-  vsub.f32	q11, q11, q9
-  vld1.32 {q8,q9}, [r4, :128]
-  vsub.f32	q14, q15, q14
-  vadd.f32	q15, q13, q12
-  vadd.f32	q13, q11, q10
-  vadd.f32	q12, q15, q14
-  vsub.f32	q15, q15, q14
-  vsub.f32	q14, q11, q10
-  vld1.32 {q10,q11}, [r6, :128]
-  vadd.f32	q0, q8, q13
-  vadd.f32	q1, q9, q12
-  vsub.f32	q2, q10, q15  @
-  vadd.f32	q3, q11, q14  @
-  vsub.f32	q4, q8, q13
-  vst1.32 {q0,q1}, [r4, :128]!
-  vsub.f32	q5, q9, q12
-  vadd.f32	q6, q10, q15  @
-  vst1.32 {q2,q3}, [r6, :128]!
-  vsub.f32	q7, q11, q14  @
-  vst1.32 {q4,q5}, [r8, :128]!
-  vst1.32 {q6,q7}, [r10, :128]!
-	bne neon_x8_loop	
-
-	vldmia	sp!, {d8-d15}
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-	.align 4
-#ifdef __APPLE__
-	.globl	_neon_static_x8_t_i
-_neon_static_x8_t_i:
-#else
-	.globl	neon_static_x8_t_i
-neon_static_x8_t_i:
-#endif
-  push	{r4, r5, r6, r7, r8, r9, r10, r11, lr}
-  vstmdb	sp!, {d8-d15}
-	mov r11, #0
-	add r3, r0, #0           @ data0
-	add r5, r0, r1, lsl #1   @ data2
-	add r4, r0, r1           @ data1
-	add r7, r5, r1, lsl #1   @ data4
-	add r6, r5, r1           @ data3
-	add r9, r7, r1, lsl #1   @ data6
-	add r8, r7, r1           @ data5
-	add r10, r9, r1          @ data7
-	add r12, r2, #0          @ LUT
-
-	sub r11, r11, r1, lsr #5
-neon_x8_t_loop: 
-  vld1.32 {q2,q3}, [r12, :128]!
-  vld1.32 {q14,q15}, [r6, :128]
-  vld1.32 {q10,q11}, [r5, :128]
-  adds	r11, r11, #1
-  vmul.f32	q12, q15, q2
-  vmul.f32	q8, q14, q3
-  vmul.f32	q13, q14, q2
-  vmul.f32	q9, q10, q3
-  vmul.f32	q1, q10, q2
-  vmul.f32	q0, q11, q2
-  vmul.f32	q14, q11, q3
-  vmul.f32	q15, q15, q3
-  vld1.32 {q2,q3}, [r12, :128]!
-  vsub.f32	q10, q12, q8
-  vadd.f32	q11, q0, q9
-  vadd.f32	q8, q15, q13
-  vld1.32 {q12,q13}, [r4, :128]
-  vsub.f32	q9, q1, q14
-  vsub.f32	q15, q11, q10
-  vsub.f32	q14, q9, q8
-  vsub.f32	q4, q12, q15  @
-  vadd.f32	q6, q12, q15  @
-  vadd.f32	q5, q13, q14  @
-  vsub.f32	q7, q13, q14  @
-  vld1.32 {q14,q15}, [r9, :128]
-  vld1.32 {q12,q13}, [r7, :128]
-  vmul.f32	q1, q14, q2
-  vmul.f32	q0, q14, q3
-  vst1.32 {q4,q5}, [r4, :128]
-  vmul.f32	q14, q15, q3
-  vmul.f32	q4, q15, q2
-  vadd.f32	q15, q9, q8
-  vst1.32 {q6,q7}, [r6, :128]
-  vmul.f32	q8, q12, q3
-  vmul.f32	q5, q13, q3
-  vmul.f32	q12, q12, q2
-  vmul.f32	q9, q13, q2
-  vadd.f32	q14, q14, q1
-  vsub.f32	q13, q4, q0
-  vadd.f32	q0, q9, q8
-  vld1.32 {q8,q9}, [r3, :128]
-  vadd.f32	q1, q11, q10
-  vsub.f32	q12, q12, q5
-  vadd.f32	q11, q8, q15
-  vsub.f32	q8, q8, q15
-  vadd.f32	q2, q12, q14
-  vsub.f32	q10, q0, q13
-  vadd.f32	q15, q0, q13
-  vadd.f32	q13, q9, q1
-  vsub.f32	q9, q9, q1
-  vsub.f32	q12, q12, q14
-  vadd.f32	q0, q11, q2
-  vadd.f32	q1, q13, q15
-  vsub.f32	q4, q11, q2
-  vsub.f32	q2, q8, q10  @
-  vadd.f32	q3, q9, q12  @
-  vst2.32 {q0,q1}, [r3, :128]!
-  vsub.f32	q5, q13, q15
-  vld1.32 {q14,q15}, [r10, :128]
-  vsub.f32	q7, q9, q12  @
-  vld1.32 {q12,q13}, [r8, :128]
-  vst2.32 {q2,q3}, [r5, :128]!
-  vld1.32 {q2,q3}, [r12, :128]!
-  vadd.f32	q6, q8, q10  @
-  vmul.f32	q8, q14, q2
-  vst2.32 {q4,q5}, [r7, :128]!
-  vmul.f32	q10, q15, q3
-  vmul.f32	q9, q13, q3
-  vmul.f32	q11, q12, q2
-  vmul.f32	q14, q14, q3
-  vst2.32 {q6,q7}, [r9, :128]!
-  vmul.f32	q15, q15, q2
-  vmul.f32	q12, q12, q3
-  vmul.f32	q13, q13, q2
-  vadd.f32	q10, q10, q8
-  vsub.f32	q11, q11, q9
-  vld1.32 {q8,q9}, [r4, :128]
-  vsub.f32	q14, q15, q14
-  vadd.f32	q15, q13, q12
-  vadd.f32	q13, q11, q10
-  vadd.f32	q12, q15, q14
-  vsub.f32	q15, q15, q14
-  vsub.f32	q14, q11, q10
-  vld1.32 {q10,q11}, [r6, :128]
-  vadd.f32	q0, q8, q13
-  vadd.f32	q1, q9, q12
-  vsub.f32	q2, q10, q15  @
-  vadd.f32	q3, q11, q14  @
-  vsub.f32	q4, q8, q13
-  vst2.32 {q0,q1}, [r4, :128]!
-  vsub.f32	q5, q9, q12
-  vadd.f32	q6, q10, q15  @
-  vst2.32 {q2,q3}, [r6, :128]!
-  vsub.f32	q7, q11, q14  @
-  vst2.32 {q4,q5}, [r8, :128]!
-  vst2.32 {q6,q7}, [r10, :128]!
-	bne neon_x8_t_loop	
-	
-	vldmia	sp!, {d8-d15}
-	pop	{r4, r5, r6, r7, r8, r9, r10, r11, pc}
-
-
--- a/lib/ffts/src/patterns.c
+++ b/lib/ffts/src/patterns.c
@ -1,208 +0,0 @@
-/*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-#include "patterns.h"
-
-void permute_addr(int N, int offset, int stride, int *d) {
-    int i, a[4] = {0,2,1,3};
-    for(i=0;i<4;i++) {
-        d[i] = offset + (a[i] << stride);
-        if(d[i] < 0) d[i] += N;
-    }
-}
-
-void ffts_hardcodedleaf_is_rec(ptrdiff_t **is, int bigN, int N, int poffset, int offset, int stride, int even, int VL) {
- 
-	if(N > 4) {  
-    ffts_hardcodedleaf_is_rec(is, bigN, N/2, poffset, offset, stride + 1, even, VL);  
-		if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset+(1<<stride),offset+(N/2), stride + 2, 0, VL);
-		if(N/4 >= 4) ffts_hardcodedleaf_is_rec(is, bigN, N/4, poffset-(1<<stride),offset+(3*N/4), stride + 2, 0, VL);
-		else {
-			int temp = poffset+(1<<stride);
-			if(temp < 0) temp += bigN;
-			temp *= 2;
-
-			if(!(temp % (VL*2))) { 
-				(*is)[0] = poffset+(1<<stride);
-				(*is)[1] = poffset+(1<<stride)+(1<<(stride+2));
-				(*is)[2] = poffset-(1<<stride);
-				(*is)[3] = poffset-(1<<stride)+(1<<(stride+2));
-				int i;
-				for(i=0;i<4;i++) if((*is)[i] < 0) (*is)[i] += bigN;
-				for(i=0;i<4;i++) (*is)[i] *= 2; 
-				*is += 4;
-			}
-		}
-  }else if(N == 4) {
-		int perm[4];
-		permute_addr(bigN, poffset, stride, perm);
-		if(!((perm[0]*2) % (VL*2))) { 
-			int i;
-			for(i=0;i<4;i++) {
-				(*is)[i] = perm[i] * 2;
-				}
-			*is += 4;
-		}
-	}
-}
-
-void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL) {
-	int i, i0 = N/leafN/3+1, i1=N/leafN/3, i2 = N/leafN/3;
-	int stride = log(N/leafN)/log(2);
-	
-	p->is = malloc(N/VL * sizeof(ptrdiff_t));
-	
-	ptrdiff_t *is = p->is;
-
-	if((N/leafN) % 3 > 1) i1++;
-
-	for(i=0;i<i0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
-	for(i=i0;i<i0+i1;i++) {
-		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i, 0, stride+1, 1, VL);
-		ffts_hardcodedleaf_is_rec(&is, N, leafN/2, i-(1<<stride), 0, stride+1, 1, VL);
-	}
-	for(i=0-i2;i<0;i++) ffts_hardcodedleaf_is_rec(&is, N, leafN, i, 0, stride, 1, VL);
-
-
-//for(i=0;i<N/VL;i++) {
-//	printf("%td ", p->is[i]);
-//	if(i % 16 == 15) printf("\n");
-//}
-
-	p->i0 = i0; p->i1 = i1;
-}
-/**
- *
- *
- */
-void ffts_elaborate_offsets(ptrdiff_t *offsets, int leafN, int N, int ioffset, int ooffset, int stride, int even) {
-  if((even && N == leafN) || (!even && N <= leafN)) {
-		offsets[2*(ooffset/leafN)]   = ioffset*2;
-		offsets[2*(ooffset/leafN)+1] = ooffset;
-	}else if(N > 4) {
-		ffts_elaborate_offsets(offsets, leafN, N/2, ioffset, ooffset, stride+1, even);
-		ffts_elaborate_offsets(offsets, leafN, N/4, ioffset+(1<<stride), ooffset+N/2, stride+2, 0);
-  	if(N/4 >= leafN) 
-		ffts_elaborate_offsets(offsets, leafN, N/4, ioffset-(1<<stride), ooffset+3*N/4, stride+2, 0);
-	}
-
-}
-
-int compare_offsets(const void *a, const void *b) {
-	return ((ptrdiff_t *)a)[0] - ((ptrdiff_t *)b)[0];
-}
-
-uint32_t reverse_bits(uint32_t a, int n) {
-	uint32_t x = 0;
-
-	int i;
-	for(i=0;i<n;i++) {
-		if(a & (1 << i)) x |= 1 << (n-i-1);
-	}
-	return x;
-}
-
-
-void ffts_init_offsets(ffts_plan_t *p, int N, int leafN) {
-
-	ptrdiff_t *offsets = malloc(2 * N/leafN * sizeof(ptrdiff_t));
-
-	ffts_elaborate_offsets(offsets, leafN, N, 0, 0, 1, 1);
-
-	size_t i;
-	for(i=0;i<2*N/leafN;i+=2) {
-		if(offsets[i] < 0) offsets[i] = N + offsets[i];
-	}
-	
-	qsort(offsets, N/leafN, 2 * sizeof(ptrdiff_t), compare_offsets); 
-	//elaborate_is(p, N, 0, 0, 1);
-	p->offsets = malloc(N/leafN * sizeof(ptrdiff_t));
-	for(i=0;i<N/leafN;i++) {
-		p->offsets[i] = offsets[i*2+1]*2;
-	}
-//for(i=0;i<N/leafN;i++) {
-//	printf("%4d %4d\n", p->offsets[i], reverse_bits(p->offsets[i], __builtin_ctzl(2*N)));
-//}
-	free(offsets);
-}
-
-/*
-int tree_count(int N, int leafN, int offset) {
-	
-	if(N <= leafN) return 0;
-	int count = 0;	
-	count += tree_count(N/4, leafN, offset);
-	count += tree_count(N/8, leafN, offset + N/4);
-	count += tree_count(N/8, leafN, offset + N/4 + N/8);
-	count += tree_count(N/4, leafN, offset + N/2);
-	count += tree_count(N/4, leafN, offset + 3*N/4);
-
-	return 1 + count;
-}
-
-void elaborate_tree(transform_index_t **p, int N, int leafN, int offset) {
-	
-	if(N <= leafN) return;
-	elaborate_tree(p, N/4, leafN, offset);
-	elaborate_tree(p, N/8, leafN, offset + N/4);
-	elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
-	elaborate_tree(p, N/4, leafN, offset + N/2);
-	elaborate_tree(p, N/4, leafN, offset + 3*N/4);
-
-	(*p)[0] = N;
-	(*p)[1] = offset*2;
-
-	(*p)+=2;
-}
-
-void ffts_init_tree(ffts_plan_t *p, int N, int leafN) {
-
-	int count = tree_count(N, leafN, 0) + 1;
-	transform_index_t *ps = p->transforms = malloc(count * 2 * sizeof(transform_index_t));
-
-//printf("count = %d\n", count);
-
-	elaborate_tree(&ps, N, leafN, 0);
-	#ifdef __ARM_NEON__
-	ps -= 2;
-	#endif
-	ps[0] = 0;
-	ps[1] = 0;
-//int i;
-//for(i=0;i<count;i++) {
-//	fprintf(stderr, "%lu %lu - %d\n", p->transforms[i*2], p->transforms[i*2+1],
-//		__builtin_ctzl(p->transforms[i*2]) - 5);
-//}
-
-}
-*/
--- a/lib/ffts/src/patterns.h
+++ b/lib/ffts/src/patterns.h
@ -1,44 +1,520 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+Copyright (c) 2015, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */

+#ifndef FFTS_PATTERNS_H
+#define FFTS_PATTERNS_H
+
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
+#endif
+
+#include <stddef.h>
+
+#ifdef HAVE_STDLIB_H
+#include <stdlib.h>
+#endif
+
+#ifndef LEAF_N
+#define LEAF_N 8
+#endif
+
+/*
+ * Forward declarations of the static helpers defined further down in this
+ * header, so that they can call one another regardless of definition order
+ * (for example ffts_elaborate_offsets_even8() calls
+ * ffts_elaborate_offsets_odd8(), which is defined after it).
+ *
+ * Two sets exist: variants hard-wired for LEAF_N == 8, and generic variants
+ * that take the leaf size (leaf_N / N) as parameters for every other value.
+ */
+#if LEAF_N == 8
+static void
+ffts_elaborate_offsets_even8(ptrdiff_t *const offsets,
+                             int log_N);
+
+static void
+ffts_elaborate_offsets_odd8(ptrdiff_t *const offsets,
+                            int log_N,
+                            int input_offset,
+                            int output_offset,
+                            int stride);
+
+static void
+ffts_hardcodedleaf_is_rec_even4(ptrdiff_t **is,
+                                int big_N,
+                                int offset,
+                                int stride,
+                                int VL);
+
+static void
+ffts_hardcodedleaf_is_rec_even8(ptrdiff_t **is,
+                                int big_N,
+                                int offset,
+                                int stride,
+                                int VL);
+#else
+/* Generic variants: the leaf size is passed in instead of being fixed at 8. */
+static void
+ffts_elaborate_offsets_even(ptrdiff_t *const offsets,
+                            int leaf_N,
+                            int N,
+                            int input_offset,
+                            int output_offset,
+                            int stride);
+
+static void
+ffts_elaborate_offsets_odd(ptrdiff_t *const offsets,
+                           int leaf_N,
+                           int N,
+                           int input_offset,
+                           int output_offset,
+                           int stride);
+
+static void
+ffts_hardcodedleaf_is_rec_even(ptrdiff_t **is,
+                               int big_N,
+                               int N,
+                               int offset,
+                               int stride,
+                               int VL);
+
+static void
+ffts_hardcodedleaf_is_rec_odd(ptrdiff_t **is,
+                              int big_N,
+                              int N,
+                              int offset,
+                              int stride,
+                              int VL);
+#endif
+
+/*
+ * Three-way comparison of the ptrdiff_t values stored at pa and pb.
+ *
+ * Returns -1, 0 or 1 when *pa is less than, equal to or greater than *pb.
+ * Only the first ptrdiff_t behind each pointer is read. The signature is
+ * the one qsort() expects of a comparator. Explicit comparisons are used
+ * rather than a subtraction, so the result never depends on narrowing a
+ * ptrdiff_t difference to int.
+ */
+static int
+ffts_compare_offsets(const void *pa, const void *pb)
+{
+    const ptrdiff_t lhs = *(const ptrdiff_t*) pa;
+    const ptrdiff_t rhs = *(const ptrdiff_t*) pb;
+
+    if (lhs < rhs) {
+        return -1;
+    }
+
+    return (lhs > rhs) ? 1 : 0;
+}
+
+/*
+ * Fills d[0..3] with offset + (k << stride), taking k in the order
+ * 0, 2, 1, 3. A negative result is wrapped by adding N once.
+ *
+ * N       value added to negative results
+ * offset  base value added to every entry
+ * stride  left-shift count applied to k
+ * d       output array with room for four ints
+ */
+static void
+ffts_permute_addr(int N, int offset, int stride, int *const d)
+{
+    int order[4] = {0,2,1,3};
+    int k;
+
+    for (k = 0; k < 4; ++k) {
+        const int addr = offset + (order[k] << stride);
+
+        d[k] = (addr < 0) ? (addr + N) : addr;
+    }
+}
+
+#if LEAF_N == 8
+/*
+ * Writes entries of the offsets array for the LEAF_N == 8 case.
+ *
+ * The first eight entries are stored directly, as four pairs whose second
+ * members are 0, 8, 16 and 24. After that, while the level counter (which
+ * starts at log_N) is above 5, ffts_elaborate_offsets_odd8() is called
+ * twice per level: once with input offset +step and once with -step.
+ * step starts at 1 and doubles after every level.
+ */
+static void
+ffts_elaborate_offsets_even8(ptrdiff_t *const offsets, int log_N)
+{
+    const int base = 1 << (log_N - 4);
+    int level;
+    int step;
+
+    offsets[0] = 0;
+    offsets[1] = 0;
+    offsets[2] = base * 2;
+    offsets[3] = 8;
+    offsets[4] = base;
+    offsets[5] = 16;
+    offsets[6] = -base;
+    offsets[7] = 24;
+
+    step = 1;
+    for (level = log_N; level > 5; --level) {
+        ffts_elaborate_offsets_odd8(offsets, level - 2,
+            step, 1 << (level - 1), step * 4);
+
+        ffts_elaborate_offsets_odd8(offsets, level - 2,
+            -step, 3 * (1 << (level - 2)), step * 4);
+
+        step *= 2;
+    }
+}
+
+/*
+ * Recursive helper that stores (input_offset * 2, output_offset) pairs
+ * into offsets, starting at index output_offset / 4.
+ *
+ * log_N > 4:  recurse three times: on log_N - 1 with the same offsets and
+ *             a doubled stride, then twice on log_N - 2 with the input
+ *             offset moved by +stride / -stride, the output offset moved
+ *             by 1 << (log_N - 1) / 3 * (1 << (log_N - 2)), and a stride
+ *             four times as large.
+ * log_N <= 4: store one pair; when log_N is exactly 4 also store a second
+ *             pair ((input_offset + stride) * 2, output_offset + 8)
+ *             directly after it.
+ */
+static void
+ffts_elaborate_offsets_odd8(ptrdiff_t *const offsets,
+                            int log_N,
+                            int input_offset,
+                            int output_offset,
+                            int stride)
+{
+    ptrdiff_t *slot;
+
+    if (log_N > 4) {
+        ffts_elaborate_offsets_odd8(offsets, log_N - 1, input_offset,
+            output_offset, stride * 2);
+
+        ffts_elaborate_offsets_odd8(offsets, log_N - 2, input_offset + stride,
+            output_offset + (1 << (log_N - 1)), stride * 4);
+
+        ffts_elaborate_offsets_odd8(offsets, log_N - 2, input_offset - stride,
+            output_offset + 3 * (1 << (log_N - 2)), stride * 4);
+
+        return;
+    }
+
+    slot = offsets + (output_offset / 4);
+
+    slot[0] = input_offset * 2;
+    slot[1] = output_offset;
+
+    if (log_N == 4) {
+        slot[2] = (input_offset + stride) * 2;
+        slot[3] = output_offset + 8;
+    }
+}
+
+/*
+ * Appends one group of four indices to the array that *is points into.
+ *
+ * The four candidates come from ffts_permute_addr(big_N, offset, stride).
+ * They are emitted, each doubled, only when twice the first candidate is
+ * a multiple of 2 * VL; *is is then advanced by four entries. Otherwise
+ * nothing is written and *is is left unchanged.
+ */
+static void
+ffts_hardcodedleaf_is_rec_even4(ptrdiff_t **is,
+                                int big_N,
+                                int offset,
+                                int stride,
+                                int VL)
+{
+    int perm[4];
+    int k;
+
+    ffts_permute_addr(big_N, offset, stride, perm);
+
+    if ((2 * perm[0]) % (2 * VL) != 0) {
+        return;
+    }
+
+    for (k = 0; k < 4; ++k) {
+        (*is)[k] = 2 * perm[k];
+    }
+
+    *is += 4;
+}
+
+/*
+ * Appends up to two groups of four indices to the array *is points into.
+ *
+ * The first group is delegated to ffts_hardcodedleaf_is_rec_even4() with
+ * stride + 1. The second group is emitted only when
+ * offset + (1 << stride), wrapped by big_N if negative and then doubled,
+ * is a multiple of 2 * VL. It consists of offset +/- (1 << stride) and the
+ * same two values plus 1 << (stride + 2). Each is wrapped by big_N when
+ * negative and then doubled, and *is is advanced by four entries.
+ */
+static void
+ffts_hardcodedleaf_is_rec_even8(ptrdiff_t **is,
+                                int big_N,
+                                int offset,
+                                int stride,
+                                int VL)
+{
+    int probe;
+    int k;
+
+    ffts_hardcodedleaf_is_rec_even4(is, big_N, offset, stride + 1, VL);
+
+    probe = offset + (1 << stride);
+    if (probe < 0) {
+        probe += big_N;
+    }
+
+    /* skip the second group unless its first index is aligned */
+    if ((2 * probe) % (2 * VL) != 0) {
+        return;
+    }
+
+    (*is)[0] = offset + (1 << stride);
+    (*is)[1] = offset + (1 << stride) + (1 << (stride + 2));
+    (*is)[2] = offset - (1 << stride);
+    (*is)[3] = offset - (1 << stride) + (1 << (stride + 2));
+
+    for (k = 0; k < 4; ++k) {
+        if ((*is)[k] < 0) {
+            (*is)[k] += big_N;
+        }
+
+        (*is)[k] *= 2;
+    }
+
+    *is += 4;
+}
+#else
+/*
+ * Generic-leaf variant for the even branch of the recursion.
+ *
+ * N == leaf_N: store the pair (input_offset * 2, output_offset) at index
+ *              2 * (output_offset / leaf_N) and stop.
+ * N > 4:       recurse on N/2 through this function with a doubled stride,
+ *              then on N/4 through ffts_elaborate_offsets_odd() with the
+ *              input offset moved by +stride and the output offset by N/2.
+ *              Only if N/4 >= leaf_N, recurse once more on N/4 with the
+ *              input offset moved by -stride and the output by 3*N/4.
+ * otherwise:   nothing is written.
+ */
+static void
+ffts_elaborate_offsets_even(ptrdiff_t *const offsets,
+                            int leaf_N,
+                            int N,
+                            int input_offset,
+                            int output_offset,
+                            int stride)
+{
+    if (N == leaf_N) {
+        const int slot = 2 * (output_offset / leaf_N);
+
+        offsets[slot + 0] = input_offset * 2;
+        offsets[slot + 1] = output_offset;
+        return;
+    }
+
+    if (N <= 4) {
+        return;
+    }
+
+    ffts_elaborate_offsets_even(offsets, leaf_N,
+        N/2, input_offset, output_offset, stride * 2);
+
+    ffts_elaborate_offsets_odd(offsets, leaf_N,
+        N/4, input_offset + stride, output_offset + N/2, stride * 4);
+
+    if (N/4 < leaf_N) {
+        return;
+    }
+
+    ffts_elaborate_offsets_odd(offsets, leaf_N,
+        N/4, input_offset - stride, output_offset + 3*N/4, stride * 4);
+}

-#ifndef __PATTERNS_H__
-#define __PATTERNS_H__
+/*
+ * "Odd" counterpart of ffts_elaborate_offsets_even().
+ *
+ * When N <= leaf_N (note: <=, where the even variant tests ==) the pair slot
+ * selected by output_offset / leaf_N receives input_offset * 2 and
+ * output_offset. Otherwise, for N > 4, it recurses into itself for the
+ * half-size part and for up to two quarter-size parts; the quarter at
+ * input_offset - stride is visited only when N/4 >= leaf_N.
+ */
+static void
+ffts_elaborate_offsets_odd(ptrdiff_t *const offsets,
+                           int leaf_N,
+                           int N,
+                           int input_offset,
+                           int output_offset,
+                           int stride)
+{
+    if (N <= leaf_N) {
+        offsets[2 * (output_offset / leaf_N) + 0] = input_offset * 2;
+        offsets[2 * (output_offset / leaf_N) + 1] = output_offset;
+    } else if (N > 4) {
+        ffts_elaborate_offsets_odd(offsets, leaf_N, N/2,
+            input_offset, output_offset, stride * 2);

-#include "ffts.h"
+        ffts_elaborate_offsets_odd(offsets, leaf_N, N/4,
+            input_offset + stride, output_offset + N/2, stride * 4);

-void ffts_init_is(ffts_plan_t *p, int N, int leafN, int VL); 
-void ffts_init_offsets(ffts_plan_t *p, int N, int leafN); 
-//void ffts_init_tree(ffts_plan_t *p, int N, int leafN); 
+        if (N/4 >= leaf_N) {
+            ffts_elaborate_offsets_odd(offsets, leaf_N, N/4,
+                input_offset - stride, output_offset + 3*N/4, stride * 4);
+        }
+    }
+}

+/*
+ * Recursively emit 4-entry groups of doubled input indices for a leaf of
+ * size N (generic leaf size variant).
+ *
+ * N == 4: emit the permutation returned by ffts_permute_addr().
+ * N  > 4: recurse into the half-size even part, then either recurse into two
+ *         quarter-size odd parts (N/4 >= 4) or emit one group built from
+ *         offset +/- (1 << stride), wrapping negative indices by big_N.
+ * Any other N does nothing.
+ *
+ * A group is written, and *is advanced by four, only when its first doubled
+ * index is a multiple of 2 * VL.
+ */
+static void
+ffts_hardcodedleaf_is_rec_even(ptrdiff_t **is,
+                               int big_N,
+                               int N,
+                               int offset,
+                               int stride,
+                               int VL)
+{
+    ptrdiff_t *out;
+    int first, step, span, k;
+
+    if (N == 4) {
+        int perm[4];
+
+        ffts_permute_addr(big_N, offset, stride, perm);
+
+        if ((2 * perm[0]) % (2 * VL) == 0) {
+            out = *is;
+
+            for (k = 0; k < 4; k++) {
+                out[k] = 2 * perm[k];
+            }
+
+            *is = out + 4;
+        }
+
+        return;
+    }
+
+    if (N < 4) {
+        return;
+    }
+
+    ffts_hardcodedleaf_is_rec_even(is, big_N, N/2, offset, stride + 1, VL);
+
+    step = 1 << stride;
+
+    if (N/4 >= 4) {
+        ffts_hardcodedleaf_is_rec_odd(
+            is, big_N, N/4, offset + step, stride + 2, VL);
+        ffts_hardcodedleaf_is_rec_odd(
+            is, big_N, N/4, offset - step, stride + 2, VL);
+        return;
+    }
+
+    first = offset + step;
+    if (first < 0) {
+        first += big_N;
+    }
+
+    if ((2 * first) % (2 * VL) != 0) {
+        return;
+    }
+
+    span = 1 << (stride + 2);
+
+    /* *is must be read after the recursive call, which may advance it */
+    out = *is;
+    out[0] = offset + step;
+    out[1] = offset + step + span;
+    out[2] = offset - step;
+    out[3] = offset - step + span;
+
+    for (k = 0; k < 4; k++) {
+        if (out[k] < 0) {
+            out[k] += big_N;
+        }
+
+        out[k] *= 2;
+    }
+
+    *is = out + 4;
+}
+
+/*
+ * "Odd" counterpart of ffts_hardcodedleaf_is_rec_even(): identical except
+ * that the half-size recursion also goes through the odd variant.
+ *
+ * N == 4: emit the permutation returned by ffts_permute_addr().
+ * N  > 4: recurse for N/2, then either recurse for both quarters
+ *         (N/4 >= 4) or emit one group built from offset +/- (1 << stride),
+ *         wrapping negative indices by big_N.
+ * Any other N does nothing.
+ *
+ * A group is written, and *is advanced by four, only when its first doubled
+ * index is a multiple of 2 * VL.
+ */
+static void
+ffts_hardcodedleaf_is_rec_odd(ptrdiff_t **is,
+                              int big_N,
+                              int N,
+                              int offset,
+                              int stride,
+                              int VL)
+{
+    ptrdiff_t *dst;
+    int head, delta, wide, n;
+
+    if (N == 4) {
+        int perm[4];
+
+        ffts_permute_addr(big_N, offset, stride, perm);
+
+        if ((2 * perm[0]) % (2 * VL) == 0) {
+            dst = *is;
+
+            for (n = 0; n < 4; n++) {
+                dst[n] = 2 * perm[n];
+            }
+
+            *is = dst + 4;
+        }
+
+        return;
+    }
+
+    if (N < 4) {
+        return;
+    }
+
+    ffts_hardcodedleaf_is_rec_odd(is, big_N, N/2, offset, stride + 1, VL);
+
+    delta = 1 << stride;
+
+    if (N/4 >= 4) {
+        ffts_hardcodedleaf_is_rec_odd(
+            is, big_N, N/4, offset + delta, stride + 2, VL);
+        ffts_hardcodedleaf_is_rec_odd(
+            is, big_N, N/4, offset - delta, stride + 2, VL);
+        return;
+    }
+
+    head = offset + delta;
+    if (head < 0) {
+        head += big_N;
+    }
+
+    if ((2 * head) % (2 * VL) != 0) {
+        return;
+    }
+
+    wide = 1 << (stride + 2);
+
+    /* *is must be read after the recursive call, which may advance it */
+    dst = *is;
+    dst[0] = offset + delta;
+    dst[1] = offset + delta + wide;
+    dst[2] = offset - delta;
+    dst[3] = offset - delta + wide;
+
+    for (n = 0; n < 4; n++) {
+        if (dst[n] < 0) {
+            dst[n] += big_N;
+        }
+
+        dst[n] *= 2;
+    }
+
+    *is = dst + 4;
+}
+#endif
+
+/*
+ * Allocate and fill the table of input indices for a transform of size N.
+ *
+ * N       transform size
+ * leaf_N  leaf size; the LEAF_N == 8 build uses the specialised
+ *         even4/even8 emitters, any other build the generic recursive one
+ * VL      forwarded to the emitters, which only emit groups starting on a
+ *         2 * VL boundary
+ *
+ * Returns a malloc()ed array with room for N / VL entries, or NULL when the
+ * allocation fails.
+ * NOTE(review): nothing here checks that the emitters write at most N / VL
+ * entries, and leaf_N == 0 or VL == 0 would divide by zero -- confirm the
+ * callers guarantee both.
+ */
+static ptrdiff_t*
+ffts_init_is(size_t N, size_t leaf_N, int VL)
+{
+    int i, i0, i1, i2;
+    /* presumably ffts_ctzl() counts trailing zero bits, i.e. log2(N/leaf_N)
+     * for power-of-two sizes -- TODO confirm against its definition */
+    int stride = ffts_ctzl(N/leaf_N);
+    ptrdiff_t *is, *pis;
+
+    is = malloc(N / VL * sizeof(*is));
+    if (!is) {
+        return NULL;
+    }
+
+    /* split the N/leaf_N leaves into three runs of i0, i1 and i2 leaves */
+    i0 = N/leaf_N/3 + 1;
+    i1 = i2 = N/leaf_N/3;
+    if ((N/leaf_N) % 3 > 1) {
+        i1++;
+    }
+
+    /* pis is the write cursor; every emitter advances it past what it wrote */
+    pis = is;
+
+#if LEAF_N == 8
+    /* first run: full-size leaves at offsets 0 .. i0-1 */
+    for (i = 0; i < i0; i++) {
+        ffts_hardcodedleaf_is_rec_even8(
+            &pis, N, i, stride, VL);
+    }
+
+    /* second run: two half-size leaves per offset, at i and i - (1 << stride) */
+    for (i = i0; i < i0 + i1; i++) {
+        ffts_hardcodedleaf_is_rec_even4(
+            &pis, N, i, stride + 1, VL);
+        ffts_hardcodedleaf_is_rec_even4(
+            &pis, N, i - (1 << stride), stride + 1, VL);
+    }
+
+    /* third run: full-size leaves at the negative offsets -i2 .. -1 */
+    for (i = 0 - i2; i < 0; i++) {
+        ffts_hardcodedleaf_is_rec_even8(
+            &pis, N, i, stride, VL);
+    }
+#else
+    /* same three runs using the generic recursive emitter */
+    for (i = 0; i < i0; i++) {
+        ffts_hardcodedleaf_is_rec_even(
+            &pis, N, leaf_N, i, stride, VL);
+    }
+
+    for (i = i0; i < i0 + i1; i++) {
+        ffts_hardcodedleaf_is_rec_even(
+            &pis, N, leaf_N / 2, i, stride + 1, VL);
+        ffts_hardcodedleaf_is_rec_even(
+            &pis, N, leaf_N / 2, i - (1 << stride), stride + 1, VL);
+    }
+
+    for (i = 0 - i2; i < 0; i++) {
+        ffts_hardcodedleaf_is_rec_even(
+            &pis, N, leaf_N, i, stride, VL);
+    }
 #endif
+
+    return is;
+}
+
+/*
+ * Allocate and fill the table of output offsets, one per leaf.
+ *
+ * A scratch array of (input, output) pairs is produced by the
+ * ffts_elaborate_offsets_* helpers, negative input offsets are wrapped by
+ * adding N, the pairs are sorted with ffts_compare_offsets (presumably on the
+ * input offset -- confirm against the comparator), and twice the output
+ * offset of each sorted pair is stored in the result.
+ *
+ * Returns a malloc()ed array of N/leaf_N entries, or NULL when either
+ * allocation fails.
+ */
+static ptrdiff_t*
+ffts_init_offsets(size_t N, size_t leaf_N)
+{
+    ptrdiff_t *result;
+    ptrdiff_t *pairs;
+    size_t k;
+
+    result = malloc(N/leaf_N * sizeof(*result));
+    if (result == NULL) {
+        return NULL;
+    }
+
+    pairs = malloc(2 * N/leaf_N * sizeof(*pairs));
+    if (pairs == NULL) {
+        free(result);
+        return NULL;
+    }
+
+#if LEAF_N == 8
+    ffts_elaborate_offsets_even8(pairs, ffts_ctzl(N));
+#else
+    ffts_elaborate_offsets_even(pairs, leaf_N, N, 0, 0, 1);
+#endif
+
+    /* even slots hold the input offsets; wrap the negative ones into range */
+    k = 0;
+    while (k < 2*N/leaf_N) {
+        if (pairs[k] < 0) {
+            pairs[k] += N;
+        }
+
+        k += 2;
+    }
+
+    /* each sorted element is one whole pair, hence the doubled element size */
+    qsort(pairs, N/leaf_N, 2 * sizeof(*pairs), ffts_compare_offsets);
+
+    for (k = 0; k < N/leaf_N; k++) {
+        result[k] = 2 * pairs[2*k + 1];
+    }
+
+    free(pairs);
+    return result;
+}
+
+#endif /* FFTS_PATTERNS_H */
--- a/lib/ffts/src/sequitur.h
+++ b/lib/ffts/src/sequitur.h
@ -0,0 +1,448 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ 	* Redistributions of source code must retain the above copyright
+ 		notice, this list of conditions and the following disclaimer.
+ 	* Redistributions in binary form must reproduce the above copyright
+ 		notice, this list of conditions and the following disclaimer in the
+ 		documentation and/or other materials provided with the distribution.
+ 	* Neither the name of the organization nor the
+	  names of its contributors may be used to endorse or promote products
+ 		derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+/*
+ * One symbol of a sequence or of a rule body; symbols form a doubly linked
+ * list through pPrev/pNext.
+ */
+typedef struct _sym_t {
+    /* symbol code: the functions in this file treat c >= 0 as a plain value
+     * (added to offset by seq_compute_lengths) and c < 0 as the id of the
+     * rule referenced through r */
+    int c;
+    struct _sym_t *pPrev, *pNext;
+    /* rule this symbol stands for, or NULL when it does not reference one */
+    struct _seq_rule_t *r;
+    /* position of the symbol; inside a rule body it is relative to the first
+     * symbol of that rule (see seq_grammer_insert_new_rule) */
+    int offset;
+} sym_t;
+
+/*
+ * One grammar rule. The first rule of the list holds the main sequence; the
+ * rules created by seq_extract_hierarchy() follow it through pNext.
+ */
+typedef struct _seq_rule_t {
+    /* rule id; seq_extract_hierarchy() hands out -2, -3, ... */
+    int c;
+    /* head of the rule's symbol list */
+    sym_t *ss;
+    /* rule list links; the functions in this file only ever follow and
+     * update pNext, pPrev is merely set to NULL by seq_init_rule() */
+    struct _seq_rule_t *pPrev, *pNext;
+    /* number of symbols referencing this rule; starts at 2 */
+    int count;
+    /* extent of the rule body, filled in by seq_compute_lengths() */
+    int length;
+} seq_rule_t;
+
+/*
+ * Append symbol s to the list whose head pointer is *ss.
+ *
+ * For an empty list s becomes the head and both of its links are cleared.
+ * For a non-empty list s is hooked behind the current last symbol and
+ * s->pPrev is set accordingly; s->pNext is left as the caller provided it,
+ * so it should already be NULL.
+ */
+void sym_tail_insert(sym_t **ss, sym_t *s)
+{
+    sym_t *last;
+
+    if (*ss == NULL) {
+        s->pPrev = NULL;
+        s->pNext = NULL;
+        *ss = s;
+        return;
+    }
+
+    last = *ss;
+    while (last->pNext) {
+        last = last->pNext;
+    }
+
+    s->pPrev = last;
+    last->pNext = s;
+}
+
+/*
+ * Allocate a stand-alone symbol with code c.
+ *
+ * The new symbol is unlinked (pPrev == pNext == NULL), references no rule
+ * and has offset 0. Returns NULL when the allocation fails; the caller owns
+ * the returned symbol.
+ */
+sym_t* sym_init(int c)
+{
+    sym_t *s;
+
+    s = (sym_t*) malloc(sizeof(*s));
+    if (!s) {
+        return NULL;
+    }
+
+    s->c = c;
+    s->pPrev = s->pNext = NULL;
+    s->r = NULL;
+
+    /* malloc() leaves the field indeterminate, and sym_init_from_sym() and
+     * the digram matching read it, so give it a defined value */
+    s->offset = 0;
+
+    return s;
+}
+
+/*
+ * Allocate an unlinked copy of symbol s2.
+ *
+ * Code, rule reference and offset are taken from s2; the list links of the
+ * copy are NULL. Returns NULL when the allocation fails.
+ */
+sym_t* sym_init_from_sym(sym_t *s2)
+{
+    sym_t *dup = (sym_t*) malloc(sizeof(*dup));
+
+    if (dup == NULL) {
+        return NULL;
+    }
+
+    /* copy every field, then detach the copy from s2's neighbours */
+    *dup = *s2;
+    dup->pPrev = NULL;
+    dup->pNext = NULL;
+
+    return dup;
+}
+
+/*
+ * Allocate an empty, unlinked rule with id c.
+ *
+ * The reference count starts at 2 (a rule is created for a digram that
+ * occurs twice), the symbol list is empty and the length is 0 until
+ * seq_compute_lengths() fills it in. Returns NULL when the allocation fails.
+ */
+seq_rule_t* seq_init_rule(int c)
+{
+    seq_rule_t *G;
+
+    G = (seq_rule_t *)malloc(sizeof(*G));
+    if (!G) {
+        return NULL;
+    }
+
+    G->c = c;
+    G->count = 2;
+    G->ss = NULL;
+    G->pPrev = NULL;
+    G->pNext = NULL;
+
+    /* seq_compute_lengths() can read the length of a rule that sits later in
+     * the list than the rule referencing it; never leave it indeterminate */
+    G->length = 0;
+
+    return G;
+}
+
+/*
+ * Create the rule "r -> a b" and append it to the rule list starting at G.
+ *
+ * G     any rule of the list; the new rule is linked behind the last one
+ * r     id of the new rule. This is an int on purpose: ids are negative and
+ *       unbounded, and a plain char is unsigned on several targets (ARM
+ *       among them) and at best covers -128..127
+ * a, b  the digram; both symbols are copied, not moved. In the copy the
+ *       offset of b is made relative to a and the offset of a becomes 0
+ *
+ * Returns the new rule, or NULL when an allocation fails, in which case the
+ * rule list is left untouched.
+ */
+seq_rule_t* seq_grammer_insert_new_rule(seq_rule_t *G, int r, sym_t *a, sym_t *b)
+{
+    seq_rule_t *rule;
+    sym_t *sa, *sb;
+
+    rule = seq_init_rule(r);
+    if (!rule) {
+        return NULL;
+    }
+
+    sa = sym_init_from_sym(a);
+    if (!sa) {
+        goto cleanup_rule;
+    }
+
+    sb = sym_init_from_sym(b);
+    if (!sb) {
+        goto cleanup_sa;
+    }
+
+    sb->offset = sb->offset - sa->offset;
+    sa->offset = 0;
+    sym_tail_insert(&rule->ss, sa);
+    sym_tail_insert(&rule->ss, sb);
+
+    /* link the rule only once it is complete, so a failure above never
+     * leaves a half-built rule reachable from the list */
+    while (G->pNext) {
+        G = G->pNext;
+    }
+
+    G->pNext = rule;
+    return rule;
+
+cleanup_sa:
+    free(sa);
+
+cleanup_rule:
+    free(rule);
+
+    return NULL;
+}
+
+/*
+ * Search the symbols from s up to, but not including, term for a digram
+ * equal to (a, b): same two codes and the same offset difference.
+ *
+ * Returns the first symbol of the earliest match, or NULL when there is
+ * none. The search also stops at the end of the list, so a term that is NULL
+ * or not part of the list cannot run the scan off the end.
+ */
+sym_t* sym_match_digram(sym_t *s, sym_t *term, sym_t *a, sym_t *b)
+{
+    while (s && s != term) {
+        sym_t *next = s->pNext;
+
+        /* the last symbol of a list has no successor and starts no digram */
+        if (next && s->c == a->c && next->c == b->c &&
+                next->offset - s->offset == b->offset - a->offset) {
+            return s;
+        }
+
+        s = next;
+    }
+
+    return NULL;
+}
+
+/*
+ * Search the rule list starting at R for a rule whose body begins with the
+ * digram (a, b): same two codes and the same offset difference.
+ *
+ * Rules with fewer than two symbols cannot match and are skipped. Returns
+ * the first matching rule, or NULL when there is none.
+ */
+seq_rule_t* seq_match_digram(seq_rule_t *R, sym_t *a, sym_t *b)
+{
+    for (; R; R = R->pNext) {
+        sym_t *first = R->ss;
+        sym_t *second = first ? first->pNext : NULL;
+
+        if (!second) {
+            continue;
+        }
+
+        if (first->c == a->c && second->c == b->c &&
+                second->offset - first->offset == b->offset - a->offset) {
+            return R;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Return the last symbol of the list containing s, found by following pNext.
+ * Returns NULL for an empty list (s == NULL).
+ */
+sym_t* sym_tail(sym_t *s)
+{
+    if (!s) {
+        return NULL;
+    }
+
+    while (s->pNext) {
+        s = s->pNext;
+    }
+
+    return s;
+}
+
+/*
+ * Count the symbols from s to the end of its list; 0 for an empty list.
+ */
+int sym_count(sym_t *s)
+{
+    int n;
+
+    for (n = 0; s != NULL; s = s->pNext) {
+        n++;
+    }
+
+    return n;
+}
+
+/*
+ * Duplicate the symbol list starting at s.
+ *
+ * Returns the head of a new, independently linked list whose symbols carry
+ * the same code, rule reference and offset as the originals. Returns NULL
+ * for an empty input list and also when an allocation fails; in the latter
+ * case every symbol copied so far is released again.
+ */
+sym_t* sym_copylist(sym_t *s)
+{
+    sym_t *head = NULL;
+    sym_t *tail = NULL;
+
+    for (; s; s = s->pNext) {
+        sym_t *copy = sym_init_from_sym(s);
+
+        if (!copy) {
+            /* do not leak the partial copy */
+            while (head) {
+                sym_t *next = head->pNext;
+
+                free(head);
+                head = next;
+            }
+
+            return NULL;
+        }
+
+        copy->pPrev = tail;
+
+        if (tail) {
+            tail->pNext = copy;
+        } else {
+            head = copy;
+        }
+
+        tail = copy;
+    }
+
+    return head;
+}
+
+/*
+ * Remove rules that are referenced at most once.
+ *
+ * Walks the rule list from G. In G itself and in every rule with a count
+ * above 1, each symbol that references a rule with count == 1 is replaced
+ * in place by that rule's own symbol list (moved, not copied), with the
+ * symbol's offset added to every spliced symbol. Every rule other than G
+ * whose count is not above 1 is unlinked from the rule list.
+ *
+ * NOTE(review): neither the replaced symbols nor the unlinked rules are
+ * freed here -- confirm whether that is intentional.
+ */
+void seq_enforce_uniqueness(seq_rule_t *G)
+{
+    seq_rule_t *R = G;//->pNext;
+    /* ppr is the link that has to be rewritten to drop the current rule */
+    seq_rule_t **ppr = &G->pNext;
+
+    while (R) {
+        if (R == G || R->count > 1) {
+            sym_t *s = R->ss;
+            /* pp is the link (list head or previous pNext) pointing at s */
+            sym_t **pp = &R->ss;
+
+            while (s) {
+                if (s->r && s->r->count == 1) {
+                    sym_t *temp_itr;
+
+                    *pp = s->r->ss;
+
+                    /* rebase the rule body from rule-relative offsets to the
+                     * position of the symbol it replaces */
+                    temp_itr = s->r->ss;
+                    while (temp_itr) {
+                        temp_itr->offset += s->offset;
+                        temp_itr = temp_itr->pNext;
+                    }
+
+                    s->r->ss->pPrev = s->pPrev;
+                    if (s->pNext) {
+                        s->pNext->pPrev = sym_tail(s->r->ss);
+                    }
+
+                    sym_tail(s->r->ss)->pNext = s->pNext;
+                    /* rescan from the start of the spliced body so nested
+                     * single-use rules are expanded as well; pp still
+                     * addresses the link that now points at it */
+                    s = s->r->ss;
+                    continue;
+                }
+
+                pp = &s->pNext;
+                s = s->pNext;
+            }
+
+            ppr = &R->pNext;
+        } else {
+            /* unlink R; its own pNext stays intact, so the walk continues */
+            *ppr = R->pNext;
+        }
+
+        R = R->pNext;
+    }
+}
+
+/*
+ * Expand rule references inside short rules.
+ *
+ * In every rule of the list starting at G whose body has at most thresh
+ * symbols, each symbol that references a rule is replaced by a copy of that
+ * rule's body (offsets rebased onto the replaced symbol) and the referenced
+ * rule's count is decremented. The scan resumes behind the spliced copy, so
+ * all references of the original body are processed. Afterwards rules left
+ * with at most one reference are removed via seq_enforce_uniqueness().
+ *
+ * When a copy cannot be made (allocation failure or an empty referenced
+ * rule) the function returns early without touching the reference count and
+ * without the final clean-up pass.
+ */
+void seq_merge_small_rules(seq_rule_t *G, int thresh)
+{
+    seq_rule_t *R;
+
+    for (R = G; R; R = R->pNext) {
+        sym_t *s;
+        sym_t **pp;
+
+        if (sym_count(R->ss) > thresh) {
+            continue;
+        }
+
+        s = R->ss;
+        pp = &R->ss;
+
+        while (s) {
+            sym_t *copylist;
+            sym_t *copy_tail;
+            sym_t *itr;
+
+            if (!s->r) {
+                pp = &s->pNext;
+                s = s->pNext;
+                continue;
+            }
+
+            copylist = sym_copylist(s->r->ss);
+            if (!copylist) {
+                return;
+            }
+
+            /* only drop the reference once the replacement really exists */
+            s->r->count--;
+
+            for (itr = copylist; itr; itr = itr->pNext) {
+                itr->offset += s->offset;
+            }
+
+            /* Find the end of the copy BEFORE it is linked to the rest of
+             * the rule: afterwards sym_tail() would walk on to the end of
+             * the whole rule and the scan would stop after one expansion. */
+            copy_tail = sym_tail(copylist);
+
+            *pp = copylist;
+            copylist->pPrev = s->pPrev;
+            copy_tail->pNext = s->pNext;
+            if (s->pNext) {
+                s->pNext->pPrev = copy_tail;
+            }
+
+            /* continue with the symbol that followed the replaced one */
+            pp = &copy_tail->pNext;
+            s = copy_tail->pNext;
+        }
+    }
+
+    seq_enforce_uniqueness(G);
+}
+
+/*
+ * Build a rule hierarchy for the symbol sequence held in G->ss.
+ *
+ * The sequence is scanned left to right. For the digram ending at the
+ * cursor (only considered once it has at least two predecessors):
+ *   - if an existing rule starts with the same digram, the digram is
+ *     replaced by a reference to that rule;
+ *   - if the same digram occurs earlier in the sequence, a new rule is
+ *     created (ids -2, -3, ...) and both occurrences are replaced by
+ *     references to it.
+ * After a replacement the cursor stays on the merged symbol so the digram
+ * it now ends is examined too. Finally single-use rules are removed and
+ * rules of at most two symbols are merged.
+ *
+ * Returns early, leaving the grammar as built so far, when a new rule
+ * cannot be allocated.
+ */
+void seq_extract_hierarchy(seq_rule_t *G)
+{
+    int next_rule = -2;
+    sym_t *cursym;
+
+    if (!G) {
+        return;
+    }
+
+    cursym = G->ss;
+
+    while (cursym) {
+        sym_t *m = NULL;
+        seq_rule_t *mr = NULL;
+
+        if (cursym->pPrev && cursym->pPrev->pPrev) {
+            mr = seq_match_digram(G->pNext, cursym->pPrev, cursym);
+            if (mr) {
+                if (cursym->pPrev->r) {
+                    cursym->pPrev->r->count--;
+                }
+
+                if (cursym->r) {
+                    cursym->r->count--;
+                }
+
+                mr->count++;
+
+                cursym->pPrev->r = mr;
+                cursym->pPrev->c = mr->c;
+                cursym->pPrev->pNext = cursym->pNext;
+
+                /* the digram may sit at the very end of the sequence */
+                if (cursym->pNext) {
+                    cursym->pNext->pPrev = cursym->pPrev;
+                }
+
+                cursym = cursym->pPrev;
+            }
+
+            /* The substitution above moves the cursor one symbol back, so
+             * it may no longer have two predecessors. Searching with a NULL
+             * terminator would scan the whole list and match the digram
+             * against itself. */
+            if (cursym->pPrev && cursym->pPrev->pPrev) {
+                m = sym_match_digram(G->ss, cursym->pPrev->pPrev,
+                                     cursym->pPrev, cursym);
+            }
+
+            if (m) {
+                seq_rule_t *newr;
+
+                if (cursym->pPrev->r) {
+                    cursym->pPrev->r->count--;
+                }
+
+                if (cursym->r) {
+                    cursym->r->count--;
+                }
+
+                newr = seq_grammer_insert_new_rule(G, next_rule, m, m->pNext);
+                if (!newr) {
+                    return;
+                }
+
+                /* collapse the earlier occurrence into a rule reference */
+                m->r = newr;
+                m->c = next_rule;
+                m->pNext = m->pNext->pNext;
+                if (m->pNext) {
+                    m->pNext->pPrev = m;
+                }
+
+                /* collapse the occurrence ending at the cursor */
+                cursym->pPrev->r = newr;
+                cursym->pPrev->c = next_rule;
+                cursym->pPrev->pNext = cursym->pNext;
+                if (cursym->pNext) {
+                    cursym->pNext->pPrev = cursym->pPrev;
+                }
+
+                cursym = cursym->pPrev;
+
+                next_rule--;
+            }
+        }
+
+        if (!m && !mr) {
+            cursym = cursym->pNext;
+        }
+    }
+
+    seq_enforce_uniqueness(G);
+    seq_merge_small_rules(G, 2);
+}
+
+/*
+ * Return the extent of a symbol list: the largest value of
+ * offset + size over all symbols, where size is the code itself for
+ * symbols with c >= 0 and the referenced rule's length for symbols with
+ * c < 0. Symbols with c < 0 but no rule attached are ignored. An empty list
+ * has extent 0.
+ */
+static int seq_symbol_list_extent(const sym_t *s)
+{
+    int extent = 0;
+
+    for (; s; s = s->pNext) {
+        int end;
+
+        if (s->c >= 0) {
+            end = s->offset + s->c;
+        } else if (s->r) {
+            end = s->offset + s->r->length;
+        } else {
+            continue;
+        }
+
+        if (end > extent) {
+            extent = end;
+        }
+    }
+
+    return extent;
+}
+
+/*
+ * Compute the length field of every rule in the list headed by G.
+ *
+ * The rules following G are processed in list order first, then G itself,
+ * so the length of a referenced rule is up to date only when that rule
+ * precedes its user in the list.
+ */
+void seq_compute_lengths(seq_rule_t *G)
+{
+    seq_rule_t *R;
+
+    if (!G) {
+        return;
+    }
+
+    for (R = G->pNext; R; R = R->pNext) {
+        R->length = seq_symbol_list_extent(R->ss);
+    }
+
+    G->length = seq_symbol_list_extent(G->ss);
+}
--- a/lib/ffts/src/sse.s
+++ b/lib/ffts/src/sse.s
@ -1,878 +0,0 @@
-/*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
-*/
-
-
-	.globl	_neon_x4
-	.align	4
-_neon_x4:
-
-	.globl _neon_x8
-	.align 4
-_neon_x8:
-
-	.globl _neon_x8_t
-	.align 4
-_neon_x8_t:
-
-
-#ifdef __APPLE__
-	.globl _leaf_ee_init
-_leaf_ee_init:
-#else
-	.globl leaf_ee_init
-leaf_ee_init:
-#endif
- 		#lea L_sse_constants(%rip), %r9 
-		movq 0xe0(%rdi), %r9
-		xorl	%eax, %eax
-# eax is loop counter (init to 0)
-# rcx is loop max count
-# rsi is 'in' base pointer
-# rdx is 'out' base pointer
-# r8 is offsets pointer
-# r9 is constants pointer
-# scratch: rax r11 r12
-#	.align	4, 0x90
-
-# _leaf_ee + 9 needs 16 byte alignment
-#ifdef __APPLE__
-	.globl _leaf_ee
-_leaf_ee:
-#else
-	.globl leaf_ee
-leaf_ee:
-#endif
-		movaps    32(%r9), %xmm0            #83.5
- 		movaps    (%r9), %xmm8            #83.5
-LEAF_EE_1:
-LEAF_EE_const_0:
-				movaps    0xFECA(%rsi,%rax,4), %xmm7                           #83.5
-LEAF_EE_const_2:
-        movaps    0xFECA(%rsi,%rax,4), %xmm12                         #83.5
-        movaps    %xmm7, %xmm6                                  #83.5
-LEAF_EE_const_3:
-        movaps    0xFECA(%rsi,%rax,4), %xmm10                         #83.5
-        movaps    %xmm12, %xmm11                                #83.5
-        subps     %xmm10, %xmm12                                #83.5
-        addps     %xmm10, %xmm11                                #83.5
-        xorps     %xmm8, %xmm12                                 #83.5
-LEAF_EE_const_1:
-        movaps    0xFECA(%rsi,%rax,4), %xmm9                          #83.5
-LEAF_EE_const_4:
-        movaps    0xFECA(%rsi,%rax,4), %xmm10                         #83.5
-        addps     %xmm9, %xmm6                                  #83.5
-        subps     %xmm9, %xmm7                                  #83.5
-LEAF_EE_const_5:
-        movaps    0xFECA(%rsi,%rax,4), %xmm13                         #83.5
-        movaps    %xmm10, %xmm9                                 #83.5
-LEAF_EE_const_6:
-        movaps    0xFECA(%rsi,%rax,4), %xmm3                          #83.5
-        movaps    %xmm6, %xmm5                                  #83.5
-LEAF_EE_const_7:
-        movaps    0xFECA(%rsi,%rax,4), %xmm14                          #83.5
-        movaps    %xmm3, %xmm15                                 #83.5
-        shufps    $177, %xmm12, %xmm12                          #83.5
-        movaps    %xmm7, %xmm4                                  #83.5
-        movslq    (%r8, %rax, 4), %r11                                   #83.44
-        subps     %xmm13, %xmm10                                #83.5
-        subps     %xmm14, %xmm3                                 #83.5
-        addps     %xmm11, %xmm5                                 #83.5
-        subps     %xmm11, %xmm6                                 #83.5
-        subps     %xmm12, %xmm4                                 #83.5
-        addps     %xmm12, %xmm7                                 #83.5
-        addps     %xmm13, %xmm9                                 #83.5
-        addps     %xmm14, %xmm15                                #83.5
-        movaps    16(%r9), %xmm12           #83.5
-        movaps    %xmm9, %xmm1                                  #83.5
-        movaps    16(%r9), %xmm11           #83.5
-        movaps    %xmm5, %xmm2                                  #83.5
-        mulps     %xmm10, %xmm12                                #83.5
-        subps     %xmm15, %xmm9                                 #83.5
-        addps     %xmm15, %xmm1                                 #83.5
-        mulps     %xmm3, %xmm11                                 #83.5
-        addps     %xmm1, %xmm2                                  #83.5
-        subps     %xmm1, %xmm5                                  #83.5
-        shufps    $177, %xmm10, %xmm10                          #83.5
-        xorps     %xmm8, %xmm9                                  #83.5
-        shufps    $177, %xmm3, %xmm3                            #83.5
-        movaps    %xmm6, %xmm1                                  #83.5
-        mulps     %xmm0, %xmm10                                 #83.5
-        movaps    %xmm4, %xmm13                                 #83.5
-        mulps     %xmm0, %xmm3                                  #83.5
-        subps     %xmm10, %xmm12                                #83.5
-        addps     %xmm3, %xmm11                                 #83.5
-        movaps    %xmm12, %xmm3                                 #83.5
-        movaps    %xmm7, %xmm14                                 #83.5
-        shufps    $177, %xmm9, %xmm9                            #83.5
-        subps     %xmm11, %xmm12                                #83.5
-        addps     %xmm11, %xmm3                                 #83.5
-        subps     %xmm9, %xmm1                                  #83.5
-        addps     %xmm9, %xmm6                                  #83.5
-        addps     %xmm3, %xmm4                                  #83.5
-        subps     %xmm3, %xmm13                                 #83.5
-        xorps     %xmm8, %xmm12                                 #83.5
-        movaps    %xmm2, %xmm3                                  #83.5
-        shufps    $177, %xmm12, %xmm12                          #83.5
-        movaps    %xmm6, %xmm9                                  #83.5
-        movslq    8(%r8, %rax, 4), %r12                                  #83.59
-        movlhps   %xmm4, %xmm3                                  #83.5
-				addq	    $4, %rax
-        shufps    $238, %xmm4, %xmm2                            #83.5
-        movaps    %xmm1, %xmm4                                  #83.5
-        #movntdq    %xmm3, (%rdx,%r11,4)                          #83.5
-        subps     %xmm12, %xmm7                                 #83.5
-        addps     %xmm12, %xmm14                                #83.5
-        movlhps   %xmm7, %xmm4                                  #83.5
-        shufps    $238, %xmm7, %xmm1                            #83.5
-        movaps    %xmm5, %xmm7                                  #83.5
-        movlhps   %xmm13, %xmm7                                 #83.5
-        movlhps   %xmm14, %xmm9                                 #83.5
-        shufps    $238, %xmm13, %xmm5                           #83.5
-        shufps    $238, %xmm14, %xmm6                           #83.5
-        movaps    %xmm3, (%rdx,%r11,4)                          #83.5
-        movaps    %xmm4, 16(%rdx,%r11,4)                        #83.5
-        movaps    %xmm7, 32(%rdx,%r11,4)                        #83.5
-        movaps    %xmm9, 48(%rdx,%r11,4)                        #83.5
-        movaps    %xmm2, (%rdx,%r12,4)                          #83.5
-        movaps    %xmm1, 16(%rdx,%r12,4)                        #83.5
-        movaps    %xmm5, 32(%rdx,%r12,4)                        #83.5
-        movaps    %xmm6, 48(%rdx,%r12,4)                        #83.5
-				cmpq	%rcx, %rax
-        jne      LEAF_EE_1 
-        
-
-
-# _leaf_oo + 4 needs to be 16 byte aligned
-#ifdef __APPLE__
-	.globl _leaf_oo
-_leaf_oo:
-#else
-	.globl leaf_oo
-leaf_oo:
-#endif
-        movaps    (%r9), %xmm5            #92.7
-LEAF_OO_1:
-LEAF_OO_const_0:
-        movaps    0xFECA(%rsi,%rax,4), %xmm4                           #93.5
-        movaps    %xmm4, %xmm6                                  #93.5
-LEAF_OO_const_1:
-        movaps    0xFECA(%rsi,%rax,4), %xmm7                          #93.5
-LEAF_OO_const_2:
-        movaps    0xFECA(%rsi,%rax,4), %xmm10                         #93.5
-        addps     %xmm7, %xmm6                                  #93.5
-        subps     %xmm7, %xmm4                                  #93.5
-LEAF_OO_const_3:
-        movaps    0xFECA(%rsi,%rax,4), %xmm8                          #93.5
-        movaps    %xmm10, %xmm9                                 #93.5
-LEAF_OO_const_4:
-        movaps    0xFECA(%rsi,%rax,4), %xmm1                          #93.5
-        movaps    %xmm6, %xmm3                                  #93.5
-LEAF_OO_const_5:
-        movaps    0xFECA(%rsi,%rax,4), %xmm11                         #93.5
-        movaps    %xmm1, %xmm2                                  #93.5
-LEAF_OO_const_6:
-        movaps    0xFECA(%rsi,%rax,4), %xmm14                         #93.5
-        movaps    %xmm4, %xmm15                                 #93.5
-LEAF_OO_const_7:
-        movaps    0xFECA(%rsi,%rax,4), %xmm12                          #93.5
-        movaps    %xmm14, %xmm13                                #93.5
-        movslq    (%r8, %rax, 4), %r11                                   #83.44
-        subps     %xmm8, %xmm10                                 #93.5
-        addps     %xmm8, %xmm9                                  #93.5
-        addps     %xmm11, %xmm2                                 #93.5
-        subps     %xmm12, %xmm14                                #93.5
-        subps     %xmm11, %xmm1                                 #93.5
-        addps     %xmm12, %xmm13                                #93.5
-        addps     %xmm9, %xmm3                                  #93.5
-        subps     %xmm9, %xmm6                                  #93.5
-        xorps     %xmm5, %xmm10                                 #93.5
-        xorps     %xmm5, %xmm14                                 #93.5
-        shufps    $177, %xmm10, %xmm10                          #93.5
-        movaps    %xmm2, %xmm9                                  #93.5
-        shufps    $177, %xmm14, %xmm14                          #93.5
-        movaps    %xmm6, %xmm7                                  #93.5
-        movslq    8(%r8, %rax, 4), %r12                                  #83.59
-        addq      $4, %rax                                          #92.18
-        addps     %xmm10, %xmm4                                 #93.5
-        addps     %xmm13, %xmm9                                 #93.5
-        subps     %xmm13, %xmm2                                 #93.5
-        subps     %xmm10, %xmm15                                #93.5
-        movaps    %xmm1, %xmm13                                 #93.5
-        movaps    %xmm2, %xmm8                                  #93.5
-        movlhps   %xmm4, %xmm7                                  #93.5
-        subps     %xmm14, %xmm13                                #93.5
-        addps     %xmm14, %xmm1                                 #93.5
-        shufps    $238, %xmm4, %xmm6                            #93.5
-        movaps    %xmm3, %xmm14                                 #93.5
-        movaps    %xmm9, %xmm4                                  #93.5
-        movlhps   %xmm15, %xmm14                                #93.5
-        movlhps   %xmm13, %xmm4                                 #93.5
-        movlhps   %xmm1, %xmm8                                  #93.5
-        shufps    $238, %xmm15, %xmm3                           #93.5
-        shufps    $238, %xmm13, %xmm9                           #93.5
-        shufps    $238, %xmm1, %xmm2                            #93.5
-        movaps    %xmm14, (%rdx,%r11,4)                         #93.5
-        movaps    %xmm7, 16(%rdx,%r11,4)                        #93.5
-        movaps    %xmm4, 32(%rdx,%r11,4)                        #93.5
-        movaps    %xmm8, 48(%rdx,%r11,4)                        #93.5
-        movaps    %xmm3, (%rdx,%r12,4)                          #93.5
-        movaps    %xmm6, 16(%rdx,%r12,4)                        #93.5
-        movaps    %xmm9, 32(%rdx,%r12,4)                        #93.5
-        movaps    %xmm2, 48(%rdx,%r12,4)                        #93.5
-				cmpq	%rcx, %rax
-        jne       LEAF_OO_1       # Prob 95%                      #92.14
-
-#ifdef __APPLE__
-	.globl _leaf_eo
-_leaf_eo:
-#else
-	.globl leaf_eo
-leaf_eo:
-#endif
-LEAF_EO_const_0:
-        movaps    0xFECA(%rsi,%rax,4), %xmm9                          #88.5
-LEAF_EO_const_2:
-        movaps    0xFECA(%rsi,%rax,4), %xmm7                          #88.5
-        movaps    %xmm9, %xmm11                                 #88.5
-LEAF_EO_const_3:
-        movaps    0xFECA(%rsi,%rax,4), %xmm5                           #88.5
-        movaps    %xmm7, %xmm6                                  #88.5
-LEAF_EO_const_1:
-        movaps    0xFECA(%rsi,%rax,4), %xmm4                          #88.5
-        subps     %xmm5, %xmm7                                  #88.5
-        addps     %xmm4, %xmm11                                 #88.5
-        subps     %xmm4, %xmm9                                  #88.5
-        addps     %xmm5, %xmm6                                  #88.5
-        movaps    (%r9), %xmm3            #88.5
-        movaps    %xmm11, %xmm10                                #88.5
-        xorps     %xmm3, %xmm7                                  #88.5
-        movaps    %xmm9, %xmm8                                  #88.5
-        shufps    $177, %xmm7, %xmm7                            #88.5
-        addps     %xmm6, %xmm10                                 #88.5
-        subps     %xmm6, %xmm11                                 #88.5
-        subps     %xmm7, %xmm8                                  #88.5
-        addps     %xmm7, %xmm9                                  #88.5
-        movslq    8(%r8, %rax, 4), %r12                                  #83.59
-        movaps    %xmm10, %xmm2                                 #88.5
-        movslq    (%r8, %rax, 4), %r11                                   #83.44
-        movaps    %xmm11, %xmm1                                 #88.5
-        shufps    $238, %xmm8, %xmm10                           #88.5
-        shufps    $238, %xmm9, %xmm11                           #88.5
-        movaps    %xmm10, (%rdx,%r12,4)                         #88.5
-        movaps    %xmm11, 16(%rdx,%r12,4)                       #88.5
-LEAF_EO_const_4:
-        movaps    0xFECA(%rsi,%rax,4), %xmm15                         #88.5
-LEAF_EO_const_5:
-        movaps    0xFECA(%rsi,%rax,4), %xmm12                         #88.5
-        movaps    %xmm15, %xmm14                                #88.5
-LEAF_EO_const_6:
-        movaps    0xFECA(%rsi,%rax,4), %xmm4                          #88.5
-        addps     %xmm12, %xmm14                                #88.5
-        subps     %xmm12, %xmm15                                #88.5
-LEAF_EO_const_7:
-        movaps    0xFECA(%rsi,%rax,4), %xmm13                         #88.5
-        movaps    %xmm4, %xmm5                                  #88.5
-        movaps    %xmm14, %xmm7                                 #88.5
-        addps     %xmm13, %xmm5                                 #88.5
-        subps     %xmm13, %xmm4                                 #88.5
-        movlhps   %xmm8, %xmm2                                  #88.5
-        movaps    %xmm5, %xmm8                                  #88.5
-        movlhps   %xmm15, %xmm7                                 #88.5
-        xorps     %xmm3, %xmm15                                 #88.5
-        movaps    %xmm5, %xmm6                                  #88.5
-        subps     %xmm14, %xmm5                                 #88.5
-        addps     %xmm14, %xmm6                                 #88.5
-        movlhps   %xmm9, %xmm1                                  #88.5
-        movaps    %xmm4, %xmm14                                 #88.5
-        movlhps   %xmm4, %xmm8                                  #88.5
-        movaps    %xmm1, %xmm12                                 #88.5
-        shufps    $177, %xmm15, %xmm15                          #88.5
-        movaps    0x30(%r9), %xmm11           #88.5
-        addq      $4, %rax                                       #90.5
-        subps     %xmm15, %xmm14                                #88.5
-        mulps     %xmm7, %xmm11                                 #88.5
-        addps     %xmm15, %xmm4                                 #88.5
-        movaps    0x30(%r9), %xmm9            #88.5
-        movaps    0x40(%r9), %xmm15           #88.5
-        shufps    $177, %xmm7, %xmm7                            #88.5
-        mulps     %xmm8, %xmm9                                  #88.5
-        mulps     %xmm15, %xmm7                                 #88.5
-        shufps    $177, %xmm8, %xmm8                            #88.5
-        subps     %xmm7, %xmm11                                 #88.5
-        mulps     %xmm15, %xmm8                                 #88.5
-        movaps    %xmm11, %xmm10                                #88.5
-        addps     %xmm8, %xmm9                                  #88.5
-        shufps    $238, %xmm14, %xmm6                           #88.5
-        subps     %xmm9, %xmm11                                 #88.5
-        addps     %xmm9, %xmm10                                 #88.5
-        xorps     %xmm3, %xmm11                                 #88.5
-        movaps    %xmm2, %xmm3                                  #88.5
-        shufps    $177, %xmm11, %xmm11                          #88.5
-        subps     %xmm10, %xmm3                                 #88.5
-        addps     %xmm10, %xmm2                                 #88.5
-        addps     %xmm11, %xmm12                                #88.5
-        subps     %xmm11, %xmm1                                 #88.5
-        shufps    $238, %xmm4, %xmm5                            #88.5
-        movaps    %xmm5, 48(%rdx,%r12,4)                        #88.5
-        movaps    %xmm6, 32(%rdx,%r12,4)                        #88.5
-        movaps    %xmm2, (%rdx,%r11,4)                          #88.5
-        movaps    %xmm1, 16(%rdx,%r11,4)                        #88.5
-        movaps    %xmm3, 32(%rdx,%r11,4)                        #88.5
-        movaps    %xmm12, 48(%rdx,%r11,4)                       #88.5
-	
-
-#ifdef __APPLE__
-	.globl _leaf_oe
-_leaf_oe:
-#else
-	.globl leaf_oe
-leaf_oe:
-#endif
-        movaps    (%r9), %xmm0           #59.5
-        #movaps    0x20(%r9), %xmm1           #59.5
-LEAF_OE_const_2:
-				movaps    0xFECA(%rsi,%rax,4), %xmm6                          #70.5
-LEAF_OE_const_3:
-        movaps    0xFECA(%rsi,%rax,4), %xmm8                           #70.5
-        movaps    %xmm6, %xmm10                                 #70.5
-        shufps    $228, %xmm8, %xmm10                           #70.5
-        movaps    %xmm10, %xmm9                                 #70.5
-        shufps    $228, %xmm6, %xmm8                            #70.5
-LEAF_OE_const_0:
-        movaps    0xFECA(%rsi,%rax,4), %xmm12                         #70.5
-LEAF_OE_const_1:
-        movaps    0xFECA(%rsi,%rax,4), %xmm7                          #70.5
-        movaps    %xmm12, %xmm14                                #70.5
-        movslq    (%r8, %rax, 4), %r11                                   #83.44
-        addps     %xmm8, %xmm9                                  #70.5
-        subps     %xmm8, %xmm10                                 #70.5
-        addps     %xmm7, %xmm14                                 #70.5
-        subps     %xmm7, %xmm12                                 #70.5
-        movaps    %xmm9, %xmm4                                  #70.5
-        movaps    %xmm14, %xmm13                                #70.5
-        shufps    $238, %xmm10, %xmm4                           #70.5
-        xorps     %xmm0, %xmm10                                 #70.5
-        shufps    $177, %xmm10, %xmm10                          #70.5
-        movaps    %xmm12, %xmm11                                #70.5
-        movaps    %xmm14, %xmm5                                 #70.5
-        addps     %xmm9, %xmm13                                 #70.5
-        subps     %xmm10, %xmm11                                #70.5
-        subps     %xmm9, %xmm14                                 #70.5
-        shufps    $238, %xmm12, %xmm5                           #70.5
-        addps     %xmm10, %xmm12                                #70.5
-        movslq    8(%r8, %rax, 4), %r12                                  #83.59
-        movlhps   %xmm11, %xmm13                                #70.5
-        movaps    %xmm13, (%rdx,%r11,4)                         #70.5
-        movaps    0x30(%r9), %xmm13          #70.5
-        movlhps   %xmm12, %xmm14                                #70.5
-        movaps    0x40(%r9), %xmm12          #70.5
-        mulps     %xmm5, %xmm13                                 #70.5
-        shufps    $177, %xmm5, %xmm5                            #70.5
-        mulps     %xmm12, %xmm5                                 #70.5
-        movaps    %xmm14, 16(%rdx,%r11,4)                       #70.5
-        subps     %xmm5, %xmm13                                 #70.5
-        movaps    0x30(%r9), %xmm5           #70.5
-        mulps     %xmm4, %xmm5                                  #70.5
-        shufps    $177, %xmm4, %xmm4                            #70.5
-        mulps     %xmm12, %xmm4                                 #70.5
-LEAF_OE_const_4:
-        movaps    0xFECA(%rsi,%rax,4), %xmm9                          #70.5
-        addps     %xmm4, %xmm5                                  #70.5
-LEAF_OE_const_6:
-        movaps    0xFECA(%rsi,%rax,4), %xmm7                          #70.5
-        movaps    %xmm9, %xmm3                                  #70.5
-LEAF_OE_const_7:
-        movaps    0xFECA(%rsi,%rax,4), %xmm2                          #70.5
-        movaps    %xmm7, %xmm6                                  #70.5
-LEAF_OE_const_5:
-        movaps    0xFECA(%rsi,%rax,4), %xmm15                         #70.5
-        movaps    %xmm13, %xmm4                                 #70.5
-        subps     %xmm2, %xmm7                                  #70.5
-        addps     %xmm15, %xmm3                                 #70.5
-        subps     %xmm15, %xmm9                                 #70.5
-        addps     %xmm2, %xmm6                                  #70.5
-        subps     %xmm5, %xmm13                                 #70.5
-        addps     %xmm5, %xmm4                                  #70.5
-        xorps     %xmm0, %xmm7                                  #70.5
-        addq      $4, %rax                                       #72.5
-        movaps    %xmm3, %xmm2                                  #70.5
-        shufps    $177, %xmm7, %xmm7                            #70.5
-        movaps    %xmm9, %xmm8                                  #70.5
-        xorps     %xmm0, %xmm13                                 #70.5
-        addps     %xmm6, %xmm2                                  #70.5
-        subps     %xmm7, %xmm8                                  #70.5
-        subps     %xmm6, %xmm3                                  #70.5
-        addps     %xmm7, %xmm9                                  #70.5
-        movaps    %xmm2, %xmm10                                 #70.5
-        movaps    %xmm3, %xmm11                                 #70.5
-        shufps    $238, %xmm8, %xmm2                            #70.5
-        shufps    $238, %xmm9, %xmm3                            #70.5
-        movaps    %xmm2, %xmm14                                 #70.5
-        shufps    $177, %xmm13, %xmm13                          #70.5
-        subps     %xmm4, %xmm14                                 #70.5
-        addps     %xmm4, %xmm2                                  #70.5
-        movaps    %xmm3, %xmm4                                  #70.5
-        subps     %xmm13, %xmm3                                 #70.5
-        addps     %xmm13, %xmm4                                 #70.5
-        movlhps   %xmm8, %xmm10                                 #70.5
-        movlhps   %xmm9, %xmm11                                 #70.5
-        movaps    %xmm10, 32(%rdx,%r11,4)                       #70.5
-        movaps    %xmm11, 48(%rdx,%r11,4)                       #70.5
-        movaps    %xmm2, (%rdx,%r12,4)                          #70.5
-        movaps    %xmm3, 16(%rdx,%r12,4)                        #70.5
-        movaps    %xmm14, 32(%rdx,%r12,4)                       #70.5
-        movaps    %xmm4, 48(%rdx,%r12,4)                        #70.5
-	
-	
-#ifdef __APPLE__
-	.globl	_leaf_end
-_leaf_end:
-#else
-	.globl	leaf_end
-leaf_end:
-#endif
-
-#ifdef __APPLE__
-	.globl	_x_init
-_x_init:
-#else
-	.globl	x_init
-x_init:
-#endif
-        #movaps    L_sse_constants(%rip), %xmm3           #34.3
-				movaps   (%r9), %xmm3           #34.3
-				movq        0x20(%rdi),%r8
-#ifdef __APPLE__
-	.globl	_x4
-_x4:
-#else	
-	.globl	x4
-x4:
-#endif
-        movaps    64(%rdx), %xmm0                               #34.3
-        movaps    96(%rdx), %xmm1                               #34.3
-        movaps    (%rdx), %xmm7                                 #34.3
-        movaps    (%r8), %xmm4      #const
-        movaps    %xmm7, %xmm9                                  #34.3
-        movaps    %xmm4, %xmm6                                  #34.3
-        movaps    16(%r8), %xmm2      #const
-        mulps     %xmm0, %xmm6                                  #34.3
-        mulps     %xmm1, %xmm4                                  #34.3
-        shufps    $177, %xmm0, %xmm0                            #34.3
-        shufps    $177, %xmm1, %xmm1                            #34.3
-        mulps     %xmm2, %xmm0                                  #34.3
-        mulps     %xmm1, %xmm2                                  #34.3
-        subps     %xmm0, %xmm6                                  #34.3
-        addps     %xmm2, %xmm4                                  #34.3
-        movaps    %xmm6, %xmm5                                  #34.3
-        subps     %xmm4, %xmm6                                  #34.3
-        addps     %xmm4, %xmm5                                  #34.3
-        movaps    32(%rdx), %xmm8                               #34.3
-        xorps     %xmm3, %xmm6                                  #34.3
-        shufps    $177, %xmm6, %xmm6                            #34.3
-        movaps    %xmm8, %xmm10                                 #34.3
-        movaps    112(%rdx), %xmm12                             #34.3
-        subps     %xmm5, %xmm9                                  #34.3
-        addps     %xmm5, %xmm7                                  #34.3
-        addps     %xmm6, %xmm10                                 #34.3
-        subps     %xmm6, %xmm8                                  #34.3
-        movaps    %xmm7, (%rdx)                                 #34.3
-        movaps    %xmm8, 32(%rdx)                               #34.3
-        movaps    %xmm9, 64(%rdx)                               #34.3
-        movaps    %xmm10, 96(%rdx)                              #34.3
-        movaps    32(%r8), %xmm14    #const                          #34.3
-        movaps    80(%rdx), %xmm11                              #34.3
-        movaps    %xmm14, %xmm0                                 #34.3
-        movaps    48(%r8), %xmm13    #const                          #34.3
-        mulps     %xmm11, %xmm0                                 #34.3
-        mulps     %xmm12, %xmm14                                #34.3
-        shufps    $177, %xmm11, %xmm11                          #34.3
-        shufps    $177, %xmm12, %xmm12                          #34.3
-        mulps     %xmm13, %xmm11                                #34.3
-        mulps     %xmm12, %xmm13                                #34.3
-        subps     %xmm11, %xmm0                                 #34.3
-        addps     %xmm13, %xmm14                                #34.3
-        movaps    %xmm0, %xmm15                                 #34.3
-        subps     %xmm14, %xmm0                                 #34.3
-        addps     %xmm14, %xmm15                                #34.3
-        xorps     %xmm3, %xmm0                                  #34.3
-        movaps    16(%rdx), %xmm1                               #34.3
-        movaps    48(%rdx), %xmm2                               #34.3
-        movaps    %xmm1, %xmm4                                  #34.3
-        shufps    $177, %xmm0, %xmm0                            #34.3
-        movaps    %xmm2, %xmm5                                  #34.3
-        addps     %xmm15, %xmm1                                 #34.3
-        subps     %xmm0, %xmm2                                  #34.3
-        subps     %xmm15, %xmm4                                 #34.3
-        addps     %xmm0, %xmm5                                  #34.3
-        movaps    %xmm1, 16(%rdx)                               #34.3
-        movaps    %xmm2, 48(%rdx)                               #34.3
-        movaps    %xmm4, 80(%rdx)                               #34.3
-        movaps    %xmm5, 112(%rdx)                              #34.3
-				ret	
-	
-# _x8_soft + 5 needs to be 16 byte aligned
-#ifdef __APPLE__
-	.globl	_x8_soft
-_x8_soft:
-#else
-	.globl	x8_soft
-x8_soft:
-#endif
-	xorl %eax, %eax
-				movq      %rdx, %rbx     
-        movq      %r8, %rsi
-				leaq       (%rdx,%rcx,4), %r9  
-        leaq       (%r9,%rcx,4), %r10 
-        leaq       (%r10,%rcx,4), %r11 
-        leaq       (%r11,%rcx,4), %r12
-        leaq       (%r12,%rcx,4), %r13
-        leaq       (%r13,%rcx,4), %r14
-        leaq       (%r14,%rcx,4), %r15
-X8_soft_loop:   
-        movaps    (%rsi), %xmm9       
-        movaps    (%r10,%rax,4), %xmm6 
-        movaps    %xmm9, %xmm11        
-        movaps    (%r11,%rax,4), %xmm7 
-        movaps    16(%rsi), %xmm8      
-        mulps     %xmm6, %xmm11      
-        mulps     %xmm7, %xmm9       
-        shufps    $177, %xmm6, %xmm6 
-        mulps     %xmm8, %xmm6       
-        shufps    $177, %xmm7, %xmm7 
-        subps     %xmm6, %xmm11   
-        mulps     %xmm7, %xmm8     
-        movaps    %xmm11, %xmm10    
-        addps     %xmm8, %xmm9       
-        movaps    32(%rsi), %xmm15    
-        addps     %xmm9, %xmm10        
-        subps     %xmm9, %xmm11        
-        movaps    (%rbx,%rax,4), %xmm5 
-        movaps    %xmm15, %xmm6        
-        movaps    (%r12,%rax,4), %xmm12
-        movaps    %xmm5, %xmm2         
-        movaps    (%r14,%rax,4), %xmm13
-        xorps     %xmm3, %xmm11     #const   
-        movaps    48(%rsi), %xmm14     
-        subps     %xmm10, %xmm2        
-        mulps     %xmm12, %xmm6        
-        addps     %xmm10, %xmm5        
-        mulps     %xmm13, %xmm15       
-        movaps    64(%rsi), %xmm10     
-        movaps    %xmm5, %xmm0         
-        shufps    $177, %xmm12, %xmm12 
-        shufps    $177, %xmm13, %xmm13 
-        mulps     %xmm14, %xmm12       
-        mulps     %xmm13, %xmm14       
-        subps     %xmm12, %xmm6        
-        addps     %xmm14, %xmm15       
-        movaps    (%r13,%rax,4), %xmm7  
-        movaps    %xmm10, %xmm13         
-        movaps    (%r15,%rax,4), %xmm8    
-        movaps    %xmm6, %xmm12      
-        movaps    80(%rsi), %xmm9     
-        addq      $96, %rsi           
-        mulps     %xmm7, %xmm13      
-        subps     %xmm15, %xmm6      
-        addps     %xmm15, %xmm12     
-        mulps     %xmm8, %xmm10      
-        subps     %xmm12, %xmm0          
-        addps     %xmm12, %xmm5          
-        shufps    $177, %xmm7, %xmm7     
-        xorps     %xmm3, %xmm6   #const        
-        shufps    $177, %xmm8, %xmm8     
-        movaps    %xmm2, %xmm12          
-        mulps     %xmm9, %xmm7           
-        mulps     %xmm8, %xmm9          
-        subps     %xmm7, %xmm13        
-        addps     %xmm9, %xmm10       
-        movaps    (%r9,%rax,4), %xmm4        
-        shufps    $177, %xmm11, %xmm11       
-        movaps    %xmm4, %xmm1              
-        shufps    $177, %xmm6, %xmm6       
-        addps     %xmm11, %xmm1           
-        subps     %xmm11, %xmm4          
-        addps     %xmm6, %xmm12         
-        subps     %xmm6, %xmm2         
-        movaps    %xmm13, %xmm11      
-        movaps    %xmm4, %xmm14      
-        movaps    %xmm1, %xmm6      
-        subps     %xmm10, %xmm13   
-        addps     %xmm10, %xmm11  
-        xorps     %xmm3, %xmm13  #const  
-        addps     %xmm11, %xmm4                                 
-        subps     %xmm11, %xmm14                                
-        shufps    $177, %xmm13, %xmm13                          
-        movaps    %xmm5, (%rbx,%rax,4)                                 
-        movaps    %xmm4, (%r9,%rax,4)                                  
-        movaps    %xmm2, (%r10,%rax,4)                                 
-        subps     %xmm13, %xmm1                                 
-        addps     %xmm13, %xmm6                                 
-        movaps    %xmm1, (%r11,%rax,4)                                  
-        movaps    %xmm0, (%r12,%rax,4)                                 
-        movaps    %xmm14, (%r13,%rax,4)                                
-        movaps    %xmm12, (%r14,%rax,4)                                
-        movaps    %xmm6, (%r15,%rax,4)                                 
-        addq      $4, %rax   
-				cmpq	%rcx, %rax
-        jne       X8_soft_loop
-				ret
-
-#ifdef __APPLE__
-	.globl	_x8_hard
-_x8_hard:
-#else
-	.globl	x8_hard
-x8_hard:
-#endif
-        movaps    (%r9), %xmm5           
-X8_loop:  
-        movaps    (%r8), %xmm9                                 
-X8_const_2:
-        movaps    0xFECA(%rdx,%rax,4), %xmm6  
-        movaps    %xmm9, %xmm11                                 
-X8_const_3:
-        movaps    0xFECA(%rdx,%rax,4), %xmm7  
-        movaps    16(%r8), %xmm8                               
-        mulps     %xmm6, %xmm11                                 
-        mulps     %xmm7, %xmm9                                  
-        shufps    $177, %xmm6, %xmm6                            
-        mulps     %xmm8, %xmm6                                  
-        shufps    $177, %xmm7, %xmm7                            
-        subps     %xmm6, %xmm11                                 
-        mulps     %xmm7, %xmm8                                  
-        movaps    %xmm11, %xmm10                                
-        addps     %xmm8, %xmm9                                  
-        movaps    32(%r8), %xmm15                              
-        addps     %xmm9, %xmm10                                 
-        subps     %xmm9, %xmm11                                 
-X8_const_0:
-        movaps    0xFECA(%rdx,%rax,4), %xmm3     
-        movaps    %xmm15, %xmm6                                 
-X8_const_4:
-        movaps    0xFECA(%rdx,%rax,4), %xmm12
-        movaps    %xmm3, %xmm2                                  
-X8_const_6:
-        movaps    0xFECA(%rdx,%rax,4), %xmm13
-        xorps     %xmm5, %xmm11                                 
-        movaps    48(%r8), %xmm14                              
-        subps     %xmm10, %xmm2                                 
-        mulps     %xmm12, %xmm6                                 
-        addps     %xmm10, %xmm3                                 
-        mulps     %xmm13, %xmm15                                
-        movaps    64(%r8), %xmm10                              
-        movaps    %xmm3, %xmm0                                  
-        shufps    $177, %xmm12, %xmm12                          
-        shufps    $177, %xmm13, %xmm13                          
-        mulps     %xmm14, %xmm12                                
-        mulps     %xmm13, %xmm14                                
-        subps     %xmm12, %xmm6                                 
-        addps     %xmm14, %xmm15                                
-X8_const_5:
-        movaps    0xFECA(%rdx,%rax,4), %xmm7
-        movaps    %xmm10, %xmm13                                
-X8_const_7:
-        movaps    0xFECA(%rdx,%rax,4), %xmm8
-        movaps    %xmm6, %xmm12                                 
-        movaps    80(%r8), %xmm9                               
-        addq      $96, %r8                                     
-        mulps     %xmm7, %xmm13                                 
-        subps     %xmm15, %xmm6                                 
-        addps     %xmm15, %xmm12                                
-        mulps     %xmm8, %xmm10                                 
-        subps     %xmm12, %xmm0                                 
-        addps     %xmm12, %xmm3                                 
-        shufps    $177, %xmm7, %xmm7                            
-        xorps     %xmm5, %xmm6                                  
-        shufps    $177, %xmm8, %xmm8                            
-        movaps    %xmm2, %xmm12                                 
-        mulps     %xmm9, %xmm7                                  
-        mulps     %xmm8, %xmm9                                  
-        subps     %xmm7, %xmm13                                 
-        addps     %xmm9, %xmm10                                 
-X8_const_1:
-        movaps    0xFECA(%rdx,%rax,4), %xmm4   
-        shufps    $177, %xmm11, %xmm11                          
-        movaps    %xmm4, %xmm1                                  
-        shufps    $177, %xmm6, %xmm6                            
-        addps     %xmm11, %xmm1                                 
-        subps     %xmm11, %xmm4                                 
-        addps     %xmm6, %xmm12                                 
-        subps     %xmm6, %xmm2                                  
-        movaps    %xmm13, %xmm11                                
-        movaps    %xmm4, %xmm14                                 
-        movaps    %xmm1, %xmm6                                  
-        subps     %xmm10, %xmm13                                
-        addps     %xmm10, %xmm11                                
-        xorps     %xmm5, %xmm13                                 
-        addps     %xmm11, %xmm4                                 
-        subps     %xmm11, %xmm14                                
-        shufps    $177, %xmm13, %xmm13                          
-X8_const1_0:
-        movaps    %xmm3, 0xFECA(%rdx,%rax,4)
-X8_const1_1:
-        movaps    %xmm4, 0xFECA(%rdx,%rax,4)
-X8_const1_2:
-        movaps    %xmm2, 0xFECA(%rdx,%rax,4) 
-        subps     %xmm13, %xmm1                                 
-        addps     %xmm13, %xmm6                                 
-X8_const1_3:
-        movaps    %xmm1, 0xFECA(%rdx,%rax,4) 
-X8_const1_4:
-        movaps    %xmm0, 0xFECA(%rdx,%rax,4)
-X8_const1_5:
-        movaps    %xmm14, 0xFECA(%rdx,%rax,4)
-X8_const1_6:
-        movaps    %xmm12, 0xFECA(%rdx,%rax,4) 
-X8_const1_7:
-        movaps    %xmm6, 0xFECA(%rdx,%rax,4)
-        addq      $4, %rax   
-				cmpq	%rcx, %rax
-        jne       X8_loop
-
-#ifdef __APPLE__	
-	.globl _sse_leaf_ee_offsets
-	.globl _sse_leaf_oo_offsets
-	.globl _sse_leaf_eo_offsets
-	.globl _sse_leaf_oe_offsets
-	.align 4
-_sse_leaf_ee_offsets:
-	.long LEAF_EE_const_0-_leaf_ee+0x4
-	.long LEAF_EE_const_1-_leaf_ee+0x5
-	.long LEAF_EE_const_2-_leaf_ee+0x5
-	.long LEAF_EE_const_3-_leaf_ee+0x5
-	.long LEAF_EE_const_4-_leaf_ee+0x5
-	.long LEAF_EE_const_5-_leaf_ee+0x5
-	.long LEAF_EE_const_6-_leaf_ee+0x4
-	.long LEAF_EE_const_7-_leaf_ee+0x5
-_sse_leaf_oo_offsets:
-	.long LEAF_OO_const_0-_leaf_oo+0x4
-	.long LEAF_OO_const_1-_leaf_oo+0x4
-	.long LEAF_OO_const_2-_leaf_oo+0x5
-	.long LEAF_OO_const_3-_leaf_oo+0x5
-	.long LEAF_OO_const_4-_leaf_oo+0x4
-	.long LEAF_OO_const_5-_leaf_oo+0x5
-	.long LEAF_OO_const_6-_leaf_oo+0x5
-	.long LEAF_OO_const_7-_leaf_oo+0x5
-_sse_leaf_eo_offsets:
-	.long LEAF_EO_const_0-_leaf_eo+0x5
-	.long LEAF_EO_const_1-_leaf_eo+0x4
-	.long LEAF_EO_const_2-_leaf_eo+0x4
-	.long LEAF_EO_const_3-_leaf_eo+0x4
-	.long LEAF_EO_const_4-_leaf_eo+0x5
-	.long LEAF_EO_const_5-_leaf_eo+0x5
-	.long LEAF_EO_const_6-_leaf_eo+0x4
-	.long LEAF_EO_const_7-_leaf_eo+0x5
-_sse_leaf_oe_offsets:
-	.long LEAF_OE_const_0-_leaf_oe+0x5
-	.long LEAF_OE_const_1-_leaf_oe+0x4
-	.long LEAF_OE_const_2-_leaf_oe+0x4
-	.long LEAF_OE_const_3-_leaf_oe+0x5
-	.long LEAF_OE_const_4-_leaf_oe+0x5
-	.long LEAF_OE_const_5-_leaf_oe+0x5
-	.long LEAF_OE_const_6-_leaf_oe+0x4
-	.long LEAF_OE_const_7-_leaf_oe+0x4
-#else
-	.globl sse_leaf_ee_offsets
-	.globl sse_leaf_oo_offsets
-	.globl sse_leaf_eo_offsets
-	.globl sse_leaf_oe_offsets
-	.align 4
-sse_leaf_ee_offsets:
-	.long LEAF_EE_const_0-leaf_ee+0x4
-	.long LEAF_EE_const_1-leaf_ee+0x5
-	.long LEAF_EE_const_2-leaf_ee+0x5
-	.long LEAF_EE_const_3-leaf_ee+0x5
-	.long LEAF_EE_const_4-leaf_ee+0x5
-	.long LEAF_EE_const_5-leaf_ee+0x5
-	.long LEAF_EE_const_6-leaf_ee+0x4
-	.long LEAF_EE_const_7-leaf_ee+0x5
-sse_leaf_oo_offsets:
-	.long LEAF_OO_const_0-leaf_oo+0x4
-	.long LEAF_OO_const_1-leaf_oo+0x4
-	.long LEAF_OO_const_2-leaf_oo+0x5
-	.long LEAF_OO_const_3-leaf_oo+0x5
-	.long LEAF_OO_const_4-leaf_oo+0x4
-	.long LEAF_OO_const_5-leaf_oo+0x5
-	.long LEAF_OO_const_6-leaf_oo+0x5
-	.long LEAF_OO_const_7-leaf_oo+0x5
-sse_leaf_eo_offsets:
-	.long LEAF_EO_const_0-leaf_eo+0x5
-	.long LEAF_EO_const_1-leaf_eo+0x4
-	.long LEAF_EO_const_2-leaf_eo+0x4
-	.long LEAF_EO_const_3-leaf_eo+0x4
-	.long LEAF_EO_const_4-leaf_eo+0x5
-	.long LEAF_EO_const_5-leaf_eo+0x5
-	.long LEAF_EO_const_6-leaf_eo+0x4
-	.long LEAF_EO_const_7-leaf_eo+0x5
-sse_leaf_oe_offsets:
-	.long LEAF_OE_const_0-leaf_oe+0x5
-	.long LEAF_OE_const_1-leaf_oe+0x4
-	.long LEAF_OE_const_2-leaf_oe+0x4
-	.long LEAF_OE_const_3-leaf_oe+0x5
-	.long LEAF_OE_const_4-leaf_oe+0x5
-	.long LEAF_OE_const_5-leaf_oe+0x5
-	.long LEAF_OE_const_6-leaf_oe+0x4
-	.long LEAF_OE_const_7-leaf_oe+0x4
-#endif
-
-#ifdef __APPLE__
-	.data
-#else
-	.section .data
-#endif
-	.p2align 4
-#ifdef __APPLE__	
-	.globl _sse_constants
-_sse_constants:
-#else
-	.globl sse_constants
-sse_constants:
-#endif
-	.long	0x00000000,0x80000000,0x00000000,0x80000000
-	.long	0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
-	.long	0xbf3504f3,0x3f3504f3,0xbf3504f3,0x3f3504f3
-	.long	0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
-	.long	0x00000000,0x00000000,0xbf3504f3,0x3f3504f3
-#ifdef __APPLE__	
-	.globl _sse_constants_inv
-_sse_constants_inv:
-#else
-	.globl sse_constants_inv
-sse_constants_inv:
-#endif
-	.long	0x80000000,0x00000000,0x80000000,0x00000000
-	.long	0x3f3504f3,0x3f3504f3,0x3f3504f3,0x3f3504f3
-	.long	0x3f3504f3,0xbf3504f3,0x3f3504f3,0xbf3504f3
-	.long	0x3f800000,0x3f800000,0x3f3504f3,0x3f3504f3
-	.long	0x00000000,0x00000000,0x3f3504f3,0xbf3504f3
--- a/lib/ffts/src/types.h
+++ b/lib/ffts/src/types.h
@ -1,10 +1,10 @@
 /*
- 
+
 This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
+
 Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
+ Copyright (c) 2012, The University of Waikato
+
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
@ -31,19 +31,15 @@

 */

+#ifndef FFTS_TYPES_H
+#define FFTS_TYPES_H

-#ifndef __TYPES_H__
-#define __TYPES_H__
-
-#define __INLINE static inline __attribute__((always_inline))
-
-#if defined(complex)
-	typedef complex float cdata_t;
-#else
-	typedef float cdata_t[2];
-#endif
-	typedef float data_t;
-
+#if defined (_MSC_VER) && (_MSC_VER >= 1020)
+#pragma once
 #endif

+/* Define complex number as two element array */
+typedef float  ffts_cpx_32f[2];
+typedef double ffts_cpx_64f[2];

+#endif /* FFTS_TYPES_H */
--- a/lib/ffts/src/vfp.h
+++ b/lib/ffts/src/vfp.h
@ -43,3 +43,4 @@ void vfp_x8();
 void vfp_end();

 #endif
+// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
--- a/lib/ffts/src/vfp.s
+++ b/lib/ffts/src/vfp.s
@ -30,7 +30,7 @@
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 */
-
+	.fpu	vfp

@ assumes r0 = out 
@         r1 = in ? 
@ -41,7 +41,7 @@
@         r2 = const pointer
@       & lr = temps

-	.align 4
+	.align	4
 #ifdef __APPLE__
 	.globl	_vfp_e
 _vfp_e:
@ -50,44 +50,44 @@ _vfp_e:
 vfp_e:
 #endif
 _vfp_e_loop:
-	vldr	s15, [r2, #8]
-	vldr s2, [r3] @ x0 
-	vldr	s0, [r3, #4]
-	vldr s4, [r4] @ x1 
-	vldr	s11, [r2]
-	vldr s10, [r7] @ x4 
-	vldr	s3, [r7, #4]
-	vldr s8, [r8] @ x5 
-	vldr	s1, [r8, #4]
-	vldr s14, [r9] @ x6 
-	vldr	s9, [r9, #4]
-	vldr s6, [r10] @ x7 
-	vldr	s12, [r10, #4]
+	vldr		s15, [r2, #8]
+	vldr		s2, [r3] @ x0
+	vldr		s0, [r3, #4]
+	vldr		s4, [r4] @ x1
+	vldr		s11, [r2]
+	vldr		s10, [r7] @ x4
+	vldr		s3, [r7, #4]
+	vldr		s8, [r8] @ x5
+	vldr		s1, [r8, #4]
+	vldr		s14, [r9] @ x6
+	vldr		s9, [r9, #4]
+	vldr		s6, [r10] @ x7
+	vldr		s12, [r10, #4]
 	vsub.f32	s18, s3, s1
 	vsub.f32	s7, s10, s8
 	vsub.f32	s5, s14, s6
 	vadd.f32	s6, s14, s6
-	vldr	s24, [r5, #4]
+	vldr		s24, [r5, #4]
 	vsub.f32	s14, s9, s12
-	vldr	s22, [r6, #4]
+	vldr		s22, [r6, #4]
 	vadd.f32	s8, s10, s8
-	vldr s28, [r6] @ x3 
-	vldr s17, [r5] @ x2 
+	vldr		s28, [r6] @ x3
+	vldr		s17, [r5] @ x2
 	vadd.f32	s10, s9, s12
 	vmul.f32	s13, s18, s15
 	vmul.f32	s9, s7, s11
 	vmul.f32	s16, s5, s11
 	vmul.f32	s18, s18, s11
 	vmul.f32	s30, s14, s11
-	vldr	s11, [r4, #4]
-    add r3, r3, #8
-    add r4, r4, #8
-    add r5, r5, #8
-    add r6, r6, #8
-    add r7, r7, #8
-    add r8, r8, #8
-    add r9, r9, #8
-    add r10, r10, #8
+	vldr		s11, [r4, #4]
+	add			r3, r3, #8
+	add			r4, r4, #8
+	add			r5, r5, #8
+	add			r6, r6, #8
+	add			r7, r7, #8
+	add			r8, r8, #8
+	add			r9, r9, #8
+	add			r10, r10, #8
 	vmul.f32	s12, s5, s15
 	vmul.f32	s20, s14, s15
 	vadd.f32	s5, s2, s4
@ -111,7 +111,7 @@ _vfp_e_loop:
 	vsub.f32	s12, s30, s12
 	vadd.f32	s20, s3, s10
 	vsub.f32	s15, s3, s10
-	vsub.f32	s3, s26, s1      
+	vsub.f32	s3, s26, s1
 	vadd.f32	s18, s9, s13
 	vadd.f32	s10, s14, s4
 	vadd.f32	s6, s2, s7      @
@ -120,15 +120,15 @@ _vfp_e_loop:
 	vsub.f32	s4, s14, s4
 	vsub.f32	s8, s22, s16    @
 	vadd.f32	s1, s28, s12
-ldr lr, [r12], #4
-add lr, r0, lr, lsl #2
-subs	r11, r11, #1
-	vstr	s18, [lr]
+	ldr			lr, [r12], #4
+	add			lr, r0, lr, lsl #2
+	subs		r11, r11, #1
+	vstr		s18, [lr]
 	vsub.f32	s2, s28, s12
 	vadd.f32	s12, s22, s16   @
 	vsub.f32	s16, s3, s24    @
 	vsub.f32	s13, s9, s13
-	vstr	s26, [lr, #4]
+	vstr		s26, [lr, #4]
 	vadd.f32	s28, s5, s15    @
 	vsub.f32	s7, s5, s15     @
 	vadd.f32	s14, s6, s10
@ -136,26 +136,26 @@ subs	r11, r11, #1
 	vadd.f32	s9, s0, s2      @
 	vsub.f32	s2, s0, s2      @
 	vsub.f32	s11, s11, s20
-	vstr	s28, [lr, #16]
+	vstr		s28, [lr, #16]
 	vadd.f32	s3, s3, s24     @
-	vstr	s16, [lr, #20]
+	vstr		s16, [lr, #20]
 	vsub.f32	s6, s6, s10
-	vstr	s13, [lr, #32]
+	vstr		s13, [lr, #32]
 	vsub.f32	s13, s12, s4    @
 	vsub.f32	s8, s8, s1
 	vadd.f32	s0, s12, s4     @
-	vstr	s11, [lr, #36]
-	vstr	s7, [lr, #48]
-	vstr	s3, [lr, #52]
-	vstr	s14, [lr, #8]
-	vstr	s5, [lr, #12]
-	vstr	s9, [lr, #24]
-	vstr	s13, [lr, #28]
-	vstr	s6, [lr, #40]
-	vstr	s8, [lr, #44]
-	vstr	s2, [lr, #56]
-	vstr	s0, [lr, #60]
-	bne _vfp_e_loop
+	vstr		s11, [lr, #36]
+	vstr		s7, [lr, #48]
+	vstr		s3, [lr, #52]
+	vstr		s14, [lr, #8]
+	vstr		s5, [lr, #12]
+	vstr		s9, [lr, #24]
+	vstr		s13, [lr, #28]
+	vstr		s6, [lr, #40]
+	vstr		s8, [lr, #44]
+	vstr		s2, [lr, #56]
+	vstr		s0, [lr, #60]
+	bne			_vfp_e_loop

@ assumes r0 = out 
@         r1 = in ? 
@ -461,7 +461,6 @@ _vfp_x8_loop:
 	bne _vfp_x8_loop	
 	bx lr
 	
-	
 	.align 4
 #ifdef __APPLE__
 	.globl	_vfp_end
--- a/lib/ffts/tests/Makefile.in
+++ b/lib/ffts/tests/Makefile.in
@ -1,7 +1,7 @@
-# Makefile.in generated by automake 1.12.4 from Makefile.am.
+# Makefile.in generated by automake 1.14 from Makefile.am.
 # @configure_input@

-# Copyright (C) 1994-2012 Free Software Foundation, Inc.
+# Copyright (C) 1994-2013 Free Software Foundation, Inc.

 # This Makefile.in is free software; the Free Software Foundation
 # gives unlimited permission to copy and/or distribute it,
@ -15,23 +15,51 @@
@SET_MAKE@

 VPATH = @srcdir@
-am__make_dryrun = \
-  { \
-    am__dry=no; \
+am__is_gnu_make = test -n '$(MAKEFILE_LIST)' && test -n '$(MAKELEVEL)'
+am__make_running_with_option = \
+  case $${target_option-} in \
+      ?) ;; \
+      *) echo "am__make_running_with_option: internal error: invalid" \
+              "target option '$${target_option-}' specified" >&2; \
+         exit 1;; \
+  esac; \
+  has_opt=no; \
+  sane_makeflags=$$MAKEFLAGS; \
+  if $(am__is_gnu_make); then \
+    sane_makeflags=$$MFLAGS; \
+  else \
    case $$MAKEFLAGS in \
      *\\[\ \	]*) \
-        echo 'am--echo: ; @echo "AM"  OK' | $(MAKE) -f - 2>/dev/null \
-          | grep '^AM OK$$' >/dev/null || am__dry=yes;; \
-      *) \
-        for am__flg in $$MAKEFLAGS; do \
-          case $$am__flg in \
-            *=*|--*) ;; \
-            *n*) am__dry=yes; break;; \
-          esac; \
-        done;; \
+        bs=\\; \
+        sane_makeflags=`printf '%s\n' "$$MAKEFLAGS" \
+          | sed "s/$$bs$$bs[$$bs $$bs	]*//g"`;; \
+    esac; \
+  fi; \
+  skip_next=no; \
+  strip_trailopt () \
+  { \
+    flg=`printf '%s\n' "$$flg" | sed "s/$$1.*$$//"`; \
+  }; \
+  for flg in $$sane_makeflags; do \
+    test $$skip_next = yes && { skip_next=no; continue; }; \
+    case $$flg in \
+      *=*|--*) continue;; \
+        -*I) strip_trailopt 'I'; skip_next=yes;; \
+      -*I?*) strip_trailopt 'I';; \
+        -*O) strip_trailopt 'O'; skip_next=yes;; \
+      -*O?*) strip_trailopt 'O';; \
+        -*l) strip_trailopt 'l'; skip_next=yes;; \
+      -*l?*) strip_trailopt 'l';; \
+      -[dEDm]) skip_next=yes;; \
+      -[JT]) skip_next=yes;; \
+    esac; \
+    case $$flg in \
+      *$$target_option*) has_opt=yes; break;; \
    esac; \
-    test $$am__dry = yes; \
-  }
+  done; \
+  test $$has_opt = yes
+am__make_dryrun = (target_option=n; $(am__make_running_with_option))
+am__make_keepgoing = (target_option=k; $(am__make_running_with_option))
 pkgdatadir = $(datadir)/@PACKAGE@
 pkgincludedir = $(includedir)/@PACKAGE@
 pkglibdir = $(libdir)/@PACKAGE@
@ -52,7 +80,7 @@ build_triplet = @build@
 host_triplet = @host@
 noinst_PROGRAMS = test$(EXEEXT)
 subdir = tests
-DIST_COMMON = $(srcdir)/Makefile.am $(srcdir)/Makefile.in \
+DIST_COMMON = $(srcdir)/Makefile.in $(srcdir)/Makefile.am \
 	$(top_srcdir)/depcomp
 ACLOCAL_M4 = $(top_srcdir)/aclocal.m4
 am__aclocal_m4_deps = $(top_srcdir)/m4/ax_check_classpath.m4 \
@ -73,19 +101,44 @@ PROGRAMS = $(noinst_PROGRAMS)
 am_test_OBJECTS = test.$(OBJEXT)
 test_OBJECTS = $(am_test_OBJECTS)
 test_DEPENDENCIES = $(top_builddir)/src/libffts.la
+AM_V_lt = $(am__v_lt_@AM_V@)
+am__v_lt_ = $(am__v_lt_@AM_DEFAULT_V@)
+am__v_lt_0 = --silent
+am__v_lt_1 = 
+AM_V_P = $(am__v_P_@AM_V@)
+am__v_P_ = $(am__v_P_@AM_DEFAULT_V@)
+am__v_P_0 = false
+am__v_P_1 = :
+AM_V_GEN = $(am__v_GEN_@AM_V@)
+am__v_GEN_ = $(am__v_GEN_@AM_DEFAULT_V@)
+am__v_GEN_0 = @echo "  GEN     " $@;
+am__v_GEN_1 = 
+AM_V_at = $(am__v_at_@AM_V@)
+am__v_at_ = $(am__v_at_@AM_DEFAULT_V@)
+am__v_at_0 = @
+am__v_at_1 = 
 DEFAULT_INCLUDES = -I.@am__isrc@ -I$(top_builddir)
 depcomp = $(SHELL) $(top_srcdir)/depcomp
 am__depfiles_maybe = depfiles
 am__mv = mv -f
 COMPILE = $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) \
 	$(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
-LTCOMPILE = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
-	--mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
-	$(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS)
+LTCOMPILE = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) \
+	$(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) \
+	$(AM_CFLAGS) $(CFLAGS)
+AM_V_CC = $(am__v_CC_@AM_V@)
+am__v_CC_ = $(am__v_CC_@AM_DEFAULT_V@)
+am__v_CC_0 = @echo "  CC      " $@;
+am__v_CC_1 = 
 CCLD = $(CC)
-LINK = $(LIBTOOL) --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) \
-	--mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) $(AM_LDFLAGS) \
-	$(LDFLAGS) -o $@
+LINK = $(LIBTOOL) $(AM_V_lt) --tag=CC $(AM_LIBTOOLFLAGS) \
+	$(LIBTOOLFLAGS) --mode=link $(CCLD) $(AM_CFLAGS) $(CFLAGS) \
+	$(AM_LDFLAGS) $(LDFLAGS) -o $@
+AM_V_CCLD = $(am__v_CCLD_@AM_V@)
+am__v_CCLD_ = $(am__v_CCLD_@AM_DEFAULT_V@)
+am__v_CCLD_0 = @echo "  CCLD    " $@;
+am__v_CCLD_1 = 
 SOURCES = $(test_SOURCES)
 DIST_SOURCES = $(test_SOURCES)
 am__can_run_installinfo = \
@ -93,11 +146,29 @@ am__can_run_installinfo = \
    n|no|NO) false;; \
    *) (install-info --version) >/dev/null 2>&1;; \
  esac
+am__tagged_files = $(HEADERS) $(SOURCES) $(TAGS_FILES) $(LISP)
+# Read a list of newline-separated strings from the standard input,
+# and print each of them once, without duplicates.  Input order is
+# *not* preserved.
+am__uniquify_input = $(AWK) '\
+  BEGIN { nonempty = 0; } \
+  { items[$$0] = 1; nonempty = 1; } \
+  END { if (nonempty) { for (i in items) print i; }; } \
+'
+# Make sure the list of sources is unique.  This is necessary because,
+# e.g., the same source file might be shared among _SOURCES variables
+# for different programs/libraries.
+am__define_uniq_tagged_files = \
+  list='$(am__tagged_files)'; \
+  unique=`for i in $$list; do \
+    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
+  done | $(am__uniquify_input)`
 ETAGS = etags
 CTAGS = ctags
 DISTFILES = $(DIST_COMMON) $(DIST_SOURCES) $(TEXINFOS) $(EXTRA_DIST)
 ACLOCAL = @ACLOCAL@
 AMTAR = @AMTAR@
+AM_DEFAULT_VERBOSITY = @AM_DEFAULT_VERBOSITY@
 AR = @AR@
 AUTOCONF = @AUTOCONF@
 AUTOHEADER = @AUTOHEADER@
@ -271,9 +342,10 @@ clean-noinstPROGRAMS:
 	list=`for p in $$list; do echo "$$p"; done | sed 's/$(EXEEXT)$$//'`; \
 	echo " rm -f" $$list; \
 	rm -f $$list
+
 test$(EXEEXT): $(test_OBJECTS) $(test_DEPENDENCIES) $(EXTRA_test_DEPENDENCIES) 
 	@rm -f test$(EXEEXT)
-	$(LINK) $(test_OBJECTS) $(test_LDADD) $(LIBS)
+	$(AM_V_CCLD)$(LINK) $(test_OBJECTS) $(test_LDADD) $(LIBS)

 mostlyclean-compile:
 	-rm -f *.$(OBJEXT)
@ -284,25 +356,25 @@ distclean-compile:
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/test.Po@am__quote@

 .c.o:
-@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(COMPILE) -c $<
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ $<

 .c.obj:
-@am__fastdepCC_TRUE@	$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(COMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ `$(CYGPATH_W) '$<'`
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Po
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=no @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(COMPILE) -c `$(CYGPATH_W) '$<'`
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(COMPILE) -c -o $@ `$(CYGPATH_W) '$<'`

 .c.lo:
-@am__fastdepCC_TRUE@	$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
-@am__fastdepCC_TRUE@	$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
-@AMDEP_TRUE@@am__fastdepCC_FALSE@	source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
+@am__fastdepCC_TRUE@	$(AM_V_CC)$(LTCOMPILE) -MT $@ -MD -MP -MF $(DEPDIR)/$*.Tpo -c -o $@ $<
+@am__fastdepCC_TRUE@	$(AM_V_at)$(am__mv) $(DEPDIR)/$*.Tpo $(DEPDIR)/$*.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@	$(AM_V_CC)source='$<' object='$@' libtool=yes @AMDEPBACKSLASH@
@AMDEP_TRUE@@am__fastdepCC_FALSE@	DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
-@am__fastdepCC_FALSE@	$(LTCOMPILE) -c -o $@ $<
+@am__fastdepCC_FALSE@	$(AM_V_CC@am__nodep@)$(LTCOMPILE) -c -o $@ $<

 mostlyclean-libtool:
 	-rm -f *.lo
@ -310,26 +382,15 @@ mostlyclean-libtool:
 clean-libtool:
 	-rm -rf .libs _libs

-ID: $(HEADERS) $(SOURCES) $(LISP) $(TAGS_FILES)
-	list='$(SOURCES) $(HEADERS) $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
-	mkid -fID $$unique
-tags: TAGS
-
-TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
-		$(TAGS_FILES) $(LISP)
+ID: $(am__tagged_files)
+	$(am__define_uniq_tagged_files); mkid -fID $$unique
+tags: tags-am
+TAGS: tags
+
+tags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
 	set x; \
 	here=`pwd`; \
-	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+	$(am__define_uniq_tagged_files); \
 	shift; \
 	if test -z "$(ETAGS_ARGS)$$*$$unique"; then :; else \
 	  test -n "$$unique" || unique=$$empty_fix; \
@ -341,15 +402,11 @@ TAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
 	      $$unique; \
 	  fi; \
 	fi
-ctags: CTAGS
-CTAGS:  $(HEADERS) $(SOURCES)  $(TAGS_DEPENDENCIES) \
-		$(TAGS_FILES) $(LISP)
-	list='$(SOURCES) $(HEADERS)  $(LISP) $(TAGS_FILES)'; \
-	unique=`for i in $$list; do \
-	    if test -f "$$i"; then echo $$i; else echo $(srcdir)/$$i; fi; \
-	  done | \
-	  $(AWK) '{ files[$$0] = 1; nonempty = 1; } \
-	      END { if (nonempty) { for (i in files) print i; }; }'`; \
+ctags: ctags-am
+
+CTAGS: ctags
+ctags-am: $(TAGS_DEPENDENCIES) $(am__tagged_files)
+	$(am__define_uniq_tagged_files); \
 	test -z "$(CTAGS_ARGS)$$unique" \
 	  || $(CTAGS) $(CTAGSFLAGS) $(AM_CTAGSFLAGS) $(CTAGS_ARGS) \
 	     $$unique
@ -358,9 +415,10 @@ GTAGS:
 	here=`$(am__cd) $(top_builddir) && pwd` \
 	  && $(am__cd) $(top_srcdir) \
 	  && gtags -i $(GTAGS_ARGS) "$$here"
+cscopelist: cscopelist-am

-cscopelist:  $(HEADERS) $(SOURCES) $(LISP)
-	list='$(SOURCES) $(HEADERS) $(LISP)'; \
+cscopelist-am: $(am__tagged_files)
+	list='$(am__tagged_files)'; \
 	case "$(srcdir)" in \
 	  [\\/]* | ?:[\\/]*) sdir="$(srcdir)" ;; \
 	  *) sdir=$(subdir)/$(srcdir) ;; \
@ -513,18 +571,19 @@ uninstall-am:

 .MAKE: install-am install-strip

-.PHONY: CTAGS GTAGS all all-am check check-am clean clean-generic \
-	clean-libtool clean-noinstPROGRAMS cscopelist ctags distclean \
-	distclean-compile distclean-generic distclean-libtool \
-	distclean-tags distdir dvi dvi-am html html-am info info-am \
-	install install-am install-data install-data-am install-dvi \
-	install-dvi-am install-exec install-exec-am install-html \
-	install-html-am install-info install-info-am install-man \
-	install-pdf install-pdf-am install-ps install-ps-am \
-	install-strip installcheck installcheck-am installdirs \
-	maintainer-clean maintainer-clean-generic mostlyclean \
-	mostlyclean-compile mostlyclean-generic mostlyclean-libtool \
-	pdf pdf-am ps ps-am tags uninstall uninstall-am
+.PHONY: CTAGS GTAGS TAGS all all-am check check-am clean clean-generic \
+	clean-libtool clean-noinstPROGRAMS cscopelist-am ctags \
+	ctags-am distclean distclean-compile distclean-generic \
+	distclean-libtool distclean-tags distdir dvi dvi-am html \
+	html-am info info-am install install-am install-data \
+	install-data-am install-dvi install-dvi-am install-exec \
+	install-exec-am install-html install-html-am install-info \
+	install-info-am install-man install-pdf install-pdf-am \
+	install-ps install-ps-am install-strip installcheck \
+	installcheck-am installdirs maintainer-clean \
+	maintainer-clean-generic mostlyclean mostlyclean-compile \
+	mostlyclean-generic mostlyclean-libtool pdf pdf-am ps ps-am \
+	tags tags-am uninstall uninstall-am


 # Tell versions [3.59,3.63) of GNU make to not export all variables.
--- a/lib/ffts/tests/test.c
+++ b/lib/ffts/tests/test.c
@ -1,7 +1,7 @@
 /*
- 
- This file is part of SFFT.
-  
+
+ This file is part of FFTS.
+
 Copyright (c) 2012, Anthony M. Blake
 All rights reserved.

@ -29,148 +29,164 @@

 */

-#include <stdio.h>
-#include <math.h>
+#include "../include/ffts.h"
+#include "../src/ffts_attributes.h"

 #ifdef __ARM_NEON__
 #endif
-#ifdef HAVE_SSE
-	#include <xmmintrin.h>
-#endif 

-#include "../include/ffts.h"
+#ifdef HAVE_SSE
+#include <xmmintrin.h>
+#endif

+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>

-#define PI 3.1415926535897932384626433832795028841971693993751058209
+#ifndef M_PI
+#define M_PI 3.1415926535897932384626433832795028841971693993751058209
+#endif

-float impulse_error(int N, int sign, float *data) {
+/*
+ * Returns the relative L2 error between the N complex values in `data`
+ * (read as interleaved pairs: data[2*i] is the real part, data[2*i+1] the
+ * imaginary part) and the reference sequence
+ *     cos(2*pi*i/N) +/- j*sin(2*pi*i/N),   i = 0 .. N-1,
+ * where the imaginary part is negated when sign < 0.
+ *
+ * The result is sqrt(sum |ref - data|^2) / sqrt(sum |ref|^2). Both sums are
+ * accumulated in long double (double when __ANDROID__ is defined) and the
+ * quotient is narrowed to float on return.
+ */
+static float impulse_error(int N, int sign, float *data)
+{
 #ifdef __ANDROID__
-	double delta_sum = 0.0f;
-	double sum = 0.0f;
+    double delta_sum = 0.0f;
+    double sum = 0.0f;
 #else
-	long double delta_sum = 0.0f;
-	long double sum = 0.0f;
-#endif 
+    long double delta_sum = 0.0f;
+    long double sum = 0.0f;
+#endif
+    int i;

-	int i;
-	for(i=0;i<N;i++) {
+    for (i = 0; i < N; i++) {
 #ifdef __ANDROID__
-		double re, im;
-		if(sign < 0) {
-			re = cos(2 * PI * (double)i / (double)N); 
-			im = -sin(2 * PI * (double)i / (double)N); 
-		}else{
-			re = cos(2 * PI * (double)i / (double)N); 
-			im = sin(2 * PI * (double)i / (double)N); 
-		}
+        /* reference sample for bin i, double precision variant */
+        double re, im;
+
+        if (sign < 0) {
+            re = cos(2 * M_PI * (double) i / (double) N);
+            im = -sin(2 * M_PI * (double) i / (double) N);
+        } else {
+            re = cos(2 * M_PI * (double) i / (double) N);
+            im = sin(2 * M_PI * (double) i / (double) N);
+        }
 #else
-		long double re, im;
-		if(sign < 0) {
-			re = cosl(2 * PI * (long double)i / (long double)N); 
-			im = -sinl(2 * PI * (long double)i / (long double)N); 
-		}else{
-			re = cosl(2 * PI * (long double)i / (long double)N); 
-			im = sinl(2 * PI * (long double)i / (long double)N); 
-		}
+        /* reference sample for bin i, long double variant */
+        long double re, im;
+
+        if (sign < 0) {
+            re = cosl(2 * M_PI * (long double) i / (long double) N);
+            im = -sinl(2 * M_PI * (long double) i / (long double) N);
+        } else {
+            re = cosl(2 * M_PI * (long double) i / (long double) N);
+            im = sinl(2 * M_PI * (long double) i / (long double) N);
+        }
 #endif
-		sum += re * re + im * im;

-		re = re - data[2*i];
-		im = im - data[2*i+1];
-		
-		delta_sum += re * re + im * im;
+        /* squared magnitude of the reference sample (the denominator) */
+        sum += re * re + im * im;
+
+        /* re/im are reused to hold the difference reference - data */
+        re = re - data[2*i];
+        im = im - data[2*i+1];
+
+        /* squared magnitude of the difference (the numerator) */
+        delta_sum += re * re + im * im;
+    }

-	}
 #ifdef __ANDROID__
-	return sqrt(delta_sum) / sqrt(sum);
+    return (float) (sqrt(delta_sum) / sqrt(sum));
 #else
-	return sqrtl(delta_sum) / sqrtl(sum);
+    return (float) (sqrtl(delta_sum) / sqrtl(sum));
 #endif
 }

-int 
-test_transform(int n, int sign) {
+/*
+ * Runs one n-point transform of a unit impulse placed at complex index 1
+ * (input[2] = 1.0f, everything else zero) and prints a table row with the
+ * sign, the size and the relative L2 error reported by impulse_error().
+ *
+ * Returns 1 when the transform was executed, 0 when the size is too small
+ * for the impulse (n < 2), a buffer could not be allocated, or
+ * ffts_init_1d() returned no plan. Both buffers are released on every path.
+ */
+int test_transform(int n, int sign)
+{
+    ffts_plan_t *p;
+    int result = 0;

-#ifdef HAVE_SSE 
-	float __attribute__ ((aligned(32))) *input = _mm_malloc(2 * n * sizeof(float), 32);
-  float __attribute__ ((aligned(32))) *output = _mm_malloc(2 * n * sizeof(float), 32);
+#ifdef HAVE_SSE
+    float FFTS_ALIGN(32) *input = _mm_malloc(2 * n * sizeof(float), 32);
+    float FFTS_ALIGN(32) *output = _mm_malloc(2 * n * sizeof(float), 32);
 #else
-	float __attribute__ ((aligned(32))) *input = valloc(2 * n * sizeof(float));
-  float __attribute__ ((aligned(32))) *output = valloc(2 * n * sizeof(float));
+    float FFTS_ALIGN(32) *input = valloc(2 * n * sizeof(float));
+    float FFTS_ALIGN(32) *output = valloc(2 * n * sizeof(float));
 #endif
-	int i;	
-	for(i=0;i<n;i++) {
-		input[2*i]   = 0.0f;
-		input[2*i+1] = 0.0f;
-	}
-
-	input[2] = 1.0f;
-
-	ffts_plan_t *p = ffts_init_1d(i, sign);
-	if(p) {
-		ffts_execute(p, input, output);
-		printf(" %3d  | %9d | %10E\n", sign, n, impulse_error(n, sign, output));
-  	ffts_free(p);
-	}else{
-		printf("Plan unsupported\n");
-		return 0;
-	}
-
-	return 1;
+    int i;
+
+    /* input[2] is written below, so at least two complex points are needed */
+    if (n < 2 || !input || !output) {
+        printf("Invalid size or out of memory\n");
+        goto cleanup;
+    }
+
+    for (i = 0; i < n; i++) {
+        input[2*i + 0] = 0.0f;
+        input[2*i + 1] = 0.0f;
+    }
+
+    /* unit impulse at complex index 1 (real part) */
+    input[2] = 1.0f;
+
+    /* pass n explicitly rather than relying on the loop counter's end value */
+    p = ffts_init_1d(n, sign);
+    if (!p) {
+        printf("Plan unsupported\n");
+        goto cleanup;
+    }
+
+    ffts_execute(p, input, output);
+    printf(" %3d  | %9d | %10E\n", sign, n, impulse_error(n, sign, output));
+    ffts_free(p);
+    result = 1;
+
+cleanup:
+    /* release with the deallocator that matches the allocator used above */
+#ifdef HAVE_SSE
+    _mm_free(input);
+    _mm_free(output);
+#else
+    free(input);
+    free(output);
+#endif
+    return result;
 }

-int
-main(int argc, char *argv[]) {
-	
-	if(argc == 3) {
-		// test specific transform with test pattern and display output
-		int n = atoi(argv[1]);
-		int sign = atoi(argv[2]);
+/*
+ * Test driver.
+ *
+ * With exactly two arguments (<size> <sign>) it transforms the ramp
+ * input[i] = i + 0j and prints one "index sign re im" line per output bin.
+ * With any other argument count it prints an error table for the sizes
+ * 2^1 .. 2^18, first with sign -1 and then with sign 1.
+ *
+ * The exit status is 0, including when the plan is unsupported (unchanged
+ * behaviour); it is EXIT_FAILURE only when the requested size is not
+ * positive or a buffer cannot be allocated.
+ */
+int main(int argc, char *argv[])
+{
+    int status = 0;
+
+    if (argc == 3) {
+        ffts_plan_t *p;
+        int i;
+
+        /* test specific transform with test pattern and display output */
+        int n = atoi(argv[1]);
+        int sign = atoi(argv[2]);

 #ifdef HAVE_SSE
-		float __attribute__ ((aligned(32))) *input = _mm_malloc(2 * n * sizeof(float), 32);
-		float __attribute__ ((aligned(32))) *output = _mm_malloc(2 * n * sizeof(float), 32);
+        float FFTS_ALIGN(32) *input = _mm_malloc(2 * n * sizeof(float), 32);
+        float FFTS_ALIGN(32) *output = _mm_malloc(2 * n * sizeof(float), 32);
 #else
-		float __attribute__ ((aligned(32))) *input = valloc(2 * n * sizeof(float));
-		float __attribute__ ((aligned(32))) *output = valloc(2 * n * sizeof(float));
+        float FFTS_ALIGN(32) *input = valloc(2 * n * sizeof(float));
+        float FFTS_ALIGN(32) *output = valloc(2 * n * sizeof(float));
 #endif
-		int i;	
-		for(i=0;i<n;i++) {
-			input[2*i]   = i;
-			input[2*i+1] = 0.0f;
-		}
-
-	//	input[2] = 1.0f;
-
-		ffts_plan_t *p = ffts_init_1d(i, sign);
-		if(p) {
-			ffts_execute(p, input, output);
-			for(i=0;i<n;i++) printf("%d %d %f %f\n", i, sign, output[2*i], output[2*i+1]);
-			ffts_free(p);
-		}else{
-			printf("Plan unsupported\n");
-			return 0;
-		}
-
-#ifdef HAVE_NEON 
-		_mm_free(input);
-		_mm_free(output);
+
+        if (n > 0 && input && output) {
+            /* ramp test pattern: real part = index, imaginary part = 0 */
+            for (i = 0; i < n; i++) {
+                input[2*i + 0] = (float) i;
+                input[2*i + 1] = 0.0f;
+            }
+
+            /* pass n explicitly rather than the loop counter's end value */
+            p = ffts_init_1d(n, sign);
+            if (p) {
+                ffts_execute(p, input, output);
+
+                for (i = 0; i < n; i++)
+                    printf("%d %d %f %f\n", i, sign, output[2*i], output[2*i+1]);
+                ffts_free(p);
+            } else {
+                /* no early return: fall through so the buffers are freed */
+                printf("Plan unsupported\n");
+            }
+        } else {
+            /* atoi() yields 0 for non-numeric text; reject it and failed allocations */
+            printf("Invalid size or out of memory\n");
+            status = EXIT_FAILURE;
+        }
+
+#ifdef HAVE_SSE
+        _mm_free(input);
+        _mm_free(output);
 #else
-		free(input);
-		free(output);
+        free(input);
+        free(output);
 #endif
+    } else {
+        int n, power2;
+
+        /* test various sizes and display error */
+        printf(" Sign |      Size |     L2 Error\n");
+        printf("------+-----------+-------------\n");
+
+        for (n = 1, power2 = 2; n <= 18; n++, power2 <<= 1) {
+            test_transform(power2, -1);
+        }
+
+        for (n = 1, power2 = 2; n <= 18; n++, power2 <<= 1) {
+            test_transform(power2, 1);
+        }
+    }

-	}else{
-		// test various sizes and display error
-		printf(" Sign |      Size |     L2 Error\n");
-		printf("------+-----------+-------------\n");
-		int n;
-		for(n=1;n<=18;n++) {
-			test_transform(pow(2,n), -1);
-		}
-		for(n=1;n<=18;n++) {
-			test_transform(pow(2,n), 1);
-		}
-	}
-  return 0;
+    return status;
 }
				`@ -0,0 +1,3 @@`
				`#include "../../../../mono-extensions/mono/arch/arm64/arm64-codegen.h"`