Taken from https://github.com/linkotec/ffts. Fixes ppc64el support and a handful of other bugs.
master
parent
c40a208abb
commit
2ef6dba872
@ -0,0 +1,225 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "ffts_chirp_z.h"
|
||||
|
||||
#include "ffts_internal.h"
|
||||
#include "ffts_trig.h"
|
||||
|
||||
/*
|
||||
* For more information on algorithms:
|
||||
*
|
||||
* L. I. Bluestein, A linear filtering approach to the computation of
|
||||
* the discrete Fourier transform, 1968 NEREM Rec., pp. 218-219
|
||||
*
|
||||
* Lawrence R. Rabiner, Ronald W. Schafer, Charles M. Rader,
|
||||
* The Chirp z-Transform Algorithm and Its Application
|
||||
* Bell Sys. Tech. J., vol. 48, pp. 1249-1292, May 1969.
|
||||
*
|
||||
* Rick Lyons, Four Ways to Compute an Inverse FFT Using the Forward FFT Algorithm
|
||||
* https://www.dsprelated.com/showarticle/800.php, July 7, 2015
|
||||
*/
|
||||
|
||||
/* forward declarations */
|
||||
/* Transform kernel installed by ffts_chirp_z_init() when sign < 0. */
static void ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p,
                                         const void *in,
                                         void *out);

/* Transform kernel installed by ffts_chirp_z_init() when sign >= 0. */
static void ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p,
                                         const void *in,
                                         void *out);
|
||||
|
||||
static void
|
||||
ffts_chirp_z_free(ffts_plan_t *p)
|
||||
{
|
||||
if (p->B)
|
||||
ffts_aligned_free(p->B);
|
||||
|
||||
if (p->A)
|
||||
ffts_aligned_free(p->A);
|
||||
|
||||
if (p->buf)
|
||||
ffts_aligned_free(p->buf);
|
||||
|
||||
if (p->plans[0])
|
||||
ffts_free(p->plans[0]);
|
||||
|
||||
free(p);
|
||||
}
|
||||
|
||||
ffts_plan_t*
|
||||
ffts_chirp_z_init(size_t N, int sign)
|
||||
{
|
||||
float *A, *B, reciprocal_M, *tmp;
|
||||
ffts_plan_t *p;
|
||||
size_t i, M;
|
||||
|
||||
FFTS_ASSUME(N > 2);
|
||||
|
||||
p = (ffts_plan_t*) calloc(1, sizeof(*p) + sizeof(*p->plans));
|
||||
if (!p)
|
||||
return NULL;
|
||||
|
||||
p->destroy = ffts_chirp_z_free;
|
||||
p->N = N;
|
||||
p->rank = 1;
|
||||
p->plans = (ffts_plan_t**) &p[1];
|
||||
|
||||
if (sign < 0)
|
||||
p->transform = ffts_chirp_z_transform_f_32f;
|
||||
else
|
||||
p->transform = ffts_chirp_z_transform_i_32f;
|
||||
|
||||
/* determinate next power of two such that M >= 2*N-1 */
|
||||
M = ffts_next_power_of_2(2*N-1);
|
||||
p->plans[0] = ffts_init_1d(M, FFTS_FORWARD);
|
||||
if (!p->plans[0])
|
||||
goto cleanup;
|
||||
|
||||
p->A = A = (float*) ffts_aligned_malloc(2 * N * sizeof(float));
|
||||
if (!p->A)
|
||||
goto cleanup;
|
||||
|
||||
p->B = B = (float*) ffts_aligned_malloc(2 * M * sizeof(float));
|
||||
if (!p->B)
|
||||
goto cleanup;
|
||||
|
||||
p->buf = tmp = (float*) ffts_aligned_malloc(2 * 2 * M * sizeof(float));
|
||||
|
||||
ffts_generate_chirp_32f((ffts_cpx_32f*) A, N);
|
||||
|
||||
/* scale with reciprocal of length */
|
||||
reciprocal_M = 1.0f / M;
|
||||
tmp[0] = A[0] * reciprocal_M;
|
||||
tmp[1] = A[1] * reciprocal_M;
|
||||
for (i = 1; i < N; ++i) {
|
||||
tmp[2 * i + 0] = tmp[2 * (M - i) + 0] = A[2 * i + 0] * reciprocal_M;
|
||||
tmp[2 * i + 1] = tmp[2 * (M - i) + 1] = A[2 * i + 1] * reciprocal_M;
|
||||
}
|
||||
|
||||
/* zero pad */
|
||||
for (; i <= M - N; ++i)
|
||||
tmp[2 * i] = tmp[2 * i + 1] = 0.0f;
|
||||
|
||||
/* FFT */
|
||||
p->plans[0]->transform(p->plans[0], tmp, B);
|
||||
return p;
|
||||
|
||||
cleanup:
|
||||
ffts_chirp_z_free(p);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
ffts_chirp_z_transform_f_32f(struct _ffts_plan_t *p, const void *in, void *out)
|
||||
{
|
||||
const float *A = FFTS_ASSUME_ALIGNED_32(p->A);
|
||||
const float *B = FFTS_ASSUME_ALIGNED_32(p->B);
|
||||
size_t i, M = p->plans[0]->N, N = p->N;
|
||||
float *t1 = (float*) FFTS_ASSUME_ALIGNED_32(p->buf);
|
||||
float *t2 = FFTS_ASSUME_ALIGNED_32(&t1[2 * M]);
|
||||
const float *din = (const float*) in;
|
||||
float *dout = (float*) out;
|
||||
|
||||
/* we know this */
|
||||
FFTS_ASSUME(M >= 8);
|
||||
|
||||
/* multiply input with conjugated sequence */
|
||||
for (i = 0; i < N; ++i) {
|
||||
t1[2 * i + 0] = din[2 * i + 0] * A[2 * i + 0] + din[2 * i + 1] * A[2 * i + 1];
|
||||
t1[2 * i + 1] = din[2 * i + 1] * A[2 * i + 0] - din[2 * i + 0] * A[2 * i + 1];
|
||||
}
|
||||
|
||||
/* zero pad */
|
||||
for (; i < M; ++i)
|
||||
t1[2 * i] = t1[2 * i + 1] = 0.0f;
|
||||
|
||||
/* convolution using FFT */
|
||||
p->plans[0]->transform(p->plans[0], t1, t2);
|
||||
|
||||
/* complex multiply */
|
||||
for (i = 0; i < M; ++i) {
|
||||
t1[2 * i + 0] = t2[2 * i + 1] * B[2 * i + 0] + t2[2 * i + 0] * B[2 * i + 1];
|
||||
t1[2 * i + 1] = t2[2 * i + 0] * B[2 * i + 0] - t2[2 * i + 1] * B[2 * i + 1];
|
||||
}
|
||||
|
||||
/* IFFT using FFT with real and imaginary parts swapped */
|
||||
p->plans[0]->transform(p->plans[0], t1, t2);
|
||||
|
||||
/* multiply output with conjugated sequence */
|
||||
for (i = 0; i < N; ++i) {
|
||||
dout[2 * i + 0] = t2[2 * i + 1] * A[2 * i + 0] + t2[2 * i + 0] * A[2 * i + 1];
|
||||
dout[2 * i + 1] = t2[2 * i + 0] * A[2 * i + 0] - t2[2 * i + 1] * A[2 * i + 1];
|
||||
}
|
||||
}
|
||||
|
||||
/* IFFT using FFT with real and imaginary parts swapped */
|
||||
static void
|
||||
ffts_chirp_z_transform_i_32f(struct _ffts_plan_t *p, const void *in, void *out)
|
||||
{
|
||||
const float *A = FFTS_ASSUME_ALIGNED_32(p->A);
|
||||
const float *B = FFTS_ASSUME_ALIGNED_32(p->B);
|
||||
size_t i, M = p->plans[0]->N, N = p->N;
|
||||
float *t1 = (float*) FFTS_ASSUME_ALIGNED_32(p->buf);
|
||||
float *t2 = FFTS_ASSUME_ALIGNED_32(&t1[2 * M]);
|
||||
const float *din = (const float*) in;
|
||||
float *dout = (float*) out;
|
||||
|
||||
/* we know this */
|
||||
FFTS_ASSUME(M >= 8);
|
||||
|
||||
/* multiply input with conjugated sequence */
|
||||
for (i = 0; i < N; ++i) {
|
||||
t1[2 * i + 0] = din[2 * i + 1] * A[2 * i + 0] + din[2 * i + 0] * A[2 * i + 1];
|
||||
t1[2 * i + 1] = din[2 * i + 0] * A[2 * i + 0] - din[2 * i + 1] * A[2 * i + 1];
|
||||
}
|
||||
|
||||
/* zero pad */
|
||||
for (; i < M; ++i)
|
||||
t1[2 * i] = t1[2 * i + 1] = 0.0f;
|
||||
|
||||
/* convolution using FFT */
|
||||
p->plans[0]->transform(p->plans[0], t1, t2);
|
||||
|
||||
/* complex multiply */
|
||||
for (i = 0; i < M; ++i) {
|
||||
t1[2 * i + 0] = t2[2 * i + 1] * B[2 * i + 0] + t2[2 * i + 0] * B[2 * i + 1];
|
||||
t1[2 * i + 1] = t2[2 * i + 0] * B[2 * i + 0] - t2[2 * i + 1] * B[2 * i + 1];
|
||||
}
|
||||
|
||||
/* IFFT using FFT with real and imaginary parts swapped */
|
||||
p->plans[0]->transform(p->plans[0], t1, t2);
|
||||
|
||||
/* multiply output with conjugated sequence */
|
||||
for (i = 0; i < N; ++i) {
|
||||
dout[2 * i + 0] = t2[2 * i + 0] * A[2 * i + 0] - t2[2 * i + 1] * A[2 * i + 1];
|
||||
dout[2 * i + 1] = t2[2 * i + 1] * A[2 * i + 0] + t2[2 * i + 0] * A[2 * i + 1];
|
||||
}
|
||||
}
|
@ -0,0 +1,45 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2016, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef FFTS_CHIRP_Z_H
|
||||
#define FFTS_CHIRP_Z_H
|
||||
|
||||
#if defined (_MSC_VER) && (_MSC_VER >= 1020)
|
||||
#pragma once
|
||||
#endif
|
||||
|
||||
#include "ffts.h"
|
||||
|
||||
/*
 * Create a chirp-z transform plan of length N.
 *
 * A negative sign selects the forward transform, any other value the
 * inverse transform. Returns NULL on failure.
 */
ffts_plan_t *ffts_chirp_z_init(size_t N, int sign);
|
||||
|
||||
#endif /* FFTS_CHIRP_Z_H */
|
@ -0,0 +1,371 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#include "ffts_cpu.h"
|
||||
|
||||
#if defined(FFTS_BUILDING_CPU_TEST)
|
||||
#include <stdio.h>
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#include <intrin.h>
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
/* TODO: add detection/declaration of these to CMake phase */
|
||||
/* 64 bit x86: define FFTS_CPU_X64 unless the build already did */
#if !defined(FFTS_CPU_X64) && \
    (defined(_M_AMD64) || defined(__amd64) || defined(__amd64__) || \
     defined(_M_X64) || defined(__x86_64) || defined(__x86_64__))
#define FFTS_CPU_X64
#endif

/* 32 bit x86: only considered when neither x86 flavour is selected yet */
#if !defined(FFTS_CPU_X64) && !defined(FFTS_CPU_X86) && \
    (defined(i386) || defined(__i386) || defined(__i386__) || \
     defined(_M_IX86) || defined(__X86__) || defined(_X86_))
#define FFTS_CPU_X86
#endif
|
||||
|
||||
/* check if build is 32 bit or 64 bit x86 */
|
||||
#if defined(FFTS_CPU_X64) || defined(FFTS_CPU_X86)
|
||||
|
||||
/* Built and tested on
|
||||
CentOS 6.8 2.6.32-642.11.1.el6.x86_64 - gcc version 4.4.7 20120313
|
||||
Mac OSX 10.9 - Apple Clang 6.0
|
||||
Ubuntu 14.04 LTS 4.2.0-42 x86_64 - gcc version 4.8.4
|
||||
Windows XP SP3 - Visual Studio 2005 SP1 x86/x64
|
||||
Windows Vista SP2 - Visual Studio 2010 SP1 x86/x64
|
||||
Windows 7 Ultimate SP1 - Visual Studio 2015 x86/x64
|
||||
Windows 7 Ultimate SP1 - gcc version 4.9.2 (i686-posix-dwarf-rev1)
|
||||
Windows 7 Ultimate SP1 - gcc version 4.9.2 (x86_64-posix-seh-rev3)
|
||||
Windows 10 Pro - Visual Studio 2017 x86/x64
|
||||
*/
|
||||
|
||||
/* Visual Studio 2010 SP1 or newer have _xgetbv intrinsic */
|
||||
#if (defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219)
#define FFTS_HAVE_XGETBV
#endif

#ifndef BIT
/* single bit mask; the argument is parenthesized so that an expression
   such as BIT(a | b) shifts by the whole expression */
#define BIT(n) (1u << (n))
#endif

/* bit masks */

/* CPUID leaf 1, EDX: FPU(0), CMOV(15), MMX(23), FXSR(24), SSE(25) */
#define FFTS_CPU_X86_SSE_BITS    (BIT(0) | BIT(15) | BIT(23) | BIT(24) | BIT(25))
/* CPUID leaf 1, EDX: SSE2(26) */
#define FFTS_CPU_X86_SSE2_BITS   (BIT(26))
/* CPUID leaf 1, ECX: SSE3(0) */
#define FFTS_CPU_X86_SSE3_BITS   (BIT(0))
/* CPUID leaf 1, ECX: SSSE3(9) */
#define FFTS_CPU_X86_SSSE3_BITS  (BIT(9))
/* CPUID leaf 1, ECX: SSE4.1(19) */
#define FFTS_CPU_X86_SSE4_1_BITS (BIT(19))
/* CPUID leaf 1, ECX: SSE4.2(20), POPCNT(23) */
#define FFTS_CPU_X86_SSE4_2_BITS (BIT(20) | BIT(23))
/* CPUID leaf 1, ECX: XSAVE(26), OSXSAVE(27), AVX(28) */
#define FFTS_CPU_X86_AVX_BITS    (BIT(26) | BIT(27) | BIT(28))
/* XCR0: SSE state(1), AVX state(2); was left as an unterminated "(" */
#define FFTS_CPU_X86_XCR0_BITS   (BIT(1) | BIT(2))
/* CPUID leaf 7 subleaf 0, EBX: AVX2(5) */
#define FFTS_CPU_X86_AVX2_BITS   (BIT(5))
/* CPUID leaf 7 subleaf 0, EBX: AVX512F(16) */
#define FFTS_CPU_X86_AVX512_BITS (BIT(16))
|
||||
|
||||
/* Visual Studio 2008 or older */
|
||||
#if defined(FFTS_CPU_X64) && defined(_MSC_VER) && _MSC_VER <= 1500
#pragma optimize("", off)
/*
 * Stand-in for __cpuidex on 64 bit builds with these compiler versions.
 *
 * The x64 calling convention passes the first four arguments in RCX, RDX,
 * R8 and R9. Subleaf is deliberately the first parameter so that it sits
 * in RCX, and optimization is switched off around the function; the intent
 * is that ECX still holds subleaf when the __cpuid intrinsic executes.
 * Do not reorder the parameters or drop the pragmas.
 */
static void __fastcall ffts_cpuidex(int subleaf, int regs[4], int leaf)
{
    (void) subleaf;
    __cpuid(regs, leaf);
}
#pragma optimize("", on)
#endif
|
||||
|
||||
/*
 * Execute CPUID with EAX = leaf and ECX = subleaf.
 *
 * On return regs[0..3] hold EAX, EBX, ECX and EDX. With a compiler that is
 * neither MSVC nor GCC compatible all four entries are set to zero.
 */
static FFTS_INLINE void ffts_cpuid(int regs[4], int leaf, int subleaf)
{
#if defined(_MSC_VER)
#if !defined(FFTS_CPU_X64)
    /* 32 bit build: inline assembly */
    __asm {
        mov eax, leaf
        mov ecx, subleaf
        mov esi, regs
        cpuid
        mov [esi + 0x0], eax
        mov [esi + 0x4], ebx
        mov [esi + 0x8], ecx
        mov [esi + 0xc], edx
    }
#elif _MSC_VER > 1500
    /* Visual Studio 2010 or newer */
    __cpuidex(regs, leaf, subleaf);
#else
    /* older compilers: local replacement, note the argument order */
    ffts_cpuidex(subleaf, regs, leaf);
#endif
#elif defined(__GNUC__) && __GNUC__
#if !defined(FFTS_CPU_X64) && defined(__PIC__)
    /* 32 bit position independent code: EBX is exchanged with a scratch
       register around cpuid instead of being named as an output */
    __asm__ __volatile__(
        "xchgl %%ebx, %1\n\t"
        "cpuid \n\t"
        "xchgl %%ebx, %1\n\t"
        : "=a"(regs[0]), "=r"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
        : "a"(leaf), "c"(subleaf));
#else
    /* 64 bit builds and 32 bit builds without PIC */
    __asm__ __volatile__(
        "cpuid\n\t"
        : "=a"(regs[0]), "=b"(regs[1]), "=c"(regs[2]), "=d"(regs[3])
        : "a"(leaf), "c"(subleaf));
#endif
#else
    /* unknown compiler for x86 */
    regs[0] = regs[1] = regs[2] = regs[3] = 0;
#endif
}
|
||||
|
||||
/* at least Visual Studio 2010 generates invalidate optimized _xgetbv */
|
||||
#if defined(FFTS_HAVE_XGETBV)
|
||||
#pragma optimize("", off)
|
||||
#endif
|
||||
static FFTS_INLINE unsigned int ffts_get_xcr0(void)
|
||||
{
|
||||
#if defined(FFTS_HAVE_XGETBV)
|
||||
return (unsigned int) _xgetbv(0);
|
||||
#elif defined(_MSC_VER)
|
||||
#if defined(FFTS_CPU_X64)
|
||||
/* emulate xgetbv(0) on Windows 7 SP1 or newer */
|
||||
typedef DWORD64 (WINAPI *PGETENABLEDXSTATEFEATURES)(VOID);
|
||||
PGETENABLEDXSTATEFEATURES pfnGetEnabledXStateFeatures =
|
||||
(PGETENABLEDXSTATEFEATURES) GetProcAddress(
|
||||
GetModuleHandle(TEXT("kernel32.dll")), "GetEnabledXStateFeatures");
|
||||
return pfnGetEnabledXStateFeatures ? (unsigned int) pfnGetEnabledXStateFeatures() : 0;
|
||||
#else
|
||||
/* note that we have to touch edx register to tell compiler it's used by emited xgetbv */
|
||||
unsigned __int32 hi, lo;
|
||||
__asm {
|
||||
xor ecx, ecx
|
||||
_emit 0x0f
|
||||
_emit 0x01
|
||||
_emit 0xd0
|
||||
mov lo, eax
|
||||
mov hi, edx
|
||||
}
|
||||
return (unsigned int) lo;
|
||||
#endif
|
||||
#elif defined(__GNUC__) && __GNUC__
|
||||
unsigned int lo;
|
||||
__asm__ __volatile__(".byte 0x0f, 0x01, 0xd0\n"
|
||||
: "=a"(lo)
|
||||
: "c"(0)
|
||||
: "edx");
|
||||
return lo;
|
||||
#else
|
||||
/* unknown x86 compiler */
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
#if defined(FFTS_HAVE_XGETBV)
|
||||
#pragma optimize("", on)
|
||||
#endif
|
||||
|
||||
int
|
||||
ffts_cpu_detect(int *extra_flags)
|
||||
{
|
||||
static int cpu_flags = -1;
|
||||
static int cpu_extra_flags = -1;
|
||||
int max_basic_func;
|
||||
int regs[4];
|
||||
unsigned int xcr0;
|
||||
|
||||
if (cpu_flags >= 0) {
|
||||
goto exit;
|
||||
}
|
||||
|
||||
/* initialize */
|
||||
cpu_flags = cpu_extra_flags = 0;
|
||||
|
||||
#if defined(FFTS_BUILDING_CPU_TEST)
|
||||
printf("cpuid check: ");
|
||||
#endif
|
||||
#if defined(FFTS_CPU_X64)
|
||||
/* cpuid is always supported on x64 */
|
||||
#if defined(FFTS_BUILDING_CPU_TEST)
|
||||
printf("skipped\n");
|
||||
#endif
|
||||
#else
|
||||
#if defined(_MSC_VER)
|
||||
_asm {
|
||||
pushfd
|
||||
pop eax
|
||||
mov ebx,eax
|
||||
xor eax,200000h
|
||||
push eax
|
||||
popfd
|
||||
pushfd
|
||||
pop eax
|
||||
push ebx
|
||||
popfd
|
||||
mov regs[0 * TYPE regs],eax
|
||||
mov regs[1 * TYPE regs],ebx
|
||||
}
|
||||
#else
|
||||
__asm__ (
|
||||
"pushfl\n\t"
|
||||
"pop %0\n\t"
|
||||
"movl %0,%1\n\t"
|
||||
"xorl $0x200000,%0\n\t"
|
||||
"pushl %0\n\t"
|
||||
"popfl\n\t"
|
||||
"pushfl\n\t"
|
||||
"popl %0\n\t"
|
||||
"pushl %1\n\t"
|
||||
"popfl\n\t"
|
||||
: "=r" (regs[0]), "=r" (regs[1])
|
||||
);
|
||||
#endif
|
||||
/* check CPUID bit (bit 21) in EFLAGS register can be toggled */
|
||||
if (((regs[0] ^ regs[1]) & 0x200000) == 0) {
|
||||
#if defined(FFTS_BUILDING_CPU_TEST)
|
||||
printf("not supported\n");
|
||||
#endif
|
||||
goto exit;
|
||||
}
|
||||
#if defined(FFTS_BUILDING_CPU_TEST)
|
||||
printf("supported\n");
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* get the number of basic functions */
|
||||
ffts_cpuid(regs, 0, 0);
|
||||
max_basic_func = regs[0];
|
||||
#if defined(FFTS_BUILDING_CPU_TEST)
|
||||
printf("cpuid eax=0, ecx=0: %d\n", max_basic_func);
|
||||
#endif
|
||||
if (max_basic_func == 0)
|
||||
goto exit;
|
||||
|
||||
/* get feature flags */
|
||||
ffts_cpuid(regs, 1, 0);
|
||||
|
||||
#if defined(FFTS_BUILDING_CPU_TEST)
|
||||
printf("cpuid eax=1, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
|
||||
#endif
|
||||
|
||||
#if defined(FFTS_CPU_X64)
|
||||
/* minimum for any x64 */
|
||||
cpu_flags = FFTS_CPU_X86_SSE | FFTS_CPU_X86_SSE2;
|
||||
#else
|
||||
/* test if SSE is supported */
|
||||
if ((regs[3] & FFTS_CPU_X86_SSE_BITS) != FFTS_CPU_X86_SSE_BITS)
|
||||
goto exit;
|
||||
cpu_flags = FFTS_CPU_X86_SSE;
|
||||
|
||||
/* test if SSE2 is supported */
|
||||
if (!(regs[3] & FFTS_CPU_X86_SSE2_BITS))
|
||||
goto exit;
|
||||
cpu_flags |= FFTS_CPU_X86_SSE2;
|
||||
#endif
|
||||
|
||||
/* test if SSE3 is supported */
|
||||
if (!(regs[2] & FFTS_CPU_X86_SSE3_BITS))
|
||||
goto exit;
|
||||
cpu_flags |= FFTS_CPU_X86_SSE3;
|
||||
|
||||
/* test if SSSE3 is supported */
|
||||
if (!(regs[2] & FFTS_CPU_X86_SSSE3_BITS))
|
||||
goto exit;
|
||||
cpu_flags |= FFTS_CPU_X86_SSSE3;
|
||||
|
||||
/* test if SSE4.1 is supported */
|
||||
if (!(regs[2] & FFTS_CPU_X86_SSE4_1_BITS))
|
||||
goto exit;
|
||||
cpu_flags |= FFTS_CPU_X86_SSE4_1;
|
||||
|
||||
/* test if SSE4.2 is supported */
|
||||
if ((regs[2] & FFTS_CPU_X86_SSE4_2_BITS) != FFTS_CPU_X86_SSE4_2_BITS)
|
||||
goto exit;
|
||||
cpu_flags |= FFTS_CPU_X86_SSE4_2;
|
||||
|
||||
/* test if AVX is supported */
|
||||
if ((regs[2] & FFTS_CPU_X86_AVX_BITS) != FFTS_CPU_X86_AVX_BITS)
|
||||
goto exit;
|
||||
|
||||
/* test if legaxy x87, 128-bit SSE and 256-bit AVX states are enabled in XCR0 */
|
||||
xcr0 = ffts_get_xcr0();
|
||||
#if defined(FFTS_BUILDING_CPU_TEST)
|
||||
printf("xcr0: %u\n", xcr0);
|
||||
#endif
|
||||
if ((xcr0 & 0x6) != 0x6)
|
||||
goto exit;
|
||||
|
||||
cpu_flags |= FFTS_CPU_X86_AVX;
|
||||
|
||||
/* check that cpuid extended features exist */
|
||||
if (max_basic_func < 7)
|
||||
goto exit;
|
||||
|
||||
/* get extended features */
|
||||
ffts_cpuid(regs, 7, 0);
|
||||
|
||||
#if defined(FFTS_BUILDING_CPU_TEST)
|
||||
printf("cpuid eax=7, ecx=0: eax=%08x ebx=%08x ecx=%08x edx=%08x\n", regs[0], regs[1], regs[2], regs[3]);
|
||||
#endif
|
||||
|
||||
/* test if AVX2 is supported */
|
||||
if ((regs[1] & FFTS_CPU_X86_AVX2_BITS) != FFTS_CPU_X86_AVX2_BITS)
|
||||
goto exit;
|
||||
cpu_flags |= FFTS_CPU_X86_AVX2;
|
||||
|
||||
/* test if AVX512 is supported */
|
||||
if ((regs[1] & FFTS_CPU_X86_AVX512_BITS) != FFTS_CPU_X86_AVX512_BITS)
|
||||
goto exit;
|
||||
cpu_flags |= FFTS_CPU_X86_AVX512;
|
||||
|
||||
exit:
|
||||
if (extra_flags) {
|
||||
*extra_flags = cpu_extra_flags;
|
||||
}
|
||||
return cpu_flags;
|
||||
}
|
||||
#else
|
||||
/*
 * Fallback for targets without CPU detection.
 *
 * extra_flags  optional; when non-NULL it is set to 0 so that callers
 *              never read an unwritten value, matching the x86 version,
 *              which always stores the extra flags
 *
 * Always returns 0 (no features detected).
 */
int
ffts_cpu_detect(int *extra_flags)
{
    /* not implemented */
#if defined(FFTS_BUILDING_CPU_TEST)
    printf("CPU detection not implemented!!\n");
#endif
    if (extra_flags) {
        *extra_flags = 0;
    }
    return 0;
}
|
||||
#endif
|
@ -0,0 +1,54 @@
|
||||
/*
|
||||
|
||||
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
||||
|
||||
Copyright (c) 2018, Jukka Ojanen <jukka.ojanen@kolumbus.fi>
|
||||
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are met:
|
||||
* Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
* Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
* Neither the name of the organization nor the
|
||||
names of its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
||||
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
||||
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
||||
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef FFTS_CPU_H
/* the guard macro was tested but never defined, so it guarded nothing */
#define FFTS_CPU_H

#if defined (_MSC_VER) && (_MSC_VER >= 1020)
#pragma once
#endif

#include "ffts_internal.h"

/* feature flags returned by ffts_cpu_detect(); one bit per feature */
#define FFTS_CPU_X86_SSE    0x001
#define FFTS_CPU_X86_SSE2   0x002
#define FFTS_CPU_X86_SSE3   0x004
#define FFTS_CPU_X86_SSSE3  0x008
#define FFTS_CPU_X86_SSE4_1 0x010
#define FFTS_CPU_X86_SSE4_2 0x020
#define FFTS_CPU_X86_AVX    0x040
#define FFTS_CPU_X86_AVX2   0x080
#define FFTS_CPU_X86_AVX512 0x100

/*
 * Detect CPU features.
 *
 * extra_flags  optional output for additional flags; may be NULL
 *
 * Returns a combination of the FFTS_CPU_X86_* flags above, or 0 when
 * nothing was detected or detection is not implemented for the target.
 */
int
ffts_cpu_detect(int *extra_flags);

#endif /* FFTS_CPU_H */
|
File diff suppressed because it is too large
Load Diff
Loading…
Reference in new issue