You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

879 lines
42 KiB

/*
This file is part of FFTS -- The Fastest Fourier Transform in the South
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
Copyright (c) 2012, The University of Waikato
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of the organization nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*
 * Exported labels for the NEON entry-point names. No instruction is emitted
 * between or directly under these labels, so all three name the address of
 * whatever code is assembled next. Unlike the SSE symbols that follow, the
 * leading underscore here is unconditional (not guarded by __APPLE__).
 * NOTE(review): presumably these only exist so that references to the NEON
 * symbols still resolve in an x86-64 build - confirm against the callers.
 */
.globl _neon_x4
.align 4
_neon_x4:
.globl _neon_x8
.align 4
_neon_x8:
.globl _neon_x8_t
.align 4
_neon_x8_t:
/*
 * leaf_ee_init: set-up stub that runs ahead of the leaf kernels.
 * It loads r9 with the quadword stored at offset 0xe0 from rdi and clears
 * eax (which also clears the upper half of rax). There is no ret: control
 * continues straight into leaf_ee.
 * NOTE(review): rdi is presumably the plan structure and the field at 0xe0
 * its pointer to the SSE constant rows - confirm against the C definition.
 */
#ifdef __APPLE__
.globl _leaf_ee_init
_leaf_ee_init:
#else
.globl leaf_ee_init
leaf_ee_init:
#endif
#lea L_sse_constants(%rip), %r9
movq 0xe0(%rdi), %r9
xorl %eax, %eax
# eax is loop counter (init to 0)
# rcx is loop max count
# rsi is 'in' base pointer
# rdx is 'out' base pointer
# r8 is offsets pointer
# r9 is constants pointer
# scratch: rax r11 r12
# .align 4, 0x90
/*
 * leaf_ee: looping leaf kernel (an FFT leaf stage, going by the file header).
 *
 * Registers as used by the code below:
 *   rax       element index; advanced by 4 per pass, loop ends when rax == rcx
 *   rcx       end index
 *   rsi       input base; every load has the form 0xFECA(%rsi,%rax,4)
 *   rdx       output base
 *   r8        array of signed 32-bit output offsets, indexed by rax
 *   r9        constant rows; the rows at +0, +16 and +32 are read
 *   r11, r12  scratch: the two output offsets fetched for this pass
 *   xmm0-15   all overwritten
 *
 * Each pass loads eight 16-byte vectors, runs them through an add/subtract
 * network with two constant multiplies, and stores four vectors at
 * rdx + r11*4 and four at rdx + r12*4.
 *
 * Every load displacement is the same value, 0xFECA, and each load carries a
 * LEAF_EE_const_N label; the sse_leaf_ee_offsets table in this file records
 * where each displacement field lies relative to leaf_ee.
 * NOTE(review): presumably the displacements are placeholders that a code
 * generator overwrites in a copy of this routine - confirm.
 *
 * There is no ret: when the loop ends, control falls through into leaf_oo.
 */
# _leaf_ee + 9 needs 16 byte alignment
# The two movaps below encode to 5 + 4 bytes, so leaf_ee + 9 is LEAF_EE_1,
# the loop head. No alignment directive is active here (the one above is
# commented out).
#ifdef __APPLE__
.globl _leaf_ee
_leaf_ee:
#else
.globl leaf_ee
leaf_ee:
#endif
# Loop-invariant constants: xmm0 = row at r9+32, xmm8 = row at r9+0
# (xmm8 is only ever used as an xorps operand).
movaps 32(%r9), %xmm0 #83.5
movaps (%r9), %xmm8 #83.5
LEAF_EE_1:
LEAF_EE_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #83.5
LEAF_EE_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #83.5
movaps %xmm7, %xmm6 #83.5
LEAF_EE_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
movaps %xmm12, %xmm11 #83.5
subps %xmm10, %xmm12 #83.5
addps %xmm10, %xmm11 #83.5
xorps %xmm8, %xmm12 #83.5
LEAF_EE_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm9 #83.5
LEAF_EE_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm10 #83.5
addps %xmm9, %xmm6 #83.5
subps %xmm9, %xmm7 #83.5
LEAF_EE_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm13 #83.5
movaps %xmm10, %xmm9 #83.5
LEAF_EE_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm3 #83.5
movaps %xmm6, %xmm5 #83.5
LEAF_EE_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm14 #83.5
movaps %xmm3, %xmm15 #83.5
# With identical source and destination, shufps $177 swaps the two floats
# inside each 64-bit half of the register.
shufps $177, %xmm12, %xmm12 #83.5
movaps %xmm7, %xmm4 #83.5
# First output offset: sign-extended 32-bit entry at r8 + rax*4.
movslq (%r8, %rax, 4), %r11 #83.44
subps %xmm13, %xmm10 #83.5
subps %xmm14, %xmm3 #83.5
addps %xmm11, %xmm5 #83.5
subps %xmm11, %xmm6 #83.5
subps %xmm12, %xmm4 #83.5
addps %xmm12, %xmm7 #83.5
addps %xmm13, %xmm9 #83.5
addps %xmm14, %xmm15 #83.5
# Two copies of the row at r9+16, one per multiply; the swapped halves are
# scaled by xmm0 (row at r9+32) further down.
movaps 16(%r9), %xmm12 #83.5
movaps %xmm9, %xmm1 #83.5
movaps 16(%r9), %xmm11 #83.5
movaps %xmm5, %xmm2 #83.5
mulps %xmm10, %xmm12 #83.5
subps %xmm15, %xmm9 #83.5
addps %xmm15, %xmm1 #83.5
mulps %xmm3, %xmm11 #83.5
addps %xmm1, %xmm2 #83.5
subps %xmm1, %xmm5 #83.5
shufps $177, %xmm10, %xmm10 #83.5
xorps %xmm8, %xmm9 #83.5
shufps $177, %xmm3, %xmm3 #83.5
movaps %xmm6, %xmm1 #83.5
mulps %xmm0, %xmm10 #83.5
movaps %xmm4, %xmm13 #83.5
mulps %xmm0, %xmm3 #83.5
subps %xmm10, %xmm12 #83.5
addps %xmm3, %xmm11 #83.5
movaps %xmm12, %xmm3 #83.5
movaps %xmm7, %xmm14 #83.5
shufps $177, %xmm9, %xmm9 #83.5
subps %xmm11, %xmm12 #83.5
addps %xmm11, %xmm3 #83.5
subps %xmm9, %xmm1 #83.5
addps %xmm9, %xmm6 #83.5
addps %xmm3, %xmm4 #83.5
subps %xmm3, %xmm13 #83.5
xorps %xmm8, %xmm12 #83.5
movaps %xmm2, %xmm3 #83.5
shufps $177, %xmm12, %xmm12 #83.5
movaps %xmm6, %xmm9 #83.5
# Second output offset: the entry 8 bytes (two entries) past the first one.
# It must be read before rax is advanced on the line after next.
movslq 8(%r8, %rax, 4), %r12 #83.59
# From here on, movlhps joins the low halves of two registers and
# shufps $238 joins their high halves, forming the vectors that are stored.
movlhps %xmm4, %xmm3 #83.5
addq $4, %rax
shufps $238, %xmm4, %xmm2 #83.5
movaps %xmm1, %xmm4 #83.5
#movntdq %xmm3, (%rdx,%r11,4) #83.5
subps %xmm12, %xmm7 #83.5
addps %xmm12, %xmm14 #83.5
movlhps %xmm7, %xmm4 #83.5
shufps $238, %xmm7, %xmm1 #83.5
movaps %xmm5, %xmm7 #83.5
movlhps %xmm13, %xmm7 #83.5
movlhps %xmm14, %xmm9 #83.5
shufps $238, %xmm13, %xmm5 #83.5
shufps $238, %xmm14, %xmm6 #83.5
# Low-half vectors go to the 64 bytes at rdx + r11*4, high-half vectors to
# the 64 bytes at rdx + r12*4. movaps faults unless both are 16-byte aligned.
movaps %xmm3, (%rdx,%r11,4) #83.5
movaps %xmm4, 16(%rdx,%r11,4) #83.5
movaps %xmm7, 32(%rdx,%r11,4) #83.5
movaps %xmm9, 48(%rdx,%r11,4) #83.5
movaps %xmm2, (%rdx,%r12,4) #83.5
movaps %xmm1, 16(%rdx,%r12,4) #83.5
movaps %xmm5, 32(%rdx,%r12,4) #83.5
movaps %xmm6, 48(%rdx,%r12,4) #83.5
# Equality test only: the loop terminates only if rax hits rcx exactly.
cmpq %rcx, %rax
jne LEAF_EE_1
/*
 * leaf_oo: looping leaf kernel, same register interface as leaf_ee
 * (rax index, rcx end index, rsi input base, rdx output base, r8 table of
 * signed 32-bit output offsets, r9 constant rows, r11/r12 scratch).
 *
 * Each pass loads eight 16-byte vectors through 0xFECA(%rsi,%rax,4) loads
 * tagged LEAF_OO_const_N, combines them with additions, subtractions and
 * xorps against the row at r9+0 (no multiplies), and stores four vectors at
 * rdx + r11*4 and four at rdx + r12*4. rax advances by 4 per pass and the
 * loop ends when rax == rcx. xmm0 is the only vector register left untouched.
 *
 * The sse_leaf_oo_offsets table in this file records where each 0xFECA
 * displacement field lies relative to leaf_oo.
 * NOTE(review): presumably those displacements are placeholders rewritten by
 * a code generator - confirm.
 *
 * rax is not initialised here; it keeps whatever value it has on entry.
 * There is no ret: when the loop ends, control falls through into leaf_eo.
 */
# _leaf_oo + 4 needs to be 16 byte aligned
# The single movaps below encodes to 4 bytes, so leaf_oo + 4 is LEAF_OO_1,
# the loop head.
#ifdef __APPLE__
.globl _leaf_oo
_leaf_oo:
#else
.globl leaf_oo
leaf_oo:
#endif
# Loop-invariant xorps operand: row at r9+0.
movaps (%r9), %xmm5 #92.7
LEAF_OO_1:
LEAF_OO_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm4 #93.5
movaps %xmm4, %xmm6 #93.5
LEAF_OO_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #93.5
LEAF_OO_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm10 #93.5
addps %xmm7, %xmm6 #93.5
subps %xmm7, %xmm4 #93.5
LEAF_OO_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm8 #93.5
movaps %xmm10, %xmm9 #93.5
LEAF_OO_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm1 #93.5
movaps %xmm6, %xmm3 #93.5
LEAF_OO_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm11 #93.5
movaps %xmm1, %xmm2 #93.5
LEAF_OO_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm14 #93.5
movaps %xmm4, %xmm15 #93.5
LEAF_OO_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #93.5
movaps %xmm14, %xmm13 #93.5
# First output offset: sign-extended 32-bit entry at r8 + rax*4.
movslq (%r8, %rax, 4), %r11 #83.44
subps %xmm8, %xmm10 #93.5
addps %xmm8, %xmm9 #93.5
addps %xmm11, %xmm2 #93.5
subps %xmm12, %xmm14 #93.5
subps %xmm11, %xmm1 #93.5
addps %xmm12, %xmm13 #93.5
addps %xmm9, %xmm3 #93.5
subps %xmm9, %xmm6 #93.5
xorps %xmm5, %xmm10 #93.5
xorps %xmm5, %xmm14 #93.5
# With identical source and destination, shufps $177 swaps the two floats
# inside each 64-bit half of the register.
shufps $177, %xmm10, %xmm10 #93.5
movaps %xmm2, %xmm9 #93.5
shufps $177, %xmm14, %xmm14 #93.5
movaps %xmm6, %xmm7 #93.5
# Second output offset (two entries past the first); it is read before rax
# is advanced on the next line.
movslq 8(%r8, %rax, 4), %r12 #83.59
addq $4, %rax #92.18
addps %xmm10, %xmm4 #93.5
addps %xmm13, %xmm9 #93.5
subps %xmm13, %xmm2 #93.5
subps %xmm10, %xmm15 #93.5
movaps %xmm1, %xmm13 #93.5
movaps %xmm2, %xmm8 #93.5
# movlhps joins the low halves of two registers, shufps $238 their high
# halves, forming the vectors that are stored.
movlhps %xmm4, %xmm7 #93.5
subps %xmm14, %xmm13 #93.5
addps %xmm14, %xmm1 #93.5
shufps $238, %xmm4, %xmm6 #93.5
movaps %xmm3, %xmm14 #93.5
movaps %xmm9, %xmm4 #93.5
movlhps %xmm15, %xmm14 #93.5
movlhps %xmm13, %xmm4 #93.5
movlhps %xmm1, %xmm8 #93.5
shufps $238, %xmm15, %xmm3 #93.5
shufps $238, %xmm13, %xmm9 #93.5
shufps $238, %xmm1, %xmm2 #93.5
# Low-half vectors go to rdx + r11*4, high-half vectors to rdx + r12*4.
movaps %xmm14, (%rdx,%r11,4) #93.5
movaps %xmm7, 16(%rdx,%r11,4) #93.5
movaps %xmm4, 32(%rdx,%r11,4) #93.5
movaps %xmm8, 48(%rdx,%r11,4) #93.5
movaps %xmm3, (%rdx,%r12,4) #93.5
movaps %xmm6, 16(%rdx,%r12,4) #93.5
movaps %xmm9, 32(%rdx,%r12,4) #93.5
movaps %xmm2, 48(%rdx,%r12,4) #93.5
# Equality test only: the loop terminates only if rax hits rcx exactly.
cmpq %rcx, %rax
jne LEAF_OO_1 # Prob 95% #92.14
/*
 * leaf_eo: straight-line leaf kernel (one pass, no loop).
 *
 * Uses the same registers as the looping leaf kernels: rax index, rsi input
 * base, rdx output base, r8 table of signed 32-bit output offsets, r9
 * constant rows (the rows at +0, +0x30 and +0x40 are read), r11/r12 scratch.
 * rcx is not read. rax is increased by 4 once. xmm0 is the only vector
 * register left untouched.
 *
 * Eight 16-byte vectors are loaded through 0xFECA(%rsi,%rax,4) loads tagged
 * LEAF_EO_const_N. Two of the vectors for rdx + r12*4 are stored half-way
 * through, the rest of the output at the end.
 *
 * The sse_leaf_eo_offsets table in this file records where each 0xFECA
 * displacement field lies relative to leaf_eo.
 * NOTE(review): presumably those displacements are placeholders rewritten by
 * a code generator - confirm.
 *
 * There is no ret: control falls through into leaf_oe.
 */
#ifdef __APPLE__
.globl _leaf_eo
_leaf_eo:
#else
.globl leaf_eo
leaf_eo:
#endif
LEAF_EO_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm9 #88.5
LEAF_EO_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #88.5
movaps %xmm9, %xmm11 #88.5
LEAF_EO_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm5 #88.5
movaps %xmm7, %xmm6 #88.5
LEAF_EO_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
subps %xmm5, %xmm7 #88.5
addps %xmm4, %xmm11 #88.5
subps %xmm4, %xmm9 #88.5
addps %xmm5, %xmm6 #88.5
# xmm3 = row at r9+0; used only as an xorps operand until it is reused as a
# temporary near the end.
movaps (%r9), %xmm3 #88.5
movaps %xmm11, %xmm10 #88.5
xorps %xmm3, %xmm7 #88.5
movaps %xmm9, %xmm8 #88.5
# With identical source and destination, shufps $177 swaps the two floats
# inside each 64-bit half of the register.
shufps $177, %xmm7, %xmm7 #88.5
addps %xmm6, %xmm10 #88.5
subps %xmm6, %xmm11 #88.5
subps %xmm7, %xmm8 #88.5
addps %xmm7, %xmm9 #88.5
# Output offsets: r12 from the entry two slots past index rax, r11 from the
# entry at index rax. Both are read before rax is advanced.
movslq 8(%r8, %rax, 4), %r12 #83.59
movaps %xmm10, %xmm2 #88.5
movslq (%r8, %rax, 4), %r11 #83.44
movaps %xmm11, %xmm1 #88.5
shufps $238, %xmm8, %xmm10 #88.5
shufps $238, %xmm9, %xmm11 #88.5
# Early store of the first two vectors of the r12 group.
movaps %xmm10, (%rdx,%r12,4) #88.5
movaps %xmm11, 16(%rdx,%r12,4) #88.5
LEAF_EO_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm15 #88.5
LEAF_EO_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #88.5
movaps %xmm15, %xmm14 #88.5
LEAF_EO_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm4 #88.5
addps %xmm12, %xmm14 #88.5
subps %xmm12, %xmm15 #88.5
LEAF_EO_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm13 #88.5
movaps %xmm4, %xmm5 #88.5
movaps %xmm14, %xmm7 #88.5
addps %xmm13, %xmm5 #88.5
subps %xmm13, %xmm4 #88.5
movlhps %xmm8, %xmm2 #88.5
movaps %xmm5, %xmm8 #88.5
movlhps %xmm15, %xmm7 #88.5
xorps %xmm3, %xmm15 #88.5
movaps %xmm5, %xmm6 #88.5
subps %xmm14, %xmm5 #88.5
addps %xmm14, %xmm6 #88.5
movlhps %xmm9, %xmm1 #88.5
movaps %xmm4, %xmm14 #88.5
movlhps %xmm4, %xmm8 #88.5
movaps %xmm1, %xmm12 #88.5
shufps $177, %xmm15, %xmm15 #88.5
# Multiply step: the rows at r9+0x30 scale xmm7 and xmm8 directly, the row
# at r9+0x40 scales their pair-swapped copies; the products are then
# subtracted (xmm11) and added (xmm9).
movaps 0x30(%r9), %xmm11 #88.5
addq $4, %rax #90.5
subps %xmm15, %xmm14 #88.5
mulps %xmm7, %xmm11 #88.5
addps %xmm15, %xmm4 #88.5
movaps 0x30(%r9), %xmm9 #88.5
movaps 0x40(%r9), %xmm15 #88.5
shufps $177, %xmm7, %xmm7 #88.5
mulps %xmm8, %xmm9 #88.5
mulps %xmm15, %xmm7 #88.5
shufps $177, %xmm8, %xmm8 #88.5
subps %xmm7, %xmm11 #88.5
mulps %xmm15, %xmm8 #88.5
movaps %xmm11, %xmm10 #88.5
addps %xmm8, %xmm9 #88.5
shufps $238, %xmm14, %xmm6 #88.5
subps %xmm9, %xmm11 #88.5
addps %xmm9, %xmm10 #88.5
# Last use of xmm3 as the xorps operand; the next line overwrites it.
xorps %xmm3, %xmm11 #88.5
movaps %xmm2, %xmm3 #88.5
shufps $177, %xmm11, %xmm11 #88.5
subps %xmm10, %xmm3 #88.5
addps %xmm10, %xmm2 #88.5
addps %xmm11, %xmm12 #88.5
subps %xmm11, %xmm1 #88.5
shufps $238, %xmm4, %xmm5 #88.5
# Remaining stores: last two vectors of the r12 group, then all four vectors
# of the r11 group.
movaps %xmm5, 48(%rdx,%r12,4) #88.5
movaps %xmm6, 32(%rdx,%r12,4) #88.5
movaps %xmm2, (%rdx,%r11,4) #88.5
movaps %xmm1, 16(%rdx,%r11,4) #88.5
movaps %xmm3, 32(%rdx,%r11,4) #88.5
movaps %xmm12, 48(%rdx,%r11,4) #88.5
/*
 * leaf_oe: straight-line leaf kernel (one pass, no loop).
 *
 * Uses the same registers as the other leaf kernels: rax index, rsi input
 * base, rdx output base, r8 table of signed 32-bit output offsets, r9
 * constant rows (the rows at +0, +0x30 and +0x40 are read), r11/r12 scratch.
 * rcx is not read. rax is increased by 4 once. xmm1 is the only vector
 * register left untouched.
 *
 * Eight 16-byte vectors are loaded through 0xFECA(%rsi,%rax,4) loads tagged
 * LEAF_OE_const_N. The first two vectors of the rdx + r11*4 group are stored
 * part-way through; the remaining six vectors are stored at the end.
 *
 * The sse_leaf_oe_offsets table in this file records where each 0xFECA
 * displacement field lies relative to leaf_oe.
 * NOTE(review): presumably those displacements are placeholders rewritten by
 * a code generator - confirm.
 *
 * There is no ret: control falls through to the leaf_end label and on into
 * the code that follows it.
 */
#ifdef __APPLE__
.globl _leaf_oe
_leaf_oe:
#else
.globl leaf_oe
leaf_oe:
#endif
# xmm0 = row at r9+0; used only as an xorps operand.
movaps (%r9), %xmm0 #59.5
#movaps 0x20(%r9), %xmm1 #59.5
LEAF_OE_const_2:
movaps 0xFECA(%rsi,%rax,4), %xmm6 #70.5
LEAF_OE_const_3:
movaps 0xFECA(%rsi,%rax,4), %xmm8 #70.5
movaps %xmm6, %xmm10 #70.5
# shufps $228 keeps the low half of the destination and takes the high half
# of the source, so xmm10 = {xmm6 low, xmm8 high} and, two lines down,
# xmm8 = {xmm8 low, xmm6 high}.
shufps $228, %xmm8, %xmm10 #70.5
movaps %xmm10, %xmm9 #70.5
shufps $228, %xmm6, %xmm8 #70.5
LEAF_OE_const_0:
movaps 0xFECA(%rsi,%rax,4), %xmm12 #70.5
LEAF_OE_const_1:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
movaps %xmm12, %xmm14 #70.5
# First output offset: sign-extended 32-bit entry at r8 + rax*4.
movslq (%r8, %rax, 4), %r11 #83.44
addps %xmm8, %xmm9 #70.5
subps %xmm8, %xmm10 #70.5
addps %xmm7, %xmm14 #70.5
subps %xmm7, %xmm12 #70.5
movaps %xmm9, %xmm4 #70.5
movaps %xmm14, %xmm13 #70.5
shufps $238, %xmm10, %xmm4 #70.5
xorps %xmm0, %xmm10 #70.5
# With identical source and destination, shufps $177 swaps the two floats
# inside each 64-bit half of the register.
shufps $177, %xmm10, %xmm10 #70.5
movaps %xmm12, %xmm11 #70.5
movaps %xmm14, %xmm5 #70.5
addps %xmm9, %xmm13 #70.5
subps %xmm10, %xmm11 #70.5
subps %xmm9, %xmm14 #70.5
shufps $238, %xmm12, %xmm5 #70.5
addps %xmm10, %xmm12 #70.5
# Second output offset (two entries past the first); read before rax is
# advanced further down.
movslq 8(%r8, %rax, 4), %r12 #83.59
movlhps %xmm11, %xmm13 #70.5
# Early store of the first vector of the r11 group; xmm13 is then reused for
# the constant row at r9+0x30.
movaps %xmm13, (%rdx,%r11,4) #70.5
movaps 0x30(%r9), %xmm13 #70.5
movlhps %xmm12, %xmm14 #70.5
movaps 0x40(%r9), %xmm12 #70.5
# Multiply step: the row at r9+0x30 scales xmm5 and xmm4 directly, the row at
# r9+0x40 scales their pair-swapped copies; the products are subtracted for
# xmm13 and added for xmm5.
mulps %xmm5, %xmm13 #70.5
shufps $177, %xmm5, %xmm5 #70.5
mulps %xmm12, %xmm5 #70.5
movaps %xmm14, 16(%rdx,%r11,4) #70.5
subps %xmm5, %xmm13 #70.5
movaps 0x30(%r9), %xmm5 #70.5
mulps %xmm4, %xmm5 #70.5
shufps $177, %xmm4, %xmm4 #70.5
mulps %xmm12, %xmm4 #70.5
LEAF_OE_const_4:
movaps 0xFECA(%rsi,%rax,4), %xmm9 #70.5
addps %xmm4, %xmm5 #70.5
LEAF_OE_const_6:
movaps 0xFECA(%rsi,%rax,4), %xmm7 #70.5
movaps %xmm9, %xmm3 #70.5
LEAF_OE_const_7:
movaps 0xFECA(%rsi,%rax,4), %xmm2 #70.5
movaps %xmm7, %xmm6 #70.5
LEAF_OE_const_5:
movaps 0xFECA(%rsi,%rax,4), %xmm15 #70.5
movaps %xmm13, %xmm4 #70.5
subps %xmm2, %xmm7 #70.5
addps %xmm15, %xmm3 #70.5
subps %xmm15, %xmm9 #70.5
addps %xmm2, %xmm6 #70.5
subps %xmm5, %xmm13 #70.5
addps %xmm5, %xmm4 #70.5
xorps %xmm0, %xmm7 #70.5
addq $4, %rax #72.5
movaps %xmm3, %xmm2 #70.5
shufps $177, %xmm7, %xmm7 #70.5
movaps %xmm9, %xmm8 #70.5
xorps %xmm0, %xmm13 #70.5
addps %xmm6, %xmm2 #70.5
subps %xmm7, %xmm8 #70.5
subps %xmm6, %xmm3 #70.5
addps %xmm7, %xmm9 #70.5
movaps %xmm2, %xmm10 #70.5
movaps %xmm3, %xmm11 #70.5
shufps $238, %xmm8, %xmm2 #70.5
shufps $238, %xmm9, %xmm3 #70.5
movaps %xmm2, %xmm14 #70.5
shufps $177, %xmm13, %xmm13 #70.5
subps %xmm4, %xmm14 #70.5
addps %xmm4, %xmm2 #70.5
movaps %xmm3, %xmm4 #70.5
subps %xmm13, %xmm3 #70.5
addps %xmm13, %xmm4 #70.5
movlhps %xmm8, %xmm10 #70.5
movlhps %xmm9, %xmm11 #70.5
# Remaining stores: last two vectors of the r11 group, then all four vectors
# of the r12 group.
movaps %xmm10, 32(%rdx,%r11,4) #70.5
movaps %xmm11, 48(%rdx,%r11,4) #70.5
movaps %xmm2, (%rdx,%r12,4) #70.5
movaps %xmm3, 16(%rdx,%r12,4) #70.5
movaps %xmm14, 32(%rdx,%r12,4) #70.5
movaps %xmm4, 48(%rdx,%r12,4) #70.5
/*
 * leaf_end: marker label placed immediately after the last leaf_oe
 * instruction. It emits no code of its own, so it names the same address as
 * the entry label that follows it.
 * NOTE(review): presumably used to compute the byte length of the leaf
 * kernels (for example leaf_end - leaf_oe) - confirm against the callers.
 */
#ifdef __APPLE__
.globl _leaf_end
_leaf_end:
#else
.globl leaf_end
leaf_end:
#endif
/*
 * x_init: set-up stub for the x4 / x8 kernels.
 * Loads xmm3 with the 16-byte row at r9+0 (x4 and x8_soft use xmm3 only as
 * an xorps operand) and loads r8 with the quadword stored at offset 0x20
 * from rdi. r9 is not set here and must already be valid on entry.
 * There is no ret: control continues straight into x4.
 * NOTE(review): the field at 0x20(%rdi) is presumably the pointer to the
 * multiplier table that x4 reads through r8 - confirm against the C
 * definition of the structure rdi points to.
 */
#ifdef __APPLE__
.globl _x_init
_x_init:
#else
.globl x_init
x_init:
#endif
#movaps L_sse_constants(%rip), %xmm3 #34.3
movaps (%r9), %xmm3 #34.3
movq 0x20(%rdi),%r8
/*
 * x4: in-place pass over eight 16-byte vectors at rdx+0 .. rdx+112.
 *
 * Inputs:
 *   rdx   data pointer; all eight vectors are read and written back
 *   r8    pointer to four 16-byte multiplier rows (r8+0 .. r8+48)
 *   xmm3  xorps operand; not loaded here. x_init loads it from (%r9) before
 *         falling through, so a direct entry at x4 needs it set beforehand.
 * Clobbers xmm0-xmm2 and xmm4-xmm15. Returns with ret.
 *
 * The routine runs the same sequence twice. The first half works on the
 * vectors at offsets 0, 32, 64 and 96 with the rows at r8+0 and r8+16; the
 * second half works on offsets 16, 48, 80 and 112 with the rows at r8+32 and
 * r8+48. In each half, two vectors are multiplied by one row, their
 * pair-swapped copies by the other, and the sum and difference of the
 * results are added to and subtracted from the remaining two vectors.
 * NOTE(review): this has the shape of a radix-4 step on interleaved complex
 * floats with split twiddle tables - confirm against the reference C code.
 */
#ifdef __APPLE__
.globl _x4
_x4:
#else
.globl x4
x4:
#endif
movaps 64(%rdx), %xmm0 #34.3
movaps 96(%rdx), %xmm1 #34.3
movaps (%rdx), %xmm7 #34.3
movaps (%r8), %xmm4 #const
movaps %xmm7, %xmm9 #34.3
movaps %xmm4, %xmm6 #34.3
movaps 16(%r8), %xmm2 #const
mulps %xmm0, %xmm6 #34.3
mulps %xmm1, %xmm4 #34.3
# With identical source and destination, shufps $177 swaps the two floats
# inside each 64-bit half of the register.
shufps $177, %xmm0, %xmm0 #34.3
shufps $177, %xmm1, %xmm1 #34.3
mulps %xmm2, %xmm0 #34.3
mulps %xmm1, %xmm2 #34.3
subps %xmm0, %xmm6 #34.3
addps %xmm2, %xmm4 #34.3
movaps %xmm6, %xmm5 #34.3
subps %xmm4, %xmm6 #34.3
addps %xmm4, %xmm5 #34.3
movaps 32(%rdx), %xmm8 #34.3
xorps %xmm3, %xmm6 #34.3
shufps $177, %xmm6, %xmm6 #34.3
movaps %xmm8, %xmm10 #34.3
# Early load of the vector at offset 112 for the second half.
movaps 112(%rdx), %xmm12 #34.3
subps %xmm5, %xmm9 #34.3
addps %xmm5, %xmm7 #34.3
addps %xmm6, %xmm10 #34.3
subps %xmm6, %xmm8 #34.3
# First half written back to offsets 0, 32, 64 and 96.
movaps %xmm7, (%rdx) #34.3
movaps %xmm8, 32(%rdx) #34.3
movaps %xmm9, 64(%rdx) #34.3
movaps %xmm10, 96(%rdx) #34.3
# Second half: same sequence on offsets 16, 48, 80 and 112.
movaps 32(%r8), %xmm14 #const #34.3
movaps 80(%rdx), %xmm11 #34.3
movaps %xmm14, %xmm0 #34.3
movaps 48(%r8), %xmm13 #const #34.3
mulps %xmm11, %xmm0 #34.3
mulps %xmm12, %xmm14 #34.3
shufps $177, %xmm11, %xmm11 #34.3
shufps $177, %xmm12, %xmm12 #34.3
mulps %xmm13, %xmm11 #34.3
mulps %xmm12, %xmm13 #34.3
subps %xmm11, %xmm0 #34.3
addps %xmm13, %xmm14 #34.3
movaps %xmm0, %xmm15 #34.3
subps %xmm14, %xmm0 #34.3
addps %xmm14, %xmm15 #34.3
xorps %xmm3, %xmm0 #34.3
movaps 16(%rdx), %xmm1 #34.3
movaps 48(%rdx), %xmm2 #34.3
movaps %xmm1, %xmm4 #34.3
shufps $177, %xmm0, %xmm0 #34.3
movaps %xmm2, %xmm5 #34.3
addps %xmm15, %xmm1 #34.3
subps %xmm0, %xmm2 #34.3
subps %xmm15, %xmm4 #34.3
addps %xmm0, %xmm5 #34.3
# Second half written back to offsets 16, 48, 80 and 112.
movaps %xmm1, 16(%rdx) #34.3
movaps %xmm2, 48(%rdx) #34.3
movaps %xmm4, 80(%rdx) #34.3
movaps %xmm5, 112(%rdx) #34.3
ret
/*
 * x8_soft: in-place looped pass over eight equally spaced data streams.
 *
 * Inputs:
 *   rdx   base of stream 0
 *   rcx   loop bound; also the stream spacing: stream k starts at
 *         rdx + k * rcx * 4 bytes
 *   r8    pointer to the multiplier rows, 96 bytes (six rows) per pass
 *   xmm3  xorps operand; not loaded here, so it must be set before entry
 *
 * Set-up performed here: eax = 0, rbx = rdx, rsi = r8, and r9 .. r15 are the
 * bases of streams 1 .. 7.
 *
 * Each pass reads one 16-byte vector from every stream at byte offset rax*4,
 * uses the six rows at rsi, writes the eight results back to the same
 * addresses, then advances rsi by 96 and rax by 4. The loop ends when
 * rax == rcx; the test is for equality and comes after the first pass, so
 * rcx has to be a non-zero multiple of 4 for the loop to terminate.
 *
 * Clobbers rax, rbx, rsi, r9-r15, xmm0-xmm2 and xmm4-xmm15. Returns with ret.
 * NOTE(review): rbx and r12-r15 are overwritten without being saved; under
 * the System V AMD64 convention those are callee-saved, so the caller of
 * this routine presumably preserves them itself - confirm.
 */
# _x8_soft + 5 needs to be 16 byte aligned
#ifdef __APPLE__
.globl _x8_soft
_x8_soft:
#else
.globl x8_soft
x8_soft:
#endif
xorl %eax, %eax
movq %rdx, %rbx
movq %r8, %rsi
# Stream bases 1..7: each one is rcx*4 bytes past the previous one.
leaq (%rdx,%rcx,4), %r9
leaq (%r9,%rcx,4), %r10
leaq (%r10,%rcx,4), %r11
leaq (%r11,%rcx,4), %r12
leaq (%r12,%rcx,4), %r13
leaq (%r13,%rcx,4), %r14
leaq (%r14,%rcx,4), %r15
X8_soft_loop:
# Streams 2 and 3 with the rows at rsi+0 and rsi+16.
movaps (%rsi), %xmm9
movaps (%r10,%rax,4), %xmm6
movaps %xmm9, %xmm11
movaps (%r11,%rax,4), %xmm7
movaps 16(%rsi), %xmm8
mulps %xmm6, %xmm11
mulps %xmm7, %xmm9
# With identical source and destination, shufps $177 swaps the two floats
# inside each 64-bit half of the register.
shufps $177, %xmm6, %xmm6
mulps %xmm8, %xmm6
shufps $177, %xmm7, %xmm7
subps %xmm6, %xmm11
mulps %xmm7, %xmm8
movaps %xmm11, %xmm10
addps %xmm8, %xmm9
movaps 32(%rsi), %xmm15
addps %xmm9, %xmm10
subps %xmm9, %xmm11
# Stream 0, then streams 4 and 6 with the rows at rsi+32 and rsi+48.
movaps (%rbx,%rax,4), %xmm5
movaps %xmm15, %xmm6
movaps (%r12,%rax,4), %xmm12
movaps %xmm5, %xmm2
movaps (%r14,%rax,4), %xmm13
xorps %xmm3, %xmm11 #const
movaps 48(%rsi), %xmm14
subps %xmm10, %xmm2
mulps %xmm12, %xmm6
addps %xmm10, %xmm5
mulps %xmm13, %xmm15
movaps 64(%rsi), %xmm10
movaps %xmm5, %xmm0
shufps $177, %xmm12, %xmm12
shufps $177, %xmm13, %xmm13
mulps %xmm14, %xmm12
mulps %xmm13, %xmm14
subps %xmm12, %xmm6
addps %xmm14, %xmm15
# Streams 5 and 7 with the rows at rsi+64 and rsi+80.
movaps (%r13,%rax,4), %xmm7
movaps %xmm10, %xmm13
movaps (%r15,%rax,4), %xmm8
movaps %xmm6, %xmm12
movaps 80(%rsi), %xmm9
# All six rows for this pass have been loaded; step to the next 96 bytes.
addq $96, %rsi
mulps %xmm7, %xmm13
subps %xmm15, %xmm6
addps %xmm15, %xmm12
mulps %xmm8, %xmm10
subps %xmm12, %xmm0
addps %xmm12, %xmm5
shufps $177, %xmm7, %xmm7
xorps %xmm3, %xmm6 #const
shufps $177, %xmm8, %xmm8
movaps %xmm2, %xmm12
mulps %xmm9, %xmm7
mulps %xmm8, %xmm9
subps %xmm7, %xmm13
addps %xmm9, %xmm10
# Stream 1.
movaps (%r9,%rax,4), %xmm4
shufps $177, %xmm11, %xmm11
movaps %xmm4, %xmm1
shufps $177, %xmm6, %xmm6
addps %xmm11, %xmm1
subps %xmm11, %xmm4
addps %xmm6, %xmm12
subps %xmm6, %xmm2
movaps %xmm13, %xmm11
movaps %xmm4, %xmm14
movaps %xmm1, %xmm6
subps %xmm10, %xmm13
addps %xmm10, %xmm11
xorps %xmm3, %xmm13 #const
addps %xmm11, %xmm4
subps %xmm11, %xmm14
shufps $177, %xmm13, %xmm13
# Write-back: one result vector per stream, at the addresses read above.
movaps %xmm5, (%rbx,%rax,4)
movaps %xmm4, (%r9,%rax,4)
movaps %xmm2, (%r10,%rax,4)
subps %xmm13, %xmm1
addps %xmm13, %xmm6
movaps %xmm1, (%r11,%rax,4)
movaps %xmm0, (%r12,%rax,4)
movaps %xmm14, (%r13,%rax,4)
movaps %xmm12, (%r14,%rax,4)
movaps %xmm6, (%r15,%rax,4)
addq $4, %rax
cmpq %rcx, %rax
jne X8_soft_loop
ret
/*
 * x8_hard: variant of x8_soft that addresses all eight data streams from a
 * single base register.
 *
 * Inputs:
 *   rdx   data base; every data access is 0xFECA(%rdx,%rax,4)
 *   rax   element index; NOT initialised here, advanced by 4 per pass
 *   rcx   loop bound; the loop ends when rax == rcx (equality test only)
 *   r8    pointer to the multiplier rows; read directly and advanced by 96
 *         bytes (six rows) per pass
 *   r9    pointer to the row loaded into xmm5, the xorps operand
 * Clobbers rax, r8 and xmm0-xmm15. Returns with ret.
 *
 * The arithmetic mirrors x8_soft instruction for instruction, with the roles
 * of xmm3 and xmm5 exchanged (here xmm5 is the xorps operand and xmm3 holds
 * data). The loads are tagged X8_const_N and the stores X8_const1_N; all of
 * them use the same displacement, 0xFECA.
 * NOTE(review): presumably those displacements are placeholders that a code
 * generator replaces with per-stream offsets. No offset table for the
 * X8_const labels is visible in this file - confirm how they are located.
 *
 * The closing ret is required: the sse_leaf_*_offsets tables are emitted
 * directly after this routine in the same section, so without it execution
 * would run from the loop exit into that table data. x4 and x8_soft end the
 * same way.
 */
#ifdef __APPLE__
.globl _x8_hard
_x8_hard:
#else
.globl x8_hard
x8_hard:
#endif
# Loop-invariant xorps operand: row at r9+0.
movaps (%r9), %xmm5
X8_loop:
movaps (%r8), %xmm9
X8_const_2:
movaps 0xFECA(%rdx,%rax,4), %xmm6
movaps %xmm9, %xmm11
X8_const_3:
movaps 0xFECA(%rdx,%rax,4), %xmm7
movaps 16(%r8), %xmm8
mulps %xmm6, %xmm11
mulps %xmm7, %xmm9
# With identical source and destination, shufps $177 swaps the two floats
# inside each 64-bit half of the register.
shufps $177, %xmm6, %xmm6
mulps %xmm8, %xmm6
shufps $177, %xmm7, %xmm7
subps %xmm6, %xmm11
mulps %xmm7, %xmm8
movaps %xmm11, %xmm10
addps %xmm8, %xmm9
movaps 32(%r8), %xmm15
addps %xmm9, %xmm10
subps %xmm9, %xmm11
X8_const_0:
movaps 0xFECA(%rdx,%rax,4), %xmm3
movaps %xmm15, %xmm6
X8_const_4:
movaps 0xFECA(%rdx,%rax,4), %xmm12
movaps %xmm3, %xmm2
X8_const_6:
movaps 0xFECA(%rdx,%rax,4), %xmm13
xorps %xmm5, %xmm11
movaps 48(%r8), %xmm14
subps %xmm10, %xmm2
mulps %xmm12, %xmm6
addps %xmm10, %xmm3
mulps %xmm13, %xmm15
movaps 64(%r8), %xmm10
movaps %xmm3, %xmm0
shufps $177, %xmm12, %xmm12
shufps $177, %xmm13, %xmm13
mulps %xmm14, %xmm12
mulps %xmm13, %xmm14
subps %xmm12, %xmm6
addps %xmm14, %xmm15
X8_const_5:
movaps 0xFECA(%rdx,%rax,4), %xmm7
movaps %xmm10, %xmm13
X8_const_7:
movaps 0xFECA(%rdx,%rax,4), %xmm8
movaps %xmm6, %xmm12
movaps 80(%r8), %xmm9
# All six rows for this pass have been loaded; step to the next 96 bytes.
addq $96, %r8
mulps %xmm7, %xmm13
subps %xmm15, %xmm6
addps %xmm15, %xmm12
mulps %xmm8, %xmm10
subps %xmm12, %xmm0
addps %xmm12, %xmm3
shufps $177, %xmm7, %xmm7
xorps %xmm5, %xmm6
shufps $177, %xmm8, %xmm8
movaps %xmm2, %xmm12
mulps %xmm9, %xmm7
mulps %xmm8, %xmm9
subps %xmm7, %xmm13
addps %xmm9, %xmm10
X8_const_1:
movaps 0xFECA(%rdx,%rax,4), %xmm4
shufps $177, %xmm11, %xmm11
movaps %xmm4, %xmm1
shufps $177, %xmm6, %xmm6
addps %xmm11, %xmm1
subps %xmm11, %xmm4
addps %xmm6, %xmm12
subps %xmm6, %xmm2
movaps %xmm13, %xmm11
movaps %xmm4, %xmm14
movaps %xmm1, %xmm6
subps %xmm10, %xmm13
addps %xmm10, %xmm11
xorps %xmm5, %xmm13
addps %xmm11, %xmm4
subps %xmm11, %xmm14
shufps $177, %xmm13, %xmm13
# Write-back of the eight result vectors.
X8_const1_0:
movaps %xmm3, 0xFECA(%rdx,%rax,4)
X8_const1_1:
movaps %xmm4, 0xFECA(%rdx,%rax,4)
X8_const1_2:
movaps %xmm2, 0xFECA(%rdx,%rax,4)
subps %xmm13, %xmm1
addps %xmm13, %xmm6
X8_const1_3:
movaps %xmm1, 0xFECA(%rdx,%rax,4)
X8_const1_4:
movaps %xmm0, 0xFECA(%rdx,%rax,4)
X8_const1_5:
movaps %xmm14, 0xFECA(%rdx,%rax,4)
X8_const1_6:
movaps %xmm12, 0xFECA(%rdx,%rax,4)
X8_const1_7:
movaps %xmm6, 0xFECA(%rdx,%rax,4)
addq $4, %rax
cmpq %rcx, %rax
jne X8_loop
# Return to the caller instead of running into the offset tables below.
ret
/*
 * Displacement-offset tables for the four leaf kernels.
 *
 * Each table has eight 32-bit entries, one per LEAF_xx_const_N label, listed
 * in order N = 0 .. 7 (the labels themselves appear in a different order in
 * the code). An entry is the byte distance from the kernel entry label to
 * the 32-bit displacement field of the tagged
 * movaps 0xFECA(%rsi,%rax,4), %xmmK instruction:
 *   +0x4  when K is 0-7:  two opcode bytes, ModRM and SIB precede the field
 *   +0x5  when K is 8-15: one additional REX prefix byte
 * The addend must therefore be kept in step with the destination register of
 * the instruction it refers to.
 *
 * No section directive precedes the tables, so they are emitted in the same
 * section as the code above them.
 * The two branches of the conditional are identical apart from the leading
 * underscore on the symbol names. Note that .align 4 is interpreted as a
 * power of two by Mach-O assemblers and as a byte count by GNU as for x86
 * ELF targets.
 * NOTE(review): presumably a code generator reads these tables to patch the
 * 0xFECA placeholders in a copy of each kernel - confirm against the caller.
 */
#ifdef __APPLE__
.globl _sse_leaf_ee_offsets
.globl _sse_leaf_oo_offsets
.globl _sse_leaf_eo_offsets
.globl _sse_leaf_oe_offsets
.align 4
_sse_leaf_ee_offsets:
.long LEAF_EE_const_0-_leaf_ee+0x4
.long LEAF_EE_const_1-_leaf_ee+0x5
.long LEAF_EE_const_2-_leaf_ee+0x5
.long LEAF_EE_const_3-_leaf_ee+0x5
.long LEAF_EE_const_4-_leaf_ee+0x5
.long LEAF_EE_const_5-_leaf_ee+0x5
.long LEAF_EE_const_6-_leaf_ee+0x4
.long LEAF_EE_const_7-_leaf_ee+0x5
_sse_leaf_oo_offsets:
.long LEAF_OO_const_0-_leaf_oo+0x4
.long LEAF_OO_const_1-_leaf_oo+0x4
.long LEAF_OO_const_2-_leaf_oo+0x5
.long LEAF_OO_const_3-_leaf_oo+0x5
.long LEAF_OO_const_4-_leaf_oo+0x4
.long LEAF_OO_const_5-_leaf_oo+0x5
.long LEAF_OO_const_6-_leaf_oo+0x5
.long LEAF_OO_const_7-_leaf_oo+0x5
_sse_leaf_eo_offsets:
.long LEAF_EO_const_0-_leaf_eo+0x5
.long LEAF_EO_const_1-_leaf_eo+0x4
.long LEAF_EO_const_2-_leaf_eo+0x4
.long LEAF_EO_const_3-_leaf_eo+0x4
.long LEAF_EO_const_4-_leaf_eo+0x5
.long LEAF_EO_const_5-_leaf_eo+0x5
.long LEAF_EO_const_6-_leaf_eo+0x4
.long LEAF_EO_const_7-_leaf_eo+0x5
_sse_leaf_oe_offsets:
.long LEAF_OE_const_0-_leaf_oe+0x5
.long LEAF_OE_const_1-_leaf_oe+0x4
.long LEAF_OE_const_2-_leaf_oe+0x4
.long LEAF_OE_const_3-_leaf_oe+0x5
.long LEAF_OE_const_4-_leaf_oe+0x5
.long LEAF_OE_const_5-_leaf_oe+0x5
.long LEAF_OE_const_6-_leaf_oe+0x4
.long LEAF_OE_const_7-_leaf_oe+0x4
#else
.globl sse_leaf_ee_offsets
.globl sse_leaf_oo_offsets
.globl sse_leaf_eo_offsets
.globl sse_leaf_oe_offsets
.align 4
sse_leaf_ee_offsets:
.long LEAF_EE_const_0-leaf_ee+0x4
.long LEAF_EE_const_1-leaf_ee+0x5
.long LEAF_EE_const_2-leaf_ee+0x5
.long LEAF_EE_const_3-leaf_ee+0x5
.long LEAF_EE_const_4-leaf_ee+0x5
.long LEAF_EE_const_5-leaf_ee+0x5
.long LEAF_EE_const_6-leaf_ee+0x4
.long LEAF_EE_const_7-leaf_ee+0x5
sse_leaf_oo_offsets:
.long LEAF_OO_const_0-leaf_oo+0x4
.long LEAF_OO_const_1-leaf_oo+0x4
.long LEAF_OO_const_2-leaf_oo+0x5
.long LEAF_OO_const_3-leaf_oo+0x5
.long LEAF_OO_const_4-leaf_oo+0x4
.long LEAF_OO_const_5-leaf_oo+0x5
.long LEAF_OO_const_6-leaf_oo+0x5
.long LEAF_OO_const_7-leaf_oo+0x5
sse_leaf_eo_offsets:
.long LEAF_EO_const_0-leaf_eo+0x5
.long LEAF_EO_const_1-leaf_eo+0x4
.long LEAF_EO_const_2-leaf_eo+0x4
.long LEAF_EO_const_3-leaf_eo+0x4
.long LEAF_EO_const_4-leaf_eo+0x5
.long LEAF_EO_const_5-leaf_eo+0x5
.long LEAF_EO_const_6-leaf_eo+0x4
.long LEAF_EO_const_7-leaf_eo+0x5
sse_leaf_oe_offsets:
.long LEAF_OE_const_0-leaf_oe+0x5
.long LEAF_OE_const_1-leaf_oe+0x4
.long LEAF_OE_const_2-leaf_oe+0x4
.long LEAF_OE_const_3-leaf_oe+0x5
.long LEAF_OE_const_4-leaf_oe+0x5
.long LEAF_OE_const_5-leaf_oe+0x5
.long LEAF_OE_const_6-leaf_oe+0x4
.long LEAF_OE_const_7-leaf_oe+0x4
#endif
/*
 * Constant tables: sse_constants followed immediately by sse_constants_inv.
 * Each table is five rows of four IEEE-754 single-precision bit patterns
 * (16 bytes per row, 80 bytes per table), starting on a 16-byte boundary.
 *
 * Bit patterns used:
 *   0x00000000  +0.0          0x80000000  sign bit only (-0.0)
 *   0x3f800000   1.0          0x3f3504f3  ~0.70710677
 *                             0xbf3504f3  ~-0.70710677
 *
 * Row byte offsets are +0x00, +0x10, +0x20, +0x30 and +0x40. The two tables
 * differ in rows 0, 2 and 4, where the sign pattern is mirrored between the
 * even and odd lanes.
 * NOTE(review): the kernels read rows at these same offsets through r9;
 * presumably r9 points at one of these two tables - confirm where the
 * pointer is set up.
 */
#ifdef __APPLE__
.data
.p2align 4
.globl _sse_constants
_sse_constants:
#else
.section .data
.p2align 4
.globl sse_constants
sse_constants:
#endif
/* +0x00: sign bit set in lanes 1 and 3 */
.long 0x00000000, 0x80000000, 0x00000000, 0x80000000
/* +0x10: ~0.7071 in every lane */
.long 0x3f3504f3, 0x3f3504f3, 0x3f3504f3, 0x3f3504f3
/* +0x20: ~0.7071 with lanes 0 and 2 negated */
.long 0xbf3504f3, 0x3f3504f3, 0xbf3504f3, 0x3f3504f3
/* +0x30: 1.0, 1.0, ~0.7071, ~0.7071 */
.long 0x3f800000, 0x3f800000, 0x3f3504f3, 0x3f3504f3
/* +0x40: 0.0, 0.0, ~-0.7071, ~0.7071 */
.long 0x00000000, 0x00000000, 0xbf3504f3, 0x3f3504f3

#ifdef __APPLE__
.globl _sse_constants_inv
_sse_constants_inv:
#else
.globl sse_constants_inv
sse_constants_inv:
#endif
/* +0x00: sign bit set in lanes 0 and 2 */
.long 0x80000000, 0x00000000, 0x80000000, 0x00000000
/* +0x10: ~0.7071 in every lane */
.long 0x3f3504f3, 0x3f3504f3, 0x3f3504f3, 0x3f3504f3
/* +0x20: ~0.7071 with lanes 1 and 3 negated */
.long 0x3f3504f3, 0xbf3504f3, 0x3f3504f3, 0xbf3504f3
/* +0x30: 1.0, 1.0, ~0.7071, ~0.7071 */
.long 0x3f800000, 0x3f800000, 0x3f3504f3, 0x3f3504f3
/* +0x40: 0.0, 0.0, ~0.7071, ~-0.7071 */
.long 0x00000000, 0x00000000, 0x3f3504f3, 0xbf3504f3