You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
732 lines
19 KiB
732 lines
19 KiB
/*
|
|
|
|
This file is part of FFTS -- The Fastest Fourier Transform in the South
|
|
|
|
Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
|
|
Copyright (c) 2012, The University of Waikato
|
|
|
|
All rights reserved.
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are met:
|
|
* Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in the
|
|
documentation and/or other materials provided with the distribution.
|
|
* Neither the name of the organization nor the
|
|
names of its contributors may be used to endorse or promote products
|
|
derived from this software without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
|
|
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
|
|
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
#include "codegen.h"
|
|
#include "macros.h"
|
|
#include "ffts.h"
|
|
|
|
#ifdef __APPLE__
|
|
#include <libkern/OSCacheControl.h>
|
|
#endif
|
|
|
|
#include <sys/types.h>
|
|
#include <sys/mman.h>
|
|
|
|
#ifdef HAVE_NEON
|
|
#include "codegen_arm.h"
|
|
#include "neon.h"
|
|
#elif HAVE_VFP
|
|
#include "codegen_arm.h"
|
|
#include "vfp.h"
|
|
#else
|
|
#include "codegen_sse.h"
|
|
#include "macros-sse.h"
|
|
#endif
|
|
|
|
#ifdef __ANDROID__
|
|
#include <unistd.h>
|
|
#endif
|
|
|
|
int tree_count(int N, int leafN, int offset) {
|
|
|
|
if(N <= leafN) return 0;
|
|
int count = 0;
|
|
count += tree_count(N/4, leafN, offset);
|
|
count += tree_count(N/8, leafN, offset + N/4);
|
|
count += tree_count(N/8, leafN, offset + N/4 + N/8);
|
|
count += tree_count(N/4, leafN, offset + N/2);
|
|
count += tree_count(N/4, leafN, offset + 3*N/4);
|
|
|
|
return 1 + count;
|
|
}
|
|
|
|
void elaborate_tree(size_t **p, int N, int leafN, int offset) {
|
|
|
|
if(N <= leafN) return;
|
|
elaborate_tree(p, N/4, leafN, offset);
|
|
elaborate_tree(p, N/8, leafN, offset + N/4);
|
|
elaborate_tree(p, N/8, leafN, offset + N/4 + N/8);
|
|
elaborate_tree(p, N/4, leafN, offset + N/2);
|
|
elaborate_tree(p, N/4, leafN, offset + 3*N/4);
|
|
|
|
(*p)[0] = N;
|
|
(*p)[1] = offset*2;
|
|
|
|
(*p)+=2;
|
|
}
|
|
|
|
|
|
|
|
|
|
uint32_t LUT_offset(size_t N, size_t leafN) {
|
|
int i;
|
|
size_t p_lut_size = 0;
|
|
size_t lut_size = 0;
|
|
int hardcoded = 0;
|
|
size_t n_luts = __builtin_ctzl(N/leafN);
|
|
int n = leafN*2;
|
|
//if(N <= 32) { n_luts = __builtin_ctzl(N/4); hardcoded = 1; }
|
|
|
|
for(i=0;i<n_luts-1;i++) {
|
|
p_lut_size = lut_size;
|
|
if(!i || hardcoded) {
|
|
#ifdef __arm__
|
|
if(N <= 32) lut_size += n/4 * 2 * sizeof(cdata_t);
|
|
else lut_size += n/4 * sizeof(cdata_t);
|
|
#else
|
|
lut_size += n/4 * 2 * sizeof(cdata_t);
|
|
#endif
|
|
// n *= 2;
|
|
} else {
|
|
#ifdef __arm__
|
|
lut_size += n/8 * 3 * sizeof(cdata_t);
|
|
#else
|
|
lut_size += n/8 * 3 * 2 * sizeof(cdata_t);
|
|
#endif
|
|
}
|
|
n *= 2;
|
|
}
|
|
return lut_size;
|
|
}
|
|
|
|
#ifdef __arm__
|
|
typedef uint32_t insns_t;
|
|
#else
|
|
typedef uint8_t insns_t;
|
|
#endif
|
|
|
|
#define P(x) (*(*p)++ = x)
|
|
|
|
void insert_nops(uint8_t **p, uint32_t count) {
|
|
switch(count) {
|
|
case 0: break;
|
|
case 2: P(0x66);
|
|
case 1: P(0x90); break;
|
|
case 3: P(0x0F); P(0x1F); P(0x00); break;
|
|
case 4: P(0x0F); P(0x1F); P(0x40); P(0x00); break;
|
|
case 5: P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break;
|
|
case 6: P(0x66); P(0x0F); P(0x1F); P(0x44); P(0x00); P(0x00); break;
|
|
case 7: P(0x0F); P(0x1F); P(0x80); P(0x00); P(0x00); P(0x00); P(0x00); break;
|
|
case 8: P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break;
|
|
case 9: P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00); break;
|
|
default:
|
|
P(0x66); P(0x0F); P(0x1F); P(0x84); P(0x00); P(0x00); P(0x00); P(0x00); P(0x00);
|
|
insert_nops(p, count-9);
|
|
break;
|
|
}
|
|
}
|
|
|
|
|
|
void align_mem16(uint8_t **p, uint32_t offset) {
|
|
#ifdef __x86_64__
|
|
int r = (16 - (offset & 0xf)) - ((uint32_t)(*p) & 0xf);
|
|
r = (16 + r) & 0xf;
|
|
insert_nops(p, r);
|
|
#endif
|
|
}
|
|
|
|
void ffts_generate_func_code(ffts_plan_t *p, size_t N, size_t leafN, int sign) {
|
|
int count = tree_count(N, leafN, 0) + 1;
|
|
size_t *ps = malloc(count * 2 * sizeof(size_t));
|
|
size_t *pps = ps;
|
|
|
|
#ifdef __x86_64__
|
|
if(sign < 0) p->constants = sse_constants;
|
|
else p->constants = sse_constants_inv;
|
|
#endif
|
|
|
|
elaborate_tree(&pps, N, leafN, 0);
|
|
pps[0] = 0;
|
|
pps[1] = 0;
|
|
|
|
pps = ps;
|
|
|
|
#ifdef __arm__
|
|
if(N < 8192) p->transform_size = 8192;
|
|
else p->transform_size = N;
|
|
#else
|
|
if(N < 2048) p->transform_size = 16384;
|
|
else p->transform_size = 16384 + 2*N/8 * __builtin_ctzl(N);
|
|
#endif
|
|
|
|
#ifdef __APPLE__
|
|
p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANON | MAP_SHARED, -1, 0);
|
|
#else
|
|
#define MAP_ANONYMOUS 0x20
|
|
p->transform_base = mmap(NULL, p->transform_size, PROT_WRITE | PROT_READ, MAP_ANONYMOUS | MAP_SHARED, -1, 0);
|
|
#endif
|
|
|
|
/*
|
|
if(p->transform_base == MAP_FAILED) {
|
|
fprintf(stderr, "MAP FAILED\n");
|
|
exit(1);
|
|
}*/
|
|
insns_t *func = p->transform_base;//valloc(8192);
|
|
insns_t *fp = func;
|
|
|
|
//fprintf(stderr, "Allocating %d bytes \n", p->transform_size);
|
|
//fprintf(stderr, "Base address = %016p\n", func);
|
|
|
|
if(!func) {
|
|
fprintf(stderr, "NOMEM\n");
|
|
exit(1);
|
|
}
|
|
|
|
insns_t *x_8_addr = fp;
|
|
#ifdef __arm__
|
|
#ifdef HAVE_NEON
|
|
memcpy(fp, neon_x8, neon_x8_t - neon_x8);
|
|
/*
|
|
* Changes adds to subtracts and vice versa to allow the computation
|
|
* of both the IFFT and FFT
|
|
*/
|
|
if(sign < 0) {
|
|
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
|
|
fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
|
|
fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
|
|
}
|
|
fp += (neon_x8_t - neon_x8) / 4;
|
|
#else
|
|
memcpy(fp, vfp_x8, vfp_end - vfp_x8);
|
|
if(sign > 0) {
|
|
fp[65] ^= 0x00000040;
|
|
fp[66] ^= 0x00000040;
|
|
fp[68] ^= 0x00000040;
|
|
fp[70] ^= 0x00000040;
|
|
fp[103] ^= 0x00000040;
|
|
fp[104] ^= 0x00000040;
|
|
fp[105] ^= 0x00000040;
|
|
fp[108] ^= 0x00000040;
|
|
fp[113] ^= 0x00000040;
|
|
fp[114] ^= 0x00000040;
|
|
fp[117] ^= 0x00000040;
|
|
fp[118] ^= 0x00000040;
|
|
}
|
|
fp += (vfp_end - vfp_x8) / 4;
|
|
#endif
|
|
#else
|
|
align_mem16(&fp, 0);
|
|
x_8_addr = fp;
|
|
align_mem16(&fp, 5);
|
|
memcpy(fp, x8_soft, x8_hard - x8_soft);
|
|
fp += (x8_hard - x8_soft);
|
|
//fprintf(stderr, "X8 start address = %016p\n", x_8_addr);
|
|
#endif
|
|
//uint32_t *x_8_t_addr = fp;
|
|
//memcpy(fp, neon_x8_t, neon_end - neon_x8_t);
|
|
//fp += (neon_end - neon_x8_t) / 4;
|
|
insns_t *x_4_addr = fp;
|
|
#ifdef __arm__
|
|
#ifdef HAVE_NEON
|
|
memcpy(fp, neon_x4, neon_x8 - neon_x4);
|
|
if(sign < 0) {
|
|
fp[26] ^= 0x00200000; fp[28] ^= 0x00200000; fp[31] ^= 0x00200000; fp[32] ^= 0x00200000;
|
|
}
|
|
fp += (neon_x8 - neon_x4) / 4;
|
|
#else
|
|
memcpy(fp, vfp_x4, vfp_x8 - vfp_x4);
|
|
if(sign > 0) {
|
|
fp[36] ^= 0x00000040;
|
|
fp[38] ^= 0x00000040;
|
|
fp[43] ^= 0x00000040;
|
|
fp[44] ^= 0x00000040;
|
|
}
|
|
fp += (vfp_x8 - vfp_x4) / 4;
|
|
#endif
|
|
#else
|
|
align_mem16(&fp, 0);
|
|
x_4_addr = fp;
|
|
memcpy(fp, x4, x8_soft - x4);
|
|
fp += (x8_soft - x4);
|
|
|
|
#endif
|
|
insns_t *start = fp;
|
|
|
|
#ifdef __arm__
|
|
*fp = PUSH_LR(); fp++;
|
|
*fp = 0xed2d8b10; fp++;
|
|
|
|
ADDI(&fp, 3, 1, 0);
|
|
ADDI(&fp, 7, 1, N);
|
|
ADDI(&fp, 5, 1, 2*N);
|
|
ADDI(&fp, 10, 7, 2*N);
|
|
ADDI(&fp, 4, 5, 2*N);
|
|
ADDI(&fp, 8, 10, 2*N);
|
|
ADDI(&fp, 6, 4, 2*N);
|
|
ADDI(&fp, 9, 8, 2*N);
|
|
|
|
*fp = LDRI(12, 0, ((uint32_t)&p->offsets) - ((uint32_t)p)); fp++; // load offsets into r12
|
|
// *fp++ = LDRI(1, 0, 4); // load ws into r1
|
|
ADDI(&fp, 1, 0, 0);
|
|
|
|
ADDI(&fp, 0, 2, 0), // mov out into r0
|
|
#endif
|
|
|
|
|
|
#ifdef __arm__
|
|
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
|
|
#ifdef HAVE_NEON
|
|
MOVI(&fp, 11, p->i0);
|
|
#else
|
|
MOVI(&fp, 11, p->i0);
|
|
#endif
|
|
|
|
#else
|
|
align_mem16(&fp, 0);
|
|
start = fp;
|
|
|
|
*fp++ = 0x4c;
|
|
*fp++ = 0x8b;
|
|
*fp++ = 0x07;
|
|
uint32_t lp_cnt = p->i0 * 4;
|
|
MOVI(&fp, RCX, lp_cnt);
|
|
|
|
//LEA(&fp, R8, RDI, ((uint32_t)&p->offsets) - ((uint32_t)p));
|
|
#endif
|
|
//fp++;
|
|
#ifdef __arm__
|
|
#ifdef HAVE_NEON
|
|
memcpy(fp, neon_ee, neon_oo - neon_ee);
|
|
if(sign < 0) {
|
|
fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
|
|
fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
|
|
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
|
|
}
|
|
fp += (neon_oo - neon_ee) / 4;
|
|
#else
|
|
memcpy(fp, vfp_e, vfp_o - vfp_e);
|
|
if(sign > 0) {
|
|
fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
|
|
fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
|
|
fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
|
|
}
|
|
fp += (vfp_o - vfp_e) / 4;
|
|
#endif
|
|
#else
|
|
//fprintf(stderr, "Body start address = %016p\n", start);
|
|
|
|
PUSH(&fp, RBP);
|
|
PUSH(&fp, RBX);
|
|
PUSH(&fp, R10);
|
|
PUSH(&fp, R11);
|
|
PUSH(&fp, R12);
|
|
PUSH(&fp, R13);
|
|
PUSH(&fp, R14);
|
|
PUSH(&fp, R15);
|
|
|
|
int i;
|
|
memcpy(fp, leaf_ee_init, leaf_ee - leaf_ee_init);
|
|
|
|
//fprintf(stderr, "Leaf ee init address = %016p\n", leaf_ee_init);
|
|
//fprintf(stderr, "Constants address = %016p\n", sse_constants);
|
|
//fprintf(stderr, "Constants address = %016p\n", p->constants);
|
|
|
|
//int32_t val = READ_IMM32(fp + 3);
|
|
//fprintf(stderr, "diff = 0x%x\n", ((uint32_t)&p->constants) - ((uint32_t)p));
|
|
|
|
//int64_t v2 = val + (int64_t)((void *)leaf_ee_init - (void *)fp );
|
|
//fprintf(stderr, "IMM = 0x%llx\n", v2);
|
|
|
|
//IMM32_NI(fp + 3, ((int64_t) READ_IMM32(fp + 3)) + ((void *)leaf_ee_init - (void *)fp ));
|
|
fp += (leaf_ee - leaf_ee_init);
|
|
|
|
//fprintf(stderr, "Leaf start address = %016p\n", fp);
|
|
align_mem16(&fp, 9);
|
|
memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
|
|
|
|
|
|
uint32_t offsets[8] = {0, N, N/2, 3*N/2, N/4, 5*N/4, 7*N/4, 3*N/4};
|
|
uint32_t offsets_o[8] = {0, N, N/2, 3*N/2, 7*N/4, 3*N/4, N/4, 5*N/4};
|
|
uint32_t offsets_oe[8] = {7*N/4, 3*N/4, N/4, 5*N/4, 0, N, 3*N/2, N/2};
|
|
|
|
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets[i]*4);
|
|
|
|
fp += (leaf_oo - leaf_ee);
|
|
|
|
if(__builtin_ctzl(N) & 1){
|
|
|
|
if(p->i1) {
|
|
lp_cnt += p->i1 * 4;
|
|
MOVI(&fp, RCX, lp_cnt);
|
|
align_mem16(&fp, 4);
|
|
memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
|
|
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
|
|
fp += (leaf_eo - leaf_oo);
|
|
}
|
|
|
|
|
|
memcpy(fp, leaf_oe, leaf_end - leaf_oe);
|
|
lp_cnt += 4;
|
|
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oe_offsets[i], offsets_o[i]*4);
|
|
fp += (leaf_end - leaf_oe);
|
|
|
|
}else{
|
|
|
|
|
|
memcpy(fp, leaf_eo, leaf_oe - leaf_eo);
|
|
lp_cnt += 4;
|
|
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_eo_offsets[i], offsets[i]*4);
|
|
fp += (leaf_oe - leaf_eo);
|
|
|
|
if(p->i1) {
|
|
lp_cnt += p->i1 * 4;
|
|
MOVI(&fp, RCX, lp_cnt);
|
|
align_mem16(&fp, 4);
|
|
memcpy(fp, leaf_oo, leaf_eo - leaf_oo);
|
|
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_oo_offsets[i], offsets_o[i]*4);
|
|
fp += (leaf_eo - leaf_oo);
|
|
}
|
|
|
|
}
|
|
if(p->i1) {
|
|
lp_cnt += p->i1 * 4;
|
|
MOVI(&fp, RCX, lp_cnt);
|
|
align_mem16(&fp, 9);
|
|
memcpy(fp, leaf_ee, leaf_oo - leaf_ee);
|
|
for(i=0;i<8;i++) IMM32_NI(fp + sse_leaf_ee_offsets[i], offsets_oe[i]*4);
|
|
fp += (leaf_oo - leaf_ee);
|
|
|
|
}
|
|
|
|
//fprintf(stderr, "Body start address = %016p\n", fp);
|
|
//LEA(&fp, R8, RDI, ((uint32_t)&p->ws) - ((uint32_t)p));
|
|
memcpy(fp, x_init, x4 - x_init);
|
|
//IMM32_NI(fp + 3, ((int64_t)READ_IMM32(fp + 3)) + ((void *)x_init - (void *)fp ));
|
|
fp += (x4 - x_init);
|
|
|
|
int32_t pAddr = 0;
|
|
int32_t pN = 0;
|
|
int32_t pLUT = 0;
|
|
count = 2;
|
|
while(pps[0]) {
|
|
|
|
if(!pN) {
|
|
MOVI(&fp, RCX, pps[0] / 4);
|
|
}else{
|
|
if((pps[1]*4)-pAddr) ADDI(&fp, RDX, (pps[1] * 4)- pAddr);
|
|
if(pps[0] > leafN && pps[0] - pN) {
|
|
|
|
int diff = __builtin_ctzl(pps[0]) - __builtin_ctzl(pN);
|
|
*fp++ = 0xc1;
|
|
|
|
if(diff > 0) {
|
|
*fp++ = 0xe1;
|
|
*fp++ = (diff & 0xff);
|
|
}else{
|
|
*fp++ = 0xe9;
|
|
*fp++ = ((-diff) & 0xff);
|
|
}
|
|
}
|
|
}
|
|
|
|
if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
|
|
ADDI(&fp, R8, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
|
|
|
|
|
|
if(pps[0] == 2*leafN) {
|
|
CALL(&fp, x_4_addr);
|
|
// }else if(!pps[2]){
|
|
// //uint32_t *x_8_t_addr = fp;
|
|
// memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
|
|
// fp += (neon_ee - neon_x8_t) / 4;
|
|
// //*fp++ = BL(fp+2, x_8_t_addr);
|
|
}else{
|
|
CALL(&fp, x_8_addr);
|
|
}
|
|
|
|
pAddr = pps[1] * 4;
|
|
if(pps[0] > leafN)
|
|
pN = pps[0];
|
|
pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
|
|
// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
|
|
count += 4;
|
|
pps += 2;
|
|
}
|
|
#endif
|
|
#ifdef __arm__
|
|
#ifdef HAVE_NEON
|
|
if(__builtin_ctzl(N) & 1){
|
|
ADDI(&fp, 2, 7, 0);
|
|
ADDI(&fp, 7, 9, 0);
|
|
ADDI(&fp, 9, 2, 0);
|
|
|
|
ADDI(&fp, 2, 8, 0);
|
|
ADDI(&fp, 8, 10, 0);
|
|
ADDI(&fp, 10, 2, 0);
|
|
|
|
if(p->i1) {
|
|
MOVI(&fp, 11, p->i1);
|
|
memcpy(fp, neon_oo, neon_eo - neon_oo);
|
|
if(sign < 0) {
|
|
fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
|
|
fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
|
|
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
|
|
}
|
|
fp += (neon_eo - neon_oo) / 4;
|
|
}
|
|
|
|
*fp = LDRI(11, 1, ((uint32_t)&p->oe_ws) - ((uint32_t)p)); fp++;
|
|
|
|
memcpy(fp, neon_oe, neon_end - neon_oe);
|
|
if(sign < 0) {
|
|
fp[19] ^= 0x00200000; fp[20] ^= 0x00200000; fp[22] ^= 0x00200000; fp[23] ^= 0x00200000;
|
|
fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[40] ^= 0x00200000; fp[41] ^= 0x00200000;
|
|
fp[64] ^= 0x00200000; fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[67] ^= 0x00200000;
|
|
}
|
|
fp += (neon_end - neon_oe) / 4;
|
|
|
|
}else{
|
|
|
|
*fp = LDRI(11, 1, ((uint32_t)&p->eo_ws) - ((uint32_t)p)); fp++;
|
|
|
|
memcpy(fp, neon_eo, neon_oe - neon_eo);
|
|
if(sign < 0) {
|
|
fp[10] ^= 0x00200000; fp[11] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000;
|
|
fp[31] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000; fp[35] ^= 0x00200000;
|
|
fp[59] ^= 0x00200000; fp[60] ^= 0x00200000; fp[61] ^= 0x00200000; fp[62] ^= 0x00200000;
|
|
}
|
|
fp += (neon_oe - neon_eo) / 4;
|
|
|
|
ADDI(&fp, 2, 7, 0);
|
|
ADDI(&fp, 7, 9, 0);
|
|
ADDI(&fp, 9, 2, 0);
|
|
|
|
ADDI(&fp, 2, 8, 0);
|
|
ADDI(&fp, 8, 10, 0);
|
|
ADDI(&fp, 10, 2, 0);
|
|
|
|
if(p->i1) {
|
|
MOVI(&fp, 11, p->i1);
|
|
memcpy(fp, neon_oo, neon_eo - neon_oo);
|
|
if(sign < 0) {
|
|
fp[12] ^= 0x00200000; fp[13] ^= 0x00200000; fp[14] ^= 0x00200000; fp[15] ^= 0x00200000;
|
|
fp[27] ^= 0x00200000; fp[29] ^= 0x00200000; fp[30] ^= 0x00200000; fp[31] ^= 0x00200000;
|
|
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
|
|
}
|
|
fp += (neon_eo - neon_oo) / 4;
|
|
}
|
|
|
|
}
|
|
|
|
|
|
if(p->i1) {
|
|
ADDI(&fp, 2, 3, 0);
|
|
ADDI(&fp, 3, 7, 0);
|
|
ADDI(&fp, 7, 2, 0);
|
|
|
|
ADDI(&fp, 2, 4, 0);
|
|
ADDI(&fp, 4, 8, 0);
|
|
ADDI(&fp, 8, 2, 0);
|
|
|
|
ADDI(&fp, 2, 5, 0);
|
|
ADDI(&fp, 5, 9, 0);
|
|
ADDI(&fp, 9, 2, 0);
|
|
|
|
ADDI(&fp, 2, 6, 0);
|
|
ADDI(&fp, 6, 10, 0);
|
|
ADDI(&fp, 10, 2, 0);
|
|
|
|
ADDI(&fp, 2, 9, 0);
|
|
ADDI(&fp, 9, 10, 0);
|
|
ADDI(&fp, 10, 2, 0);
|
|
|
|
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
|
|
MOVI(&fp, 11, p->i1);
|
|
memcpy(fp, neon_ee, neon_oo - neon_ee);
|
|
if(sign < 0) {
|
|
fp[33] ^= 0x00200000; fp[37] ^= 0x00200000; fp[38] ^= 0x00200000; fp[39] ^= 0x00200000;
|
|
fp[40] ^= 0x00200000; fp[41] ^= 0x00200000; fp[44] ^= 0x00200000; fp[45] ^= 0x00200000;
|
|
fp[46] ^= 0x00200000; fp[47] ^= 0x00200000; fp[48] ^= 0x00200000; fp[57] ^= 0x00200000;
|
|
}
|
|
fp += (neon_oo - neon_ee) / 4;
|
|
|
|
}
|
|
#else
|
|
ADDI(&fp, 2, 7, 0);
|
|
ADDI(&fp, 7, 9, 0);
|
|
ADDI(&fp, 9, 2, 0);
|
|
|
|
ADDI(&fp, 2, 8, 0);
|
|
ADDI(&fp, 8, 10, 0);
|
|
ADDI(&fp, 10, 2, 0);
|
|
|
|
MOVI(&fp, 11, (p->i1>0) ? p->i1 : 1);
|
|
memcpy(fp, vfp_o, vfp_x4 - vfp_o);
|
|
if(sign > 0) {
|
|
fp[22] ^= 0x00000040; fp[24] ^= 0x00000040; fp[25] ^= 0x00000040; fp[26] ^= 0x00000040;
|
|
fp[62] ^= 0x00000040; fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[66] ^= 0x00000040;
|
|
}
|
|
fp += (vfp_x4 - vfp_o) / 4;
|
|
|
|
ADDI(&fp, 2, 3, 0);
|
|
ADDI(&fp, 3, 7, 0);
|
|
ADDI(&fp, 7, 2, 0);
|
|
|
|
ADDI(&fp, 2, 4, 0);
|
|
ADDI(&fp, 4, 8, 0);
|
|
ADDI(&fp, 8, 2, 0);
|
|
|
|
ADDI(&fp, 2, 5, 0);
|
|
ADDI(&fp, 5, 9, 0);
|
|
ADDI(&fp, 9, 2, 0);
|
|
|
|
ADDI(&fp, 2, 6, 0);
|
|
ADDI(&fp, 6, 10, 0);
|
|
ADDI(&fp, 10, 2, 0);
|
|
|
|
ADDI(&fp, 2, 9, 0);
|
|
ADDI(&fp, 9, 10, 0);
|
|
ADDI(&fp, 10, 2, 0);
|
|
|
|
*fp = LDRI(2, 1, ((uint32_t)&p->ee_ws) - ((uint32_t)p)); fp++;
|
|
MOVI(&fp, 11, (p->i2>0) ? p->i2 : 1);
|
|
memcpy(fp, vfp_e, vfp_o - vfp_e);
|
|
if(sign > 0) {
|
|
fp[64] ^= 0x00000040; fp[65] ^= 0x00000040; fp[68] ^= 0x00000040; fp[75] ^= 0x00000040;
|
|
fp[76] ^= 0x00000040; fp[79] ^= 0x00000040; fp[80] ^= 0x00000040; fp[83] ^= 0x00000040;
|
|
fp[84] ^= 0x00000040; fp[87] ^= 0x00000040; fp[91] ^= 0x00000040; fp[93] ^= 0x00000040;
|
|
}
|
|
fp += (vfp_o - vfp_e) / 4;
|
|
|
|
#endif
|
|
*fp = LDRI(2, 1, ((uint32_t)&p->ws) - ((uint32_t)p)); fp++; // load offsets into r12
|
|
//ADDI(&fp, 2, 1, 0);
|
|
MOVI(&fp, 1, 0);
|
|
|
|
// args: r0 - out
|
|
// r1 - N
|
|
// r2 - ws
|
|
// ADDI(&fp, 3, 1, 0); // put N into r3 for counter
|
|
|
|
int32_t pAddr = 0;
|
|
int32_t pN = 0;
|
|
int32_t pLUT = 0;
|
|
count = 2;
|
|
while(pps[0]) {
|
|
|
|
// fprintf(stderr, "size %zu at %zu - diff %zu\n", pps[0], pps[1]*4, (pps[1]*4) - pAddr);
|
|
if(!pN) {
|
|
MOVI(&fp, 1, pps[0]);
|
|
}else{
|
|
if((pps[1]*4)-pAddr) ADDI(&fp, 0, 0, (pps[1] * 4)- pAddr);
|
|
if(pps[0] - pN) ADDI(&fp, 1, 1, pps[0] - pN);
|
|
}
|
|
|
|
if(p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT)
|
|
ADDI(&fp, 2, 2, p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8 - pLUT);
|
|
|
|
|
|
if(pps[0] == 2*leafN) {
|
|
*fp = BL(fp+2, x_4_addr); fp++;
|
|
}else if(!pps[2]){
|
|
//uint32_t *x_8_t_addr = fp;
|
|
#ifdef HAVE_NEON
|
|
memcpy(fp, neon_x8_t, neon_ee - neon_x8_t);
|
|
if(sign < 0) {
|
|
fp[31] ^= 0x00200000; fp[32] ^= 0x00200000; fp[33] ^= 0x00200000; fp[34] ^= 0x00200000;
|
|
fp[65] ^= 0x00200000; fp[66] ^= 0x00200000; fp[70] ^= 0x00200000; fp[74] ^= 0x00200000;
|
|
fp[97] ^= 0x00200000; fp[98] ^= 0x00200000; fp[102] ^= 0x00200000; fp[104] ^= 0x00200000;
|
|
}
|
|
fp += (neon_ee - neon_x8_t) / 4;
|
|
//*fp++ = BL(fp+2, x_8_t_addr);
|
|
|
|
#else
|
|
*fp = BL(fp+2, x_8_addr); fp++;
|
|
#endif
|
|
}else{
|
|
*fp = BL(fp+2, x_8_addr); fp++;
|
|
}
|
|
|
|
pAddr = pps[1] * 4;
|
|
pN = pps[0];
|
|
pLUT = p->ws_is[__builtin_ctzl(pps[0]/leafN)-1]*8;//LUT_offset(pps[0], leafN);
|
|
// fprintf(stderr, "LUT offset for %d is %d\n", pN, pLUT);
|
|
count += 4;
|
|
pps += 2;
|
|
}
|
|
|
|
*fp++ = 0xecbd8b10;
|
|
*fp++ = POP_LR(); count++;
|
|
#else
|
|
POP(&fp, R15);
|
|
POP(&fp, R14);
|
|
POP(&fp, R13);
|
|
POP(&fp, R12);
|
|
POP(&fp, R11);
|
|
POP(&fp, R10);
|
|
POP(&fp, RBX);
|
|
POP(&fp, RBP);
|
|
RET(&fp);
|
|
|
|
|
|
//uint8_t *pp = func;
|
|
//int counter = 0;
|
|
//do{
|
|
// printf("%02x ", *pp);
|
|
// if(counter++ % 16 == 15) printf("\n");
|
|
//} while(++pp < fp);
|
|
|
|
//printf("\n");
|
|
|
|
|
|
#endif
|
|
|
|
|
|
// *fp++ = B(14); count++;
|
|
|
|
//for(int i=0;i<(neon_x8 - neon_x4)/4;i++)
|
|
// fprintf(stderr, "%08x\n", x_4_addr[i]);
|
|
//fprintf(stderr, "\n");
|
|
//for(int i=0;i<count;i++)
|
|
|
|
free(ps);
|
|
|
|
if (mprotect(func, p->transform_size, PROT_READ | PROT_EXEC)) {
|
|
perror("Couldn't mprotect");
|
|
exit(1);
|
|
}
|
|
#ifdef __APPLE__
|
|
sys_icache_invalidate(func, p->transform_size);
|
|
#elif __ANDROID__
|
|
cacheflush((long)(func), (long)(func) + p->transform_size, 0);
|
|
#elif __linux__
|
|
#ifdef __GNUC__
|
|
__clear_cache((long)(func), (long)(func) + p->transform_size);
|
|
#endif
|
|
#endif
|
|
|
|
//fprintf(stderr, "size of transform %zu = %d\n", N, (fp-func)*4);
|
|
|
|
p->transform = (void *) (start);
|
|
}
|