/*
 * img_x86_common.h - common x86/x86-64 assembly macros
 * Written by Andrew Church
 *
 * This file is part of transcode, a video stream processing tool.
 * transcode is free software, distributable under the terms of the GNU
 * General Public License (version 2 or later).  See the file COPYING
 * for details.
 */

#ifndef ACLIB_IMG_X86_COMMON_H
#define ACLIB_IMG_X86_COMMON_H

/*************************************************************************/

/* Register names for pointers */
#ifdef ARCH_X86_64
# define EAX "%%rax"
# define EBX "%%rbx"
# define ECX "%%rcx"
# define EDX "%%rdx"
# define ESP "%%rsp"
# define EBP "%%rbp"
# define ESI "%%rsi"
# define EDI "%%rdi"
#else
# define EAX "%%eax"
# define EBX "%%ebx"
# define ECX "%%ecx"
# define EDX "%%edx"
# define ESP "%%esp"
# define EBP "%%ebp"
# define ESI "%%esi"
# define EDI "%%edi"
#endif

/* Macros to push and pop one or two registers within an assembly block.
 * The x86-64 ABI allows leaf functions to write to 128 bytes BELOW
 * (yes, below) the stack pointer, so we can't just push our own stuff
 * there.  Argh. */
#ifdef ARCH_X86_64
# define FAKE_PUSH_REG       "r12"
# define FAKE_PUSH_REG_2     "r13"
# define COMMA_FAKE_PUSH_REG ,FAKE_PUSH_REG
# define PUSH(reg) "mov " reg ", %%" FAKE_PUSH_REG
# define POP(reg)  "mov %%" FAKE_PUSH_REG ", " reg
# define PUSH2(reg1,reg2) PUSH(reg1) "; mov " reg2 ", %%" FAKE_PUSH_REG_2
# define POP2(reg2,reg1)  "mov %%" FAKE_PUSH_REG_2 ", " reg2 "; " POP(reg1)
#else
# define COMMA_FAKE_PUSH_REG /*nothing*/
# define PUSH(reg) "push " reg
# define POP(reg)  "pop " reg
# define PUSH2(reg1,reg2) "push " reg1 "; push " reg2
# define POP2(reg2,reg1)  "pop " reg2 "; pop " reg1
#endif
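/* Illustrative sketch (hypothetical; no such function exists in this
 * header) of how PUSH/POP and COMMA_FAKE_PUSH_REG combine so that a
 * single asm() block can use EBX safely on both architectures: */
#if 0
static void example_copy(uint32_t *dest, const uint32_t *src, int n)
{
    asm(PUSH(EBX)"; "                   /* push, or mov to r12 on x86-64 */
        "0: movl -4("ESI","ECX",4), %%ebx; "
        "movl %%ebx, -4("EDI","ECX",4); "
        "subl $1, %%ecx; "
        "jnz 0b; "
        POP(EBX)                        /* restore EBX */
        : /* no outputs */
        : "S" (src), "D" (dest), "c" (n)
        : "memory" COMMA_FAKE_PUSH_REG);
}
#endif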
/* Data for isolating particular bytes.  Used by the SWAP32 and REV32 SIMD
 * macros; if you use them, make sure to define DEFINE_MASK_DATA before
 * including this file! */
#ifdef DEFINE_MASK_DATA
static const struct { uint32_t n[64]; } __attribute__((aligned(16))) mask_data = {{
    0x00000000, 0x00000000, 0x00000000, 0x00000000,
    0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,
    0x0000FF00, 0x0000FF00, 0x0000FF00, 0x0000FF00,
    0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,
    0x00FF0000, 0x00FF0000, 0x00FF0000, 0x00FF0000,
    0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,
    0x00FFFF00, 0x00FFFF00, 0x00FFFF00, 0x00FFFF00,
    0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF,
    0xFF000000, 0xFF000000, 0xFF000000, 0xFF000000,
    0xFF0000FF, 0xFF0000FF, 0xFF0000FF, 0xFF0000FF,
    0xFF00FF00, 0xFF00FF00, 0xFF00FF00, 0xFF00FF00,
    0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF, 0xFF00FFFF,
    0xFFFF0000, 0xFFFF0000, 0xFFFF0000, 0xFFFF0000,
    0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF, 0xFFFF00FF,
    0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00, 0xFFFFFF00,
    0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF,
}};
#endif

/*************************************************************************/

/* Basic assembly macros, used for odd-count loops */

/* Swap bytes in pairs of 16-bit values */
#define X86_SWAP16_2 \
        "movl -4("ESI","ECX",4), %%eax                  \n\
        movl %%eax, %%edx                               \n\
        shll $8, %%eax                                  \n\
        andl $0xFF00FF00, %%eax                         \n\
        shrl $8, %%edx                                  \n\
        andl $0x00FF00FF, %%edx                         \n\
        orl %%edx, %%eax                                \n\
        movl %%eax, -4("EDI","ECX",4)"

/* Swap words in a 32-bit value */
#define X86_SWAP32 \
        "movl -4("ESI","ECX",4), %%eax                  \n\
        roll $16, %%eax                                 \n\
        movl %%eax, -4("EDI","ECX",4)"

/* Swap bytes 0 and 2 of a 32-bit value */
#define X86_SWAP32_02 \
        "movw -4("ESI","ECX",4), %%ax                   \n\
        movw -2("ESI","ECX",4), %%dx                    \n\
        xchg %%dl, %%al                                 \n\
        movw %%ax, -4("EDI","ECX",4)                    \n\
        movw %%dx, -2("EDI","ECX",4)"

/* Swap bytes 1 and 3 of a 32-bit value */
#define X86_SWAP32_13 \
        "movw -4("ESI","ECX",4), %%ax                   \n\
        movw -2("ESI","ECX",4), %%dx                    \n\
        xchg %%dh, %%ah                                 \n\
        movw %%ax, -4("EDI","ECX",4)                    \n\
        movw %%dx, -2("EDI","ECX",4)"

/* Reverse the order of bytes in a 32-bit value */
#define X86_REV32 \
        "movl -4("ESI","ECX",4), %%eax                  \n\
        xchg %%ah, %%al                                 \n\
        roll $16, %%eax                                 \n\
        xchg %%ah, %%al                                 \n\
        movl %%eax, -4("EDI","ECX",4)"

/* The same, using the BSWAP instruction */
#define X86_REV32_BSWAP \
        "movl -4("ESI","ECX",4), %%eax                  \n\
        bswap %%eax                                     \n\
        movl %%eax, -4("EDI","ECX",4)"

/* Rotate a 32-bit value left 8 bits */
#define X86_ROL32 \
        "movl -4("ESI","ECX",4), %%eax                  \n\
        roll $8, %%eax                                  \n\
        movl %%eax, -4("EDI","ECX",4)"

/* Rotate a 32-bit value right 8 bits */
#define X86_ROR32 \
        "movl -4("ESI","ECX",4), %%eax                  \n\
        rorl $8, %%eax                                  \n\
        movl %%eax, -4("EDI","ECX",4)"
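/* For reference, these are the pure-C equivalents of the transforms above,
 * applied to each 32-bit unit (an illustrative sketch; these helpers are
 * not defined elsewhere in aclib): */
#if 0
static inline uint32_t c_swap16_2(uint32_t x)   /* X86_SWAP16_2 */
    { return ((x << 8) & 0xFF00FF00) | ((x >> 8) & 0x00FF00FF); }
static inline uint32_t c_swap32(uint32_t x)     /* X86_SWAP32 */
    { return (x << 16) | (x >> 16); }
static inline uint32_t c_swap32_02(uint32_t x)  /* X86_SWAP32_02 */
    { return (x & 0xFF00FF00) | ((x >> 16) & 0x000000FF) | ((x & 0x000000FF) << 16); }
static inline uint32_t c_swap32_13(uint32_t x)  /* X86_SWAP32_13 */
    { return (x & 0x00FF00FF) | ((x >> 16) & 0x0000FF00) | ((x & 0x0000FF00) << 16); }
static inline uint32_t c_rev32(uint32_t x)      /* X86_REV32, X86_REV32_BSWAP */
    { return c_swap16_2(c_swap32(x)); }
static inline uint32_t c_rol32(uint32_t x)      /* X86_ROL32 */
    { return (x << 8) | (x >> 24); }
static inline uint32_t c_ror32(uint32_t x)      /* X86_ROR32 */
    { return (x >> 8) | (x << 24); }
#endif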
/*************************************************************************/

/* Basic assembly routines.  Sizes are all given in 32-bit units. */

#define ASM_SWAP16_2_X86(size) \
    asm("0: "X86_SWAP16_2"                              \n\
        subl $1, %%ecx                                  \n\
        jnz 0b"                                         \
        : /* no outputs */                              \
        : "S" (src[0]), "D" (dest[0]), "c" (size)       \
        : "eax", "edx")

#define ASM_SWAP32_X86(size) \
    asm("0: "X86_SWAP32"                                \n\
        subl $1, %%ecx                                  \n\
        jnz 0b"                                         \
        : /* no outputs */                              \
        : "S" (src[0]), "D" (dest[0]), "c" (size)       \
        : "eax", "edx")

#define ASM_SWAP32_02_X86(size) \
    asm("0: "X86_SWAP32_02"                             \n\
        subl $1, %%ecx                                  \n\
        jnz 0b"                                         \
        : /* no outputs */                              \
        : "S" (src[0]), "D" (dest[0]), "c" (size)       \
        : "eax", "edx")

#define ASM_SWAP32_13_X86(size) \
    asm("0: "X86_SWAP32_13"                             \n\
        subl $1, %%ecx                                  \n\
        jnz 0b"                                         \
        : /* no outputs */                              \
        : "S" (src[0]), "D" (dest[0]), "c" (size)       \
        : "eax", "edx")

#define ASM_REV32_X86(size) \
    asm("0: "X86_REV32"                                 \n\
        subl $1, %%ecx                                  \n\
        jnz 0b"                                         \
        : /* no outputs */                              \
        : "S" (src[0]), "D" (dest[0]), "c" (size)       \
        : "eax")

#define ASM_ROL32_X86(size) \
    asm("0: "X86_ROL32"                                 \n\
        subl $1, %%ecx                                  \n\
        jnz 0b"                                         \
        : /* no outputs */                              \
        : "S" (src[0]), "D" (dest[0]), "c" (size)       \
        : "eax")

#define ASM_ROR32_X86(size) \
    asm("0: "X86_ROR32"                                 \n\
        subl $1, %%ecx                                  \n\
        jnz 0b"                                         \
        : /* no outputs */                              \
        : "S" (src[0]), "D" (dest[0]), "c" (size)       \
        : "eax")

/*************************************************************************/
/*************************************************************************/

/* Wrapper for SIMD loops.  This generates the body of an asm() construct
 * (the string only, not the input/output/clobber lists) given the data
 * block size (number of data units processed per SIMD loop iteration),
 * instructions to save and restore unclobberable registers (such as EBX),
 * and the bodies of the odd-count and main loops.  The data count is
 * assumed to be preloaded in ECX.  Parameters are:
 *     blocksize: number of units of data processed per SIMD loop (must be
 *                    a power of 2); can be a constant or a numerical
 *                    expression containing only constants
 *     push_regs: string constant containing instructions to push registers
 *                    that must be saved over the small loop
 *      pop_regs: string constant containing instructions to pop registers
 *                    saved by `push_regs' (restored before the main loop)
 *    small_loop: loop for handling data elements one at a time (when the
 *                    count is not a multiple of `blocksize')
 *     main_loop: main SIMD loop for processing data
 *          emms: EMMS/SFENCE instructions to end main loop with, as needed
 */

#define SIMD_LOOP_WRAPPER(blocksize,push_regs,pop_regs,small_loop,main_loop,emms) \
        /* Check whether the count is a multiple of the blocksize (this \
         * can cause branch mispredicts but seems to be faster overall) */ \
        "testl $(("#blocksize")-1), %%ecx; "            \
        "jz 1f; "                                       \
        /* It's not--run the small loop to align the count */ \
        push_regs"; "                                   \
        "0: "                                           \
        small_loop"; "                                  \
        "subl $1, %%ecx; "                              \
        "testl $(("#blocksize")-1), %%ecx; "            \
        "jnz 0b; "                                      \
        pop_regs"; "                                    \
        /* Make sure there's some data left */          \
        "testl %%ecx, %%ecx; "                          \
        "jz 2f; "                                       \
        /* Now run the main SIMD loop */                \
        "1: "                                           \
        main_loop"; "                                   \
        "subl $("#blocksize"), %%ecx; "                 \
        "jnz 1b; "                                      \
        /* Clear MMX state and/or SFENCE, as needed */  \
        emms"; "                                        \
        /* Done */                                      \
        "2: "
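/* Control-flow model of SIMD_LOOP_WRAPPER in plain C (illustrative only;
 * no such function exists in aclib): */
#if 0
static void simd_loop_model(int count, int blocksize)
{
    /* Small loop (asm labels 0/1/2): peel one element at a time until
     * the count is a multiple of the block size */
    while (count & (blocksize-1)) {
        /* small_loop: process element count-1 */
        count--;
    }
    /* Main SIMD loop: blocksize elements per iteration */
    while (count != 0) {
        /* main_loop: process elements count-blocksize through count-1 */
        count -= blocksize;
    }
    /* emms/sfence would be executed here, as passed by the caller */
}
#endif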
/*************************************************************************/

/* MMX- and SSE2-optimized swap/rotate routines.  These routines are
 * identical save for data size, so we use common macros to implement them,
 * with register names and data offsets replaced by parameters to the
 * macros.  Each implementation macro takes the data count, the SIMD
 * register size in bits, the load, register-move and store mnemonics, a
 * trailing fence instruction (or ""), and the eight SIMD register names. */

#define ASM_SIMD_MMX(name,size) \
    name((size), 64, \
         "movq", "movq", "movq", "", \
         "%%mm0", "%%mm1", "%%mm2", "%%mm3", \
         "%%mm4", "%%mm5", "%%mm6", "%%mm7")
#define ASM_SIMD_SSE2(name,size) \
    name((size), 128, \
         "movdqu", "movdqa", "movdqu", "", \
         "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3", \
         "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7")
#define ASM_SIMD_SSE2_ALIGNED(name,size) \
    name((size), 128, \
         "movdqa", "movdqa", "movntdq", "sfence", \
         "%%xmm0", "%%xmm1", "%%xmm2", "%%xmm3", \
         "%%xmm4", "%%xmm5", "%%xmm6", "%%xmm7")

#define ASM_SWAP16_2_MMX(size)    ASM_SIMD_MMX(ASM_SWAP16_2_SIMD,(size))
#define ASM_SWAP16_2_SSE2(size)   ASM_SIMD_SSE2(ASM_SWAP16_2_SIMD,(size))
#define ASM_SWAP16_2_SSE2A(size)  ASM_SIMD_SSE2_ALIGNED(ASM_SWAP16_2_SIMD,(size))
#define ASM_SWAP32_MMX(size)      ASM_SIMD_MMX(ASM_SWAP32_SIMD,(size))
#define ASM_SWAP32_SSE2(size)     ASM_SIMD_SSE2(ASM_SWAP32_SIMD,(size))
#define ASM_SWAP32_SSE2A(size)    ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_SIMD,(size))
#define ASM_SWAP32_02_MMX(size)   ASM_SIMD_MMX(ASM_SWAP32_02_SIMD,(size))
#define ASM_SWAP32_02_SSE2(size)  ASM_SIMD_SSE2(ASM_SWAP32_02_SIMD,(size))
#define ASM_SWAP32_02_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_02_SIMD,(size))
#define ASM_SWAP32_13_MMX(size)   ASM_SIMD_MMX(ASM_SWAP32_13_SIMD,(size))
#define ASM_SWAP32_13_SSE2(size)  ASM_SIMD_SSE2(ASM_SWAP32_13_SIMD,(size))
#define ASM_SWAP32_13_SSE2A(size) ASM_SIMD_SSE2_ALIGNED(ASM_SWAP32_13_SIMD,(size))
#define ASM_REV32_MMX(size)       ASM_SIMD_MMX(ASM_REV32_SIMD,(size))
#define ASM_REV32_SSE2(size)      ASM_SIMD_SSE2(ASM_REV32_SIMD,(size))
#define ASM_REV32_SSE2A(size)     ASM_SIMD_SSE2_ALIGNED(ASM_REV32_SIMD,(size))
#define ASM_ROL32_MMX(size)       ASM_SIMD_MMX(ASM_ROL32_SIMD,(size))
#define ASM_ROL32_SSE2(size)      ASM_SIMD_SSE2(ASM_ROL32_SIMD,(size))
#define ASM_ROL32_SSE2A(size)     ASM_SIMD_SSE2_ALIGNED(ASM_ROL32_SIMD,(size))
#define ASM_ROR32_MMX(size)       ASM_SIMD_MMX(ASM_ROR32_SIMD,(size))
#define ASM_ROR32_SSE2(size)      ASM_SIMD_SSE2(ASM_ROR32_SIMD,(size))
#define ASM_ROR32_SSE2A(size)     ASM_SIMD_SSE2_ALIGNED(ASM_ROR32_SIMD,(size))
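/* Hypothetical dispatch sketch (illustrative only; HAVE_MMX/HAVE_SSE2 and
 * the function itself are not defined by this header): how a caller with
 * `src', `dest' and a pixel count in scope might select among the variants
 * generated above.  The SWAP32_02/13 and REV32 SIMD variants additionally
 * require DEFINE_MASK_DATA. */
#if 0
#include <stdint.h>
static int example_rev32(uint8_t **src, uint8_t **dest, int width, int height)
{
    int count = width * height;
    if (HAVE_SSE2 && (((uintptr_t)src[0] | (uintptr_t)dest[0]) & 15) == 0)
        ASM_REV32_SSE2A(count);  /* aligned loads, non-temporal stores */
    else if (HAVE_SSE2)
        ASM_REV32_SSE2(count);   /* unaligned loads and stores */
    else if (HAVE_MMX)
        ASM_REV32_MMX(count);
    else
        ASM_REV32_X86(count);
    return 1;
}
#endif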
/*************************************************************************/

/* Actual implementations.  Note that unrolling the SIMD loops doesn't seem
 * to be a win (only 2-3% improvement at most), and in fact can lose by a
 * bit in short loops. */

#define ASM_SWAP16_2_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize  */ (regsize)/32, \
        /* push_regs  */ "", \
        /* pop_regs   */ "", \
        /* small_loop */ X86_SWAP16_2, \
        /* main_loop  */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0"      \n\
                                # MM0: 7 6 5 4 3 2 1 0  \n\
        "movq" "MM0", "MM1"     # MM1: 7 6 5 4 3 2 1 0  \n\
        psrlw $8, "MM0"         # MM0: - 7 - 5 - 3 - 1  \n\
        psllw $8, "MM1"         # MM1: 6 - 4 - 2 - 0 -  \n\
        por "MM1", "MM0"        # MM0: 6 7 4 5 2 3 0 1  \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax", "edx")

#define ASM_SWAP32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize  */ (regsize)/32, \
        /* push_regs  */ "", \
        /* pop_regs   */ "", \
        /* small_loop */ X86_SWAP32, \
        /* main_loop  */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0"      \n\
                                # MM0: 7 6 5 4 3 2 1 0  \n\
        "movq" "MM0", "MM1"     # MM1: 7 6 5 4 3 2 1 0  \n\
        psrld $16, "MM0"        # MM0: - - 7 6 - - 3 2  \n\
        pslld $16, "MM1"        # MM1: 5 4 - - 1 0 - -  \n\
        por "MM1", "MM0"        # MM0: 5 4 7 6 1 0 3 2  \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size) \
        : "eax")

#define ASM_SWAP32_02_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize  */ (regsize)/32, \
        /* push_regs  */ "push "EDX, \
        /* pop_regs   */ "pop "EDX, \
        /* small_loop */ X86_SWAP32_02, \
        /* main_loop  */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0"      \n\
                                # MM0: 7 6 5 4 3 2 1 0  \n\
        "movq" "MM0", "MM1"     # MM1: 7 6 5 4 3 2 1 0  \n\
        "movq" "MM0", "MM2"     # MM2: 7 6 5 4 3 2 1 0  \n\
        pand 16("EDX"), "MM1"   # MM1: - - - 4 - - - 0  \n\
        pslld $16, "MM1"        # MM1: - 4 - - - 0 - -  \n\
        pand 64("EDX"), "MM2"   # MM2: - 6 - - - 2 - -  \n\
        psrld $16, "MM2"        # MM2: - - - 6 - - - 2  \n\
        pand 160("EDX"), "MM0"  # MM0: 7 - 5 - 3 - 1 -  \n\
        por "MM1", "MM0"        # MM0: 7 4 5 - 3 0 1 -  \n\
        por "MM2", "MM0"        # MM0: 7 4 5 6 3 0 1 2  \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
          "m" (mask_data) \
        : "eax")

#define ASM_SWAP32_13_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \
    asm(SIMD_LOOP_WRAPPER( \
        /* blocksize  */ (regsize)/32, \
        /* push_regs  */ "push "EDX, \
        /* pop_regs   */ "pop "EDX, \
        /* small_loop */ X86_SWAP32_13, \
        /* main_loop  */ \
        ldq" -("#regsize"/8)("ESI","ECX",4), "MM0"      \n\
                                # MM0: 7 6 5 4 3 2 1 0  \n\
        "movq" "MM0", "MM1"     # MM1: 7 6 5 4 3 2 1 0  \n\
        "movq" "MM0", "MM2"     # MM2: 7 6 5 4 3 2 1 0  \n\
        pand 32("EDX"), "MM1"   # MM1: - - 5 - - - 1 -  \n\
        pslld $16, "MM1"        # MM1: 5 - - - 1 - - -  \n\
        pand 128("EDX"), "MM2"  # MM2: 7 - - - 3 - - -  \n\
        psrld $16, "MM2"        # MM2: - - 7 - - - 3 -  \n\
        pand 80("EDX"), "MM0"   # MM0: - 6 - 4 - 2 - 0  \n\
        por "MM1", "MM0"        # MM0: 5 6 - 4 1 2 - 0  \n\
        por "MM2", "MM0"        # MM0: 5 6 7 4 1 2 3 0  \n\
        "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \
        /* emms */ "emms; "sfence) \
        : /* no outputs */ \
        : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \
          "m" (mask_data) \
        : "eax")
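/* The pand offsets used by the SWAP32_02/SWAP32_13 (and REV32) loops are
 * byte offsets into mask_data; each mask value is repeated four times, so
 * every entry is 16 bytes and can feed either a 64-bit or a 128-bit pand.
 * For documentation (a sketch assuming only the mask_data definition
 * above): */
#if 0
#include <assert.h>
static void check_mask_offsets(void)
{
    assert(mask_data.n[ 16/4] == 0x000000FF);   /* byte 0 of each dword */
    assert(mask_data.n[ 32/4] == 0x0000FF00);   /* byte 1 */
    assert(mask_data.n[ 64/4] == 0x00FF0000);   /* byte 2 */
    assert(mask_data.n[ 80/4] == 0x00FF00FF);   /* bytes 0 and 2 */
    assert(mask_data.n[128/4] == 0xFF000000);   /* byte 3 */
    assert(mask_data.n[160/4] == 0xFF00FF00);   /* bytes 1 and 3 */
}
#endif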
"movq" "MM0", "MM2" # MM2: 7 6 5 4 3 2 1 0 \n\ "movq" "MM0", "MM3" # MM3: 7 6 5 4 3 2 1 0 \n\ psrld $24, "MM0" # MM0: - - - 7 - - - 3 \n\ pand 32("EDX"), "MM2" # MM2: - - 5 - - - 1 - \n\ psrld $8, "MM1" # MM1: - 7 6 5 - 3 2 1 \n\ pand 32("EDX"), "MM1" # MM1: - - 6 - - - 2 - \n\ pslld $8, "MM2" # MM2: - 5 - - - 1 - - \n\ pslld $24, "MM3" # MM3: 4 - - - 0 - - - \n\ por "MM1", "MM0" # MM0: - - 6 7 - - 2 3 \n\ por "MM2", "MM0" # MM0: - 5 6 7 - 1 2 3 \n\ por "MM3", "MM0" # MM0: 4 5 6 7 0 1 2 3 \n\ "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ /* emms */ "emms; "sfence) \ : /* no outputs */ \ : "S" (src[0]), "D" (dest[0]), "c" (size), "d" (&mask_data), \ "m" (mask_data) \ : "eax") #define ASM_ROL32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ asm(SIMD_LOOP_WRAPPER( \ /* blocksize */ (regsize)/32, \ /* push_regs */ "", \ /* pop_regs */ "", \ /* small_loop */ X86_ROL32, \ /* main_loop */ \ ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ # MM0: 7 6 5 4 3 2 1 0 \n\ "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ pslld $8, "MM0" # MM0: 6 5 4 - 2 1 0 - \n\ psrld $24, "MM1" # MM1: - - - 7 - - - 3 \n\ por "MM1", "MM0" # MM0: 6 5 4 7 2 1 0 3 \n\ "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ /* emms */ "emms; "sfence) \ : /* no outputs */ \ : "S" (src[0]), "D" (dest[0]), "c" (size) \ : "eax") #define ASM_ROR32_SIMD(size,regsize,ldq,movq,stq,sfence,MM0,MM1,MM2,MM3,MM4,MM5,MM6,MM7) \ asm(SIMD_LOOP_WRAPPER( \ /* blocksize */ (regsize)/32, \ /* push_regs */ "", \ /* pop_regs */ "", \ /* small_loop */ X86_ROR32, \ /* main_loop */ \ ldq" -("#regsize"/8)("ESI","ECX",4), "MM0" \n\ # MM0: 7 6 5 4 3 2 1 0 \n\ "movq" "MM0", "MM1" # MM1: 7 6 5 4 3 2 1 0 \n\ psrld $8, "MM0" # MM0: - 7 6 5 - 3 2 1 \n\ pslld $24, "MM1" # MM1: 4 - - - 0 - - - \n\ por "MM1", "MM0" # MM0: 4 7 6 5 0 3 2 1 \n\ "stq" "MM0", -("#regsize"/8)("EDI","ECX",4)", \ /* emms */ "emms; "sfence) \ : /* no outputs */ \ : "S" (src[0]), "D" (dest[0]), "c" (size) \ : "eax") /*************************************************************************/ /* SSE2 macros to load 8 24- or 32-bit RGB pixels into XMM0/1/2 (R/G/B) as * 16-bit values, used for RGB->YUV and RGB->grayscale conversions. * ZERO is the number of the XMM register containing all zeroes. 
/* SSE2 macros to load 8 24- or 32-bit RGB pixels into XMM0/1/2 (R/G/B) as
 * 16-bit values, used for RGB->YUV and RGB->grayscale conversions.
 * ZERO is the number of the XMM register containing all zeroes. */

#define SSE2_LOAD_RGB24(ZERO) \
        "movl -21("ESI","EBX"), %%eax                                   \n\
        movd %%eax, %%xmm0              # XMM0: ----- ----- ----- xBGR1 \n\
        pshufd $0x39, %%xmm0, %%xmm0    # XMM0: xBGR1 ----- ----- ----- \n\
        movl -18("ESI","EBX"), %%eax                                    \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm0              # XMM0: xBGR1 ----- ----- xBGR2 \n\
        pshufd $0x39, %%xmm0, %%xmm0    # XMM0: xBGR2 xBGR1 ----- ----- \n\
        movl -15("ESI","EBX"), %%eax                                    \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm0              # XMM0: xBGR2 xBGR1 ----- xBGR3 \n\
        pshufd $0x39, %%xmm0, %%xmm0    # XMM0: xBGR3 xBGR2 xBGR1 ----- \n\
        movl -24("ESI","EBX"), %%eax                                    \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm0              # XMM0: xBGR3 xBGR2 xBGR1 xBGR0 \n\
        movl -9("ESI","EBX"), %%eax                                     \n\
        movd %%eax, %%xmm1              # XMM1: ----- ----- ----- xBGR5 \n\
        pshufd $0x39, %%xmm1, %%xmm1    # XMM1: xBGR5 ----- ----- ----- \n\
        movl -6("ESI","EBX"), %%eax                                     \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm1              # XMM1: xBGR5 ----- ----- xBGR6 \n\
        pshufd $0x39, %%xmm1, %%xmm1    # XMM1: xBGR6 xBGR5 ----- ----- \n\
        movl -3("ESI","EBX"), %%eax                                     \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm1              # XMM1: xBGR6 xBGR5 ----- xBGR7 \n\
        pshufd $0x39, %%xmm1, %%xmm1    # XMM1: xBGR7 xBGR6 xBGR5 ----- \n\
        movl -12("ESI","EBX"), %%eax                                    \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm1              # XMM1: xBGR7 xBGR6 xBGR5 xBGR4 \n"\
        SSE2_MASSAGE_RGBA32(ZERO)

#define SSE2_LOAD_BGR24(ZERO) \
        "movl -21("ESI","EBX"), %%eax                                   \n\
        movd %%eax, %%xmm0              # XMM0: ----- ----- ----- xRGB1 \n\
        pshufd $0x39, %%xmm0, %%xmm0    # XMM0: xRGB1 ----- ----- ----- \n\
        movl -18("ESI","EBX"), %%eax                                    \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm0              # XMM0: xRGB1 ----- ----- xRGB2 \n\
        pshufd $0x39, %%xmm0, %%xmm0    # XMM0: xRGB2 xRGB1 ----- ----- \n\
        movl -15("ESI","EBX"), %%eax                                    \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm0              # XMM0: xRGB2 xRGB1 ----- xRGB3 \n\
        pshufd $0x39, %%xmm0, %%xmm0    # XMM0: xRGB3 xRGB2 xRGB1 ----- \n\
        movl -24("ESI","EBX"), %%eax                                    \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm0              # XMM0: xRGB3 xRGB2 xRGB1 xRGB0 \n\
        movl -9("ESI","EBX"), %%eax                                     \n\
        movd %%eax, %%xmm1              # XMM1: ----- ----- ----- xRGB5 \n\
        pshufd $0x39, %%xmm1, %%xmm1    # XMM1: xRGB5 ----- ----- ----- \n\
        movl -6("ESI","EBX"), %%eax                                     \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm1              # XMM1: xRGB5 ----- ----- xRGB6 \n\
        pshufd $0x39, %%xmm1, %%xmm1    # XMM1: xRGB6 xRGB5 ----- ----- \n\
        movl -3("ESI","EBX"), %%eax                                     \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm1              # XMM1: xRGB6 xRGB5 ----- xRGB7 \n\
        pshufd $0x39, %%xmm1, %%xmm1    # XMM1: xRGB7 xRGB6 xRGB5 ----- \n\
        movl -12("ESI","EBX"), %%eax                                    \n\
        movd %%eax, %%xmm2                                              \n\
        por %%xmm2, %%xmm1              # XMM1: xRGB7 xRGB6 xRGB5 xRGB4 \n"\
        SSE2_MASSAGE_BGRA32(ZERO)

#define SSE2_LOAD_RGBA32(ZERO) "\
        movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ABGR3 ABGR2 ABGR1 ABGR0 \n\
        movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ABGR7 ABGR6 ABGR5 ABGR4 \n"\
        SSE2_MASSAGE_RGBA32(ZERO)

#define SSE2_MASSAGE_RGBA32(ZERO) "\
        movdqa %%xmm0, %%xmm2           # XMM2: ABGR3 ABGR2 ABGR1 ABGR0 \n\
        punpcklbw %%xmm1, %%xmm0        # X0.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\
        punpckhbw %%xmm1, %%xmm2        # X2.l: A6 A2 B6 B2 G6 G2 R6 R2 \n\
        movdqa %%xmm0, %%xmm1           # X1.l: A4 A0 B4 B0 G4 G0 R4 R0 \n\
        punpcklbw %%xmm2, %%xmm0        # X0.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\
        punpckhbw %%xmm2, %%xmm1        # X1.l: G7 G5 G3 G1 R7 R5 R3 R1 \n\
        movdqa %%xmm0, %%xmm2           # X2.l: G6 G4 G2 G0 R6 R4 R2 R0 \n\
        punpcklbw %%xmm1, %%xmm0        # XMM0: G7.......G0 R7.......R0 \n\
        punpckhbw %%xmm1, %%xmm2        # XMM2: A7.......A0 B7.......B0 \n\
        movdqa %%xmm0, %%xmm1           # XMM1: G7.......G0 R7.......R0 \n\
        punpcklbw %%xmm4, %%xmm0        # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
        punpckhbw %%xmm4, %%xmm1        # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
        punpcklbw %%xmm4, %%xmm2        # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"

#define SSE2_LOAD_BGRA32(ZERO) "\
        movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: ARGB3 ARGB2 ARGB1 ARGB0 \n\
        movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: ARGB7 ARGB6 ARGB5 ARGB4 \n"\
        SSE2_MASSAGE_BGRA32(ZERO)

#define SSE2_MASSAGE_BGRA32(ZERO) "\
        movdqa %%xmm0, %%xmm2           # XMM2: ARGB3 ARGB2 ARGB1 ARGB0 \n\
        punpcklbw %%xmm1, %%xmm2        # X2.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\
        punpckhbw %%xmm1, %%xmm0        # X0.l: A6 A2 R6 R2 G6 G2 B6 B2 \n\
        movdqa %%xmm2, %%xmm1           # X1.l: A4 A0 R4 R0 G4 G0 B4 B0 \n\
        punpcklbw %%xmm0, %%xmm2        # X2.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\
        punpckhbw %%xmm0, %%xmm1        # X1.l: G7 G5 G3 G1 B7 B5 B3 B1 \n\
        movdqa %%xmm2, %%xmm0           # X0.l: G6 G4 G2 G0 B6 B4 B2 B0 \n\
        punpcklbw %%xmm1, %%xmm2        # XMM2: G7.......G0 B7.......B0 \n\
        punpckhbw %%xmm1, %%xmm0        # XMM0: A7.......A0 R7.......R0 \n\
        movdqa %%xmm2, %%xmm1           # XMM1: G7.......G0 B7.......B0 \n\
        punpcklbw %%xmm4, %%xmm0        # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
        punpckhbw %%xmm4, %%xmm1        # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
        punpcklbw %%xmm4, %%xmm2        # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"

#define SSE2_LOAD_ARGB32(ZERO) "\
        movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: BGRA3 BGRA2 BGRA1 BGRA0 \n\
        movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: BGRA7 BGRA6 BGRA5 BGRA4 \n"\
        SSE2_MASSAGE_ARGB32(ZERO)

#define SSE2_MASSAGE_ARGB32(ZERO) "\
        movdqa %%xmm0, %%xmm2           # XMM2: BGRA3 BGRA2 BGRA1 BGRA0 \n\
        punpcklbw %%xmm1, %%xmm0        # X0.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\
        punpckhbw %%xmm1, %%xmm2        # X2.l: B6 B2 G6 G2 R6 R2 A6 A2 \n\
        movdqa %%xmm0, %%xmm1           # X1.l: B4 B0 G4 G0 R4 R0 A4 A0 \n\
        punpcklbw %%xmm2, %%xmm0        # X0.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\
        punpckhbw %%xmm2, %%xmm1        # X1.l: R7 R5 R3 R1 A7 A5 A3 A1 \n\
        movdqa %%xmm0, %%xmm2           # X2.l: R6 R4 R2 R0 A6 A4 A2 A0 \n\
        punpcklbw %%xmm1, %%xmm0        # XMM0: R7.......R0 A7.......A0 \n\
        punpckhbw %%xmm1, %%xmm2        # XMM2: B7.......B0 G7.......G0 \n\
        movdqa %%xmm2, %%xmm1           # XMM1: B7.......B0 G7.......G0 \n\
        punpckhbw %%xmm4, %%xmm0        # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
        punpcklbw %%xmm4, %%xmm1        # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
        punpckhbw %%xmm4, %%xmm2        # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"

#define SSE2_LOAD_ABGR32(ZERO) "\
        movdqu -32("ESI","ECX",4),%%xmm0 #XMM0: RGBA3 RGBA2 RGBA1 RGBA0 \n\
        movdqu -16("ESI","ECX",4),%%xmm1 #XMM1: RGBA7 RGBA6 RGBA5 RGBA4 \n"\
        SSE2_MASSAGE_ABGR32(ZERO)

#define SSE2_MASSAGE_ABGR32(ZERO) "\
        movdqa %%xmm0, %%xmm2           # XMM2: RGBA3 RGBA2 RGBA1 RGBA0 \n\
        punpcklbw %%xmm1, %%xmm2        # X2.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\
        punpckhbw %%xmm1, %%xmm0        # X0.l: R6 R2 G6 G2 B6 B2 A6 A2 \n\
        movdqa %%xmm2, %%xmm1           # X1.l: R4 R0 G4 G0 B4 B0 A4 A0 \n\
        punpcklbw %%xmm0, %%xmm2        # X2.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\
        punpckhbw %%xmm0, %%xmm1        # X1.l: B7 B5 B3 B1 A7 A5 A3 A1 \n\
        movdqa %%xmm2, %%xmm0           # X0.l: B6 B4 B2 B0 A6 A4 A2 A0 \n\
        punpcklbw %%xmm1, %%xmm2        # XMM2: B7.......B0 A7.......A0 \n\
        punpckhbw %%xmm1, %%xmm0        # XMM0: R7.......R0 G7.......G0 \n\
        movdqa %%xmm0, %%xmm1           # XMM1: R7.......R0 G7.......G0 \n\
        punpckhbw %%xmm4, %%xmm0        # XMM0: R7 R6 R5 R4 R3 R2 R1 R0 \n\
        punpcklbw %%xmm4, %%xmm1        # XMM1: G7 G6 G5 G4 G3 G2 G1 G0 \n\
        punpckhbw %%xmm4, %%xmm2        # XMM2: B7 B6 B5 B4 B3 B2 B1 B0 \n"
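/* Scalar model (hypothetical helper, for documentation only) of what
 * SSE2_LOAD_RGBA32 + SSE2_MASSAGE_RGBA32 leave behind: XMM0, XMM1 and XMM2
 * hold eight 16-bit R, G and B values respectively, one per pixel: */
#if 0
static void load_rgba32_model(uint16_t r[8], uint16_t g[8], uint16_t b[8],
                              const uint8_t *pixels /* ESI + ECX*4 - 32 */)
{
    int i;
    for (i = 0; i < 8; i++) {
        r[i] = pixels[4*i+0];   /* XMM0 after the final unpack with zero */
        g[i] = pixels[4*i+1];   /* XMM1 */
        b[i] = pixels[4*i+2];   /* XMM2 */
        /* pixels[4*i+3] (alpha) is discarded */
    }
}
#endif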
/*************************************************************************/

#endif  /* ACLIB_IMG_X86_COMMON_H */

/*
 * Local variables:
 *   c-file-style: "stroustrup"
 *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *   indent-tabs-mode: nil
 * End:
 *
 * vim: expandtab shiftwidth=4:
 */