/*
 * memcpy.c - optimized memcpy() routines for aclib
 * Written by Andrew Church
 *
 * This file is part of transcode, a video stream processing tool.
 * transcode is free software, distributable under the terms of the GNU
 * General Public License (version 2 or later).  See the file COPYING
 * for details.
 */

#include "ac.h"
#include "ac_internal.h"
#include <string.h>

/* Use memmove because memcpy isn't guaranteed to be ascending */
static void *(*memcpy_ptr)(void *, const void *, size_t) = memmove;

/*************************************************************************/

/* External interface */

void *ac_memcpy(void *dest, const void *src, size_t size)
{
    return (*memcpy_ptr)(dest, src, size);
}

/*************************************************************************/
/*************************************************************************/
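/* Illustrative usage sketch (not compiled into the library): a caller
 * passes its AC_* acceleration flags to ac_memcpy_init() once at startup,
 * after which ac_memcpy() dispatches to the routine selected below (or to
 * plain memmove if no accelerated version applies).  The helper names here
 * are hypothetical and exist only for this example. */
#if 0
#include <stddef.h>
#include "ac.h"

/* accel_flags: AC_* bits obtained from your own CPU-feature probe */
static int example_setup(int accel_flags)
{
    return ac_memcpy_init(accel_flags);
}

static void example_copy_frame(void *dst, const void *src, size_t size)
{
    ac_memcpy(dst, src, size);   /* memcpy_mmx/_sse/_amd64 or memmove */
}
#endif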
/* Note the check for ARCH_X86 here: this is to prevent compilation of this
 * code on x86_64, since all x86_64 processors support SSE2, and because
 * this code is not set up to use the 64-bit registers for addressing on
 * x86_64. */

#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)

/* MMX-optimized routine, intended for PMMX/PII processors.
 * Nonstandard instructions used:
 *     (CPUID.MMX) MOVQ
 */

static void *memcpy_mmx(void *dest, const void *src, size_t bytes)
{
    asm("\
PENTIUM_LINE_SIZE = 32          # PMMX/PII cache line size \n\
PENTIUM_CACHE_SIZE = 8192       # PMMX/PII total cache size \n\
# Use only half because writes may touch the cache too (PII) \n\
PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\
 \n\
        push %%ebx              # Save PIC register \n\
        push %%edi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
 \n\
        mov $64, %%ebx          # Constant \n\
 \n\
        cmp %%ebx, %%ecx \n\
        jb mmx.memcpy_last      # Just use movs if <64 bytes \n\
 \n\
        # First align destination address to a multiple of 8 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS  // Because "lea 0f" requires a textrel
"       xchg %%eax, %%ecx \n\
        mov %%ecx, %%edx \n\
        repz movsb \n\
        mov %%eax, %%ecx \n\
        mov %%edx, %%eax \n"
#else
"       lea 0f, %%edx           # Use a computed jump--faster than a loop\n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n"
#endif
"0:     sub %%eax, %%ecx        # Update count \n\
 \n\
        # Now copy data in blocks \n\
0:      mov %%ecx, %%edx        # EDX <- ECX >> 6 (cache lines to copy) \n\
        shr $6, %%edx \n\
        jz mmx.memcpy_last      # <64 bytes left?  Skip to end \n\
        cmp $PENTIUM_CACHE_BLOCK/64, %%edx \n\
        jb 1f                   # Limit size of block \n\
        mov $PENTIUM_CACHE_BLOCK/64, %%edx \n\
1:      mov %%edx, %%eax        # EAX <- EDX << 6 (bytes to copy) \n\
        shl $6, %%eax \n\
        sub %%eax, %%ecx        # Update remaining count \n\
        add %%eax, %%esi        # Point to end of region to be block-copied\n\
2:      test %%eax, -32(%%esi)  # Touch each cache line in reverse order\n\
        test %%eax, -64(%%esi) \n\
        sub %%ebx, %%esi        # Update pointer \n\
        sub %%ebx, %%eax        # And loop \n\
        jnz 2b \n\
        # Note that ESI now points to the beginning of the block \n\
3:      movq (%%esi), %%mm0     # Do the actual copy, 64 bytes at a time\n\
        movq 8(%%esi), %%mm1 \n\
        movq 16(%%esi), %%mm2 \n\
        movq 24(%%esi), %%mm3 \n\
        movq 32(%%esi), %%mm4 \n\
        movq 40(%%esi), %%mm5 \n\
        movq 48(%%esi), %%mm6 \n\
        movq 56(%%esi), %%mm7 \n\
        movq %%mm0, (%%edi) \n\
        movq %%mm1, 8(%%edi) \n\
        movq %%mm2, 16(%%edi) \n\
        movq %%mm3, 24(%%edi) \n\
        movq %%mm4, 32(%%edi) \n\
        movq %%mm5, 40(%%edi) \n\
        movq %%mm6, 48(%%edi) \n\
        movq %%mm7, 56(%%edi) \n\
        add %%ebx, %%esi        # Update pointers \n\
        add %%ebx, %%edi \n\
        dec %%edx               # And loop \n\
        jnz 3b \n\
        jmp 0b \n\
 \n\
mmx.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2 \n\
        shr $2, %%eax \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx \n\
        repz movsd \n\
        mov %%eax, %%ecx \n"
#else
"       lea 0f, %%edx \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-15 MOVSD's \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n"
#endif
"0:     and $3, %%ecx           # ECX <- ECX & 3 \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       repz movsb \n"
#else
"       lea 0f, %%edx \n\
        sub %%ecx, %%edx \n\
        jmp *%%edx              # Execute 0-3 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n"
#endif
"0: \n\
        # All done! \n\
        emms                    # Clean up MMX state \n\
        pop %%edi               # Restore destination (return value) \n\
        pop %%ebx               # Restore PIC register \n\
    "
    : /* no outputs */
    : "D" (dest), "S" (src), "c" (bytes)
    : "%eax", "%edx"
    );
    return dest;
}

#endif  /* HAVE_ASM_MMX && ARCH_X86 */

/*************************************************************************/
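/* For reference, a rough C analogue of the "computed jump" used above for
 * the tail bytes: the asm jumps into the middle of a run of MOVSB
 * instructions so that exactly the 0-7 leftover bytes are copied without a
 * loop, much like the switch fall-through below.  This sketch is purely
 * illustrative and is never called by aclib. */
#if 0
static void copy_tail_0_to_7(unsigned char *d, const unsigned char *s,
                             unsigned int n)
{
    switch (n & 7) {              /* number of leftover bytes */
      case 7: *d++ = *s++;        /* fall through */
      case 6: *d++ = *s++;        /* fall through */
      case 5: *d++ = *s++;        /* fall through */
      case 4: *d++ = *s++;        /* fall through */
      case 3: *d++ = *s++;        /* fall through */
      case 2: *d++ = *s++;        /* fall through */
      case 1: *d++ = *s++;        /* fall through */
      case 0: break;
    }
}
#endif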
#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)

/* SSE-optimized routine.  Backported from AMD64 routine below.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.MMX) MOVQ
 *     (CPUID.SSE) MOVNTQ
 */

static void *memcpy_sse(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%ebx              # Save PIC register \n\
        push %%edi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
 \n\
        cmp $64, %%ecx          # Skip block copy for small blocks \n\
        jb sse.memcpy_last \n\
 \n\
        mov $128, %%ebx         # Constant used later \n\
 \n\
        # First align destination address to a multiple of 8 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx \n\
        mov %%ecx, %%edx \n\
        repz movsb \n\
        mov %%eax, %%ecx \n\
        mov %%edx, %%eax \n"
#else
"       lea 0f, %%edx           # Use a computed jump--faster than a loop\n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n"
#endif
"0:     sub %%eax, %%ecx        # Update count \n\
 \n\
        cmp $0x10040, %%ecx     # Is this a large block? (0x10040 is an \n\
                                # arbitrary value where prefetching and \n\
                                # write combining seem to start becoming\n\
                                # faster) \n\
        jae sse.memcpy_bp       # Yup, use prefetch copy \n\
 \n\
sse.memcpy_small:               # Small block copy routine--no prefetch \n"
#if 0
"       mov %%ecx, %%edx        # EDX <- bytes to copy / 8 \n\
        shr $3, %%edx \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later \n\
        shl $3, %%eax \n\
        sub %%eax, %%ecx \n\
        .balign 16 \n\
0:      movq (%%esi), %%mm0     # Copy 8 bytes of data \n\
        movq %%mm0, (%%edi) \n\
        add $8, %%esi           # Update pointers \n\
        add $8, %%edi \n\
        dec %%edx               # And loop \n\
        jg 0b \n\
        jmp sse.memcpy_last     # Copy any remaining bytes \n\
 \n\
        nop                     # Align loops below \n"
#else
"       # It appears that a simple rep movs is faster than cleverness \n\
        # with movq... \n\
        mov %%ecx, %%edx        # EDX <- ECX & 3 \n\
        and $3, %%edx \n\
        shr $2, %%ecx           # ECX <- ECX >> 2 \n\
        rep movsl               # Copy away! \n\
        mov %%edx, %%ecx        # Take care of last 0-3 bytes \n\
        rep movsb \n\
        jmp sse.memcpy_end      # And exit \n\
 \n\
        .balign 16 \n\
        nop \n\
        nop \n"
#endif
"sse.memcpy_bp:                 # Block prefetch copy routine \n\
0:      mov %%ecx, %%edx        # EDX: temp counter \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes) \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes) \n\
        cmova %%ebx, %%edx \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8 \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8 \n\
                                # (also used as memory offset) \n\
1:      test %%eax, -64(%%esi,%%eax,8)   # Preload cache lines in pairs \n\
        test %%eax, -128(%%esi,%%eax,8)  # (going backwards) \n\
        # (note that test %%eax,... seems to be faster than prefetchnta \n\
        # on x86) \n\
        sub $16, %%eax          # And loop \n\
        jg 1b \n\
 \n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments \n\
        xor %%eax, %%eax \n\
2:      movq (%%esi,%%eax,8), %%mm0   # Copy 8 bytes and loop \n\
        movntq %%mm0, (%%edi,%%eax,8) \n\
        inc %%eax \n\
        cmp %%edx, %%eax \n\
        jb 2b \n\
 \n\
        # Finally, update pointers and count, and loop \n\
        shl $3, %%edx           # EDX <- bytes copied \n\
        add %%edx, %%esi \n\
        add %%edx, %%edi \n\
        sub %%edx, %%ecx \n\
        cmp $64, %%ecx          # At least one cache line left? \n\
        jae 0b                  # Yup, loop \n\
 \n\
sse.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2 \n\
        shr $2, %%eax \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx \n\
        repz movsd \n\
        mov %%eax, %%ecx \n"
#else
"       lea 0f, %%edx \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-15 MOVSD's \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n"
#endif
"0:     and $3, %%ecx           # ECX <- ECX & 3 \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       repz movsb \n"
#else
"       lea sse.memcpy_end, %%edx \n\
        sub %%ecx, %%edx \n\
        jmp *%%edx              # Execute 0-3 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n"
#endif
" \n\
sse.memcpy_end: \n\
        # All done! \n\
        emms                    # Clean up after MMX instructions \n\
        sfence                  # Flush the write buffer \n\
        pop %%edi               # Restore destination (return value) \n\
        pop %%ebx               # Restore PIC register \n\
    "
    : /* no outputs */
    : "D" (dest), "S" (src), "c" (bytes)
    : "%eax", "%edx"
    );
    return dest;
}

#endif  /* HAVE_ASM_SSE && ARCH_X86 */

/*************************************************************************/
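/* Conceptual sketch of the block-prefetch strategy shared by the SSE
 * routine above and the AMD64 routine below: pull a bounded block of source
 * cache lines into cache first (the SSE version touches them with dummy
 * TEST reads, the AMD64 version uses PREFETCHNTA), then stream the block to
 * the destination with non-temporal stores.  The intrinsics version below
 * only illustrates the idea; it assumes 16-byte-aligned pointers and is not
 * what the library executes. */
#if 0
#include <string.h>
#include <emmintrin.h>           /* SSE2 intrinsics */

static void block_prefetch_copy(void *dest, const void *src, size_t bytes)
{
    const size_t BLOCK = 8192;   /* cap on the preloaded region, as above */
    char *d = dest;
    const char *s = src;
    size_t i;

    while (bytes >= BLOCK) {
        for (i = 0; i < BLOCK; i += 64)       /* preload one block */
            _mm_prefetch(s + i, _MM_HINT_NTA);
        for (i = 0; i < BLOCK; i += 16)       /* then stream it out */
            _mm_stream_si128((__m128i *)(d + i),
                             _mm_load_si128((const __m128i *)(s + i)));
        s += BLOCK;
        d += BLOCK;
        bytes -= BLOCK;
    }
    _mm_sfence();                /* flush write-combining buffers */
    if (bytes > 0)
        memmove(d, s, bytes);    /* remainder */
}
#endif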
#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)

/* AMD64-optimized routine, using SSE2.  Derived from AMD64 optimization
 * guide section 5.13: Appropriate Memory Copying Routines.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.SSE2) MOVDQA, MOVDQU, MOVNTDQ
 *
 * Note that this routine will also run more or less as-is (modulo register
 * names and label(%%rip) references) on x86 CPUs, but tests have shown the
 * SSE1 version above to be faster.
 */

/* The block copying code--macroized because we use two versions of it
 * depending on whether the source is 16-byte-aligned or not.  Pass either
 * movdqa or movdqu (unquoted) for the parameter. */
#define AMD64_BLOCK_MEMCPY(movdq) \
"       # First prefetch (note that if we end on an odd number of cache \n\
        # lines, we skip prefetching the last one--faster that way than \n\
        # prefetching line by line or treating it as a special case) \n\
0:      mov %%ecx, %%edx        # EDX: temp counter (always <32 bits) \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes) \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes) \n\
        cmova %%ebx, %%edx \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8 \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8 \n\
                                # (also used as memory offset) \n\
1:      prefetchnta -64(%%rsi,%%rax,8)   # Preload cache lines in pairs \n\
        prefetchnta -128(%%rsi,%%rax,8)  # (going backwards) \n\
        sub $16, %%eax          # And loop \n\
        jg 1b \n\
 \n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments \n\
        xor %%eax, %%eax \n\
2:      " #movdq " (%%rsi,%%rax,8), %%xmm0   # Copy 16 bytes and loop \n\
        movntdq %%xmm0, (%%rdi,%%rax,8) \n\
        add $2, %%eax \n\
        cmp %%edx, %%eax \n\
        jb 2b \n\
 \n\
        # Finally, update pointers and count, and loop \n\
        shl $3, %%edx           # EDX <- bytes copied \n\
        add %%rdx, %%rsi \n\
        add %%rdx, %%rdi \n\
        sub %%rdx, %%rcx \n\
        cmp $64, %%rcx          # At least one cache line left? \n\
        jae 0b                  # Yup, loop \n"

static void *memcpy_amd64(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%rdi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
 \n\
        cmp $64, %%rcx          # Skip block copy for small blocks \n\
        jb amd64.memcpy_last \n\
 \n\
        mov $128, %%ebx         # Constant used later \n\
 \n\
        # First align destination address to a multiple of 16 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax        # (we don't care about the top 32 bits) \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n\
        lea 0f(%%rip), %%rdx    # Use a computed jump--faster than a loop\n\
        sub %%rax, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0:      sub %%rax, %%rcx        # Update count \n\
        test $8, %%edi          # Is destination not 16-byte aligned? \n\
        je 1f \n\
        movsq                   # Then move 8 bytes to align it \n\
        sub $8, %%rcx \n\
 \n\
1:      cmp $0x38000, %%rcx     # Is this a large block? (0x38000 is an \n\
                                # arbitrary value where prefetching and \n\
                                # write combining seem to start becoming\n\
                                # faster) \n\
        jb amd64.memcpy_small   # Nope, use small copy (no prefetch/WC) \n\
        test $15, %%esi         # Is source also 16-byte aligned? \n\
                                # (use ESI to save a REX prefix byte) \n\
        jnz amd64.memcpy_normal_bp   # Nope, use slow copy \n\
        jmp amd64.memcpy_fast_bp     # Yup, use fast copy \n\
 \n\
amd64.memcpy_small:             # Small block copy routine--no prefetch \n\
        mov %%ecx, %%edx        # EDX <- bytes to copy / 16 \n\
        shr $4, %%edx           # (count known to fit in 32 bits) \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later \n\
        shl $4, %%eax \n\
        sub %%eax, %%ecx \n\
        .balign 16 \n\
0:      movdqu (%%rsi), %%xmm0  # Copy 16 bytes of data \n\
        movdqa %%xmm0, (%%rdi) \n\
        add $16, %%rsi          # Update pointers \n\
        add $16, %%rdi \n\
        dec %%edx               # And loop \n\
        jnz 0b \n\
        jmp amd64.memcpy_last   # Copy any remaining bytes \n\
 \n\
        .balign 16 \n\
        nop \n\
        nop \n\
amd64.memcpy_fast_bp:           # Fast block prefetch loop \n"
AMD64_BLOCK_MEMCPY(movdqa)
"       jmp amd64.memcpy_last   # Copy any remaining bytes \n\
 \n\
        .balign 16 \n\
        nop \n\
        nop \n\
amd64.memcpy_normal_bp:         # Normal (unaligned) block prefetch loop\n"
AMD64_BLOCK_MEMCPY(movdqu)
" \n\
amd64.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>3 \n\
        shr $3, %%eax \n\
        lea 0f(%%rip), %%rdx \n\
        add %%eax, %%eax        # Watch out, MOVSQ is 2 bytes! \n\
        sub %%rax, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSQ's \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
0:      and $7, %%ecx           # ECX <- ECX & 7 \n\
        lea 0f(%%rip), %%rdx \n\
        sub %%rcx, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0: \n\
        # All done! \n\
        emms                    # Clean up after MMX instructions \n\
        sfence                  # Flush the write buffer \n\
        pop %%rdi               # Restore destination (return value) \n\
    "
    : /* no outputs */
    : "D" (dest), "S" (src), "c" (bytes)
    : "%rax", "%rbx", "%rdx"
    );
    return dest;
}

#endif  /* HAVE_ASM_SSE2 && ARCH_X86_64 */

/*************************************************************************/
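/* A small self-check sketch that can be handy when porting these routines:
 * compare ac_memcpy() against plain memmove() over a range of sizes and
 * alignments (the routines above special-case <64 bytes, the 8/16-byte
 * alignment prologue, and the large-block prefetch paths, so those
 * boundaries are worth crossing).  Call ac_memcpy_init() with the flags
 * under test before running it.  Illustrative only; not part of aclib. */
#if 0
#include <stdlib.h>
#include <string.h>

static int check_ac_memcpy(void)
{
    enum { MAX = 0x80000 };      /* large enough to cross both thresholds */
    char *src = malloc(MAX + 16);
    char *a   = malloc(MAX + 16);
    char *b   = malloc(MAX + 16);
    size_t size, off, i;
    int ok = (src && a && b);

    for (size = 0; ok && size < MAX; size = size*2 + 3) {
        for (off = 0; off < 16; off++) {
            for (i = 0; i < size; i++)
                src[off+i] = (char)(i ^ off);
            memset(a, 0, MAX + 16);
            memset(b, 0, MAX + 16);
            ac_memcpy(a + off, src + off, size);
            memmove(b + off, src + off, size);
            if (memcmp(a, b, MAX + 16) != 0)
                ok = 0;
        }
    }
    free(src);
    free(a);
    free(b);
    return ok;
}
#endif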
/* Initialization routine. */

int ac_memcpy_init(int accel)
{
    memcpy_ptr = memmove;

#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
    if (HAS_ACCEL(accel, AC_MMX))
        memcpy_ptr = memcpy_mmx;
#endif
#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)
    if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE))
        memcpy_ptr = memcpy_sse;
#endif
#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)
    if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2))
        memcpy_ptr = memcpy_amd64;
#endif

    return 1;
}

/*************************************************************************/

/*
 * Local variables:
 *   c-file-style: "stroustrup"
 *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *   indent-tabs-mode: nil
 * End:
 *
 * vim: expandtab shiftwidth=4:
 */