You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
281 lines
13 KiB
281 lines
13 KiB
/*
|
|
* rescale.c -- take the weighted average of two sets of byte data
|
|
* Written by Andrew Church <achurch@achurch.org>
|
|
*
|
|
* This file is part of transcode, a video stream processing tool.
|
|
* transcode is free software, distributable under the terms of the GNU
|
|
* General Public License (version 2 or later). See the file COPYING
|
|
* for details.
|
|
*/
|
|
|
|
#include "ac.h"
#include "ac_internal.h"

#include <stdint.h>
|
|
|
|
/* Portable C implementation; its definition appears further down. */
static void rescale(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                    int bytes, uint32_t weight1, uint32_t weight2);

/* Routine that ac_rescale() forwards to.  It starts out as the portable
 * implementation and is reassigned by ac_rescale_init(). */
static void (*rescale_ptr)(const uint8_t *src1, const uint8_t *src2,
                           uint8_t *dest, int bytes,
                           uint32_t weight1, uint32_t weight2) = rescale;
|
|
|
|
/*************************************************************************/
|
|
|
|
/* External interface */
|
|
|
|
void ac_rescale(const uint8_t *src1, const uint8_t *src2,
|
|
uint8_t *dest, int bytes, uint32_t weight1, uint32_t weight2)
|
|
{
|
|
if (weight1 >= 0x10000)
|
|
ac_memcpy(dest, src1, bytes);
|
|
else if (weight2 >= 0x10000)
|
|
ac_memcpy(dest, src2, bytes);
|
|
else
|
|
(*rescale_ptr)(src1, src2, dest, bytes, weight1, weight2);
|
|
}
|
|
|
|
/*************************************************************************/
|
|
/*************************************************************************/
|
|
|
|
/* Vanilla C version */
|
|
|
|
/*
 * rescale:  Portable implementation of the weighted average.
 *
 * For each of the `bytes' positions, computes
 *     (src1[i]*weight1 + src2[i]*weight2 + 0x8000) >> 16
 * in 32-bit unsigned arithmetic and stores the low 8 bits of the result in
 * dest[i].  A count of zero or less leaves dest untouched.
 */
static void rescale(const uint8_t *src1, const uint8_t *src2,
                    uint8_t *dest, int bytes,
                    uint32_t weight1, uint32_t weight2)
{
    enum {
        WEIGHT_FRAC_BITS = 16,                        /* weights are n/65536 */
        ROUND_HALF       = 1 << (WEIGHT_FRAC_BITS - 1) /* 0.5, for rounding  */
    };
    int i;

    for (i = 0; i < bytes; i++) {
        const uint32_t sum = src1[i] * weight1 + src2[i] * weight2
                           + ROUND_HALF;
        dest[i] = (uint8_t)(sum >> WEIGHT_FRAC_BITS);
    }
}
|
|
|
|
/*************************************************************************/
|
|
|
|
/* MMX version */
|
|
|
|
#if defined(HAVE_ASM_MMX) && defined(ARCH_X86) /* i.e. not x86_64 */
|
|
|
|
/*
 * rescale_mmx:  MMX implementation of the weighted average.
 *
 * Whole groups of 8 bytes are processed by the assembly loop, working from
 * the end of the buffers towards the start; any remaining 1..7 bytes are
 * passed to the portable rescale().
 *
 * Plain MMX has no unsigned high multiply (PMULHUW), so the signed PMULHW
 * is used with the weights halved to keep them within 15 bits.
 */
static void rescale_mmx(const uint8_t *src1, const uint8_t *src2,
                        uint8_t *dest, int bytes,
                        uint32_t weight1, uint32_t weight2)
{
    if (bytes >= 8) {
        uint32_t half1 = weight1 / 2;
        uint32_t half2 = weight2 / 2;
        /* When both weights are odd, halving drops one unit in total. */
        const uint32_t lost_bit = weight1 & weight2 & 1;
        uint32_t packed1, packed2;
        int count = bytes & ~7;

        /* Give the dropped unit back to one of the halves, but never push
         * a half past 0x7FFF: 0x8000 would be a negative multiplier for
         * PMULHW and would corrupt every output byte. */
        if (half2 < 0x7FFF)
            half2 += lost_bit;
        else if (half1 < 0x7FFF)
            half1 += lost_bit;

        /* Two copies of each half-weight per 32-bit value; the assembly
         * widens these to four copies per MMX register. */
        packed1 = half1 << 16 | half1;
        packed2 = half2 << 16 | half2;

        /* Single statement so that no register state is carried between
         * separate asm blocks.  The byte counter is an in/out operand
         * because the loop decrements it, and everything the code
         * overwrites is listed as clobbered.  (__asm__ spelling so the
         * code also builds in strict ISO modes.) */
        __asm__ __volatile__(
            "movd %[w1], %%mm4\n\t"             /* MM4: 00 00 W1 W1 */
            "movd %[w2], %%mm5\n\t"             /* MM5: 00 00 W2 W2 */
            "movq %%mm4, %%mm6\n\t"             /* MM6: 00 00 W1 W1 */
            "psllq $32, %%mm4\n\t"              /* MM4: W1 W1 00 00 */
            "por %%mm6, %%mm4\n\t"              /* MM4: W1 W1 W1 W1 */
            "movq %%mm5, %%mm7\n\t"             /* MM7: 00 00 W2 W2 */
            "psllq $32, %%mm5\n\t"              /* MM5: W2 W2 00 00 */
            "por %%mm7, %%mm5\n\t"              /* MM5: W2 W2 W2 W2 */
            "pxor %%mm7, %%mm7\n\t"             /* MM7: 00 00 00 00 */
            /* Build the rounding constant 0x0020 in each word of MM6:
             * 0 - (-1) = 1, then shift left by 5. */
            "pxor %%mm6, %%mm6\n\t"
            "pcmpeqw %%mm3, %%mm3\n\t"
            "psubw %%mm3, %%mm6\n\t"
            "psllw $5, %%mm6\n"
            "0:\n\t"
            /* src1: widen bytes to words, scale, multiply by W1 */
            "movq -8(%%esi,%%ecx), %%mm0\n\t"
            "movq %%mm0, %%mm1\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "psllw $7, %%mm0\n\t"               /* 9.7 fixed point */
            "pmulhw %%mm4, %%mm0\n\t"           /* gives 10.6 fixed point */
            "punpckhbw %%mm7, %%mm1\n\t"
            "psllw $7, %%mm1\n\t"
            "pmulhw %%mm4, %%mm1\n\t"
            /* src2: same, multiplied by W2 */
            "movq -8(%%edx,%%ecx), %%mm2\n\t"
            "movq %%mm2, %%mm3\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
            "psllw $7, %%mm2\n\t"
            "pmulhw %%mm5, %%mm2\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "psllw $7, %%mm3\n\t"
            "pmulhw %%mm5, %%mm3\n\t"
            /* sum, round, drop the 6 fraction bits, repack to bytes */
            "paddw %%mm2, %%mm0\n\t"
            "paddw %%mm6, %%mm0\n\t"
            "psrlw $6, %%mm0\n\t"
            "paddw %%mm3, %%mm1\n\t"
            "paddw %%mm6, %%mm1\n\t"
            "psrlw $6, %%mm1\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "movq %%mm0, -8(%%edi,%%ecx)\n\t"
            "subl $8, %%ecx\n\t"
            "jnz 0b\n\t"
            "emms"
            : "+c" (count)
            : "S" (src1), "d" (src2), "D" (dest),
              [w1] "m" (packed1), [w2] "m" (packed2)
            : "memory", "cc",
              "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
    }
    if (UNLIKELY(bytes & 7)) {
        /* Leftover bytes after the last full group of 8. */
        const int done = bytes & ~7;
        rescale(src1 + done, src2 + done, dest + done,
                bytes & 7, weight1, weight2);
    }
}
|
|
|
|
#endif /* HAVE_ASM_MMX && ARCH_X86 */
|
|
|
|
/*************************************************************************/
|
|
|
|
/* MMXEXT version (also for SSE) */
|
|
|
|
#if (defined(HAVE_ASM_MMXEXT) || defined(HAVE_ASM_SSE)) && defined(ARCH_X86)
|
|
|
|
/*
 * rescale_mmxext:  MMXEXT/SSE implementation of the weighted average.
 *
 * Whole groups of 8 bytes are processed by the assembly loop, working from
 * the end of the buffers towards the start; any remaining 1..7 bytes are
 * passed to the portable rescale().
 *
 * The unsigned high multiply PMULHUW is available here, so the weights are
 * used at full 16-bit precision.
 */
static void rescale_mmxext(const uint8_t *src1, const uint8_t *src2,
                           uint8_t *dest, int bytes,
                           uint32_t weight1, uint32_t weight2)
{
    if (bytes >= 8) {
        int count = bytes & ~7;

        /* Single statement so that no register state is carried between
         * separate asm blocks.  The byte counter is an in/out operand
         * because the loop decrements it, and everything the code
         * overwrites is listed as clobbered.  (__asm__ spelling so the
         * code also builds in strict ISO modes.) */
        __asm__ __volatile__(
            "movd %[w1], %%mm4\n\t"
            "movd %[w2], %%mm5\n\t"
            "pshufw $0, %%mm4, %%mm4\n\t"       /* MM4: W1 W1 W1 W1 */
            "pshufw $0, %%mm5, %%mm5\n\t"       /* MM5: W2 W2 W2 W2 */
            /* Build the rounding constant 0x0080 in each word of MM6:
             * 0 - (-1) = 1, then shift left by 7. */
            "pxor %%mm6, %%mm6\n\t"
            "pcmpeqw %%mm7, %%mm7\n\t"
            "psubw %%mm7, %%mm6\n\t"
            "psllw $7, %%mm6\n"
            "0:\n\t"
            /* src1: interleaving with a zeroed register puts each data
             * byte in the high byte of a word (8.8 fixed point) */
            "movq -8(%%esi,%%ecx), %%mm7\n\t"
            "pxor %%mm0, %%mm0\n\t"
            "punpcklbw %%mm7, %%mm0\n\t"
            "pmulhuw %%mm4, %%mm0\n\t"          /* result: 0000..FF00 */
            "pxor %%mm1, %%mm1\n\t"
            "punpckhbw %%mm7, %%mm1\n\t"
            "pmulhuw %%mm4, %%mm1\n\t"
            /* src2: same, multiplied by W2 */
            "movq -8(%%edx,%%ecx), %%mm7\n\t"
            "pxor %%mm2, %%mm2\n\t"
            "punpcklbw %%mm7, %%mm2\n\t"
            "pmulhuw %%mm5, %%mm2\n\t"
            "pxor %%mm3, %%mm3\n\t"
            "punpckhbw %%mm7, %%mm3\n\t"
            "pmulhuw %%mm5, %%mm3\n\t"
            /* sum, round, shift back down to 00..FF, repack to bytes */
            "paddw %%mm2, %%mm0\n\t"
            "paddw %%mm6, %%mm0\n\t"
            "psrlw $8, %%mm0\n\t"
            "paddw %%mm3, %%mm1\n\t"
            "paddw %%mm6, %%mm1\n\t"
            "psrlw $8, %%mm1\n\t"
            "packuswb %%mm1, %%mm0\n\t"
            "movq %%mm0, -8(%%edi,%%ecx)\n\t"
            "subl $8, %%ecx\n\t"
            "jnz 0b\n\t"
            "emms"
            : "+c" (count)
            : "S" (src1), "d" (src2), "D" (dest),
              [w1] "m" (weight1), [w2] "m" (weight2)
            : "memory", "cc",
              "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7");
    }
    if (UNLIKELY(bytes & 7)) {
        /* Leftover bytes after the last full group of 8. */
        const int done = bytes & ~7;
        rescale(src1 + done, src2 + done, dest + done,
                bytes & 7, weight1, weight2);
    }
}
|
|
|
|
#endif /* (HAVE_ASM_MMXEXT || HAVE_ASM_SSE) && ARCH_X86 */
|
|
|
|
/*************************************************************************/
|
|
|
|
/* SSE2 version */
|
|
|
|
#if defined(HAVE_ASM_SSE2)
|
|
|
|
/* Names of the counter and pointer registers as spliced into the SSE2
 * assembly text: the 64-bit registers when ARCH_X86_64 is defined, the
 * 32-bit ones otherwise.  The doubled '%' is the escape needed inside an
 * asm template that has operands. */
#if defined(ARCH_X86_64)
#  define ECX "%%rcx"
#  define EDX "%%rdx"
#  define ESI "%%rsi"
#  define EDI "%%rdi"
#else  /* !ARCH_X86_64 */
#  define ECX "%%ecx"
#  define EDX "%%edx"
#  define ESI "%%esi"
#  define EDI "%%edi"
#endif
|
|
|
|
static void rescale_sse2(const uint8_t *src1, const uint8_t *src2,
|
|
uint8_t *dest, int bytes,
|
|
uint32_t weight1, uint32_t weight2)
|
|
{
|
|
if (bytes >= 16) {
|
|
asm("movd %%eax, %%xmm4; movd %%edx, %%xmm5"
|
|
: : "a" (weight1<<16|weight1), "d" (weight2<<16|weight2));
|
|
asm("\
|
|
pshufd $0, %%xmm4, %%xmm4 # XMM4: W1 W1 W1 W1 W1 W1 W1 W1 \n\
|
|
pshufd $0, %%xmm5, %%xmm5 # XMM5: W2 W2 W2 W2 W2 W2 W2 W2 \n\
|
|
pxor %%xmm6, %%xmm6 # Put 0x0080*4 in XMM6 (rounding)\n\
|
|
pcmpeqw %%xmm7, %%xmm7 \n\
|
|
psubw %%xmm7, %%xmm6 \n\
|
|
psllw $7, %%xmm6 \n\
|
|
0: \n\
|
|
movdqu -16("ESI","ECX"), %%xmm7 \n\
|
|
pxor %%xmm0, %%xmm0 \n\
|
|
punpcklbw %%xmm7, %%xmm0 \n\
|
|
pmulhuw %%xmm4, %%xmm0 \n\
|
|
pxor %%xmm1, %%xmm1 \n\
|
|
punpckhbw %%xmm7, %%xmm1 \n\
|
|
pmulhuw %%xmm4, %%xmm1 \n\
|
|
movdqu -16("EDX","ECX"), %%xmm7 \n\
|
|
pxor %%xmm2, %%xmm2 \n\
|
|
punpcklbw %%xmm7, %%xmm2 \n\
|
|
pmulhuw %%xmm5, %%xmm2 \n\
|
|
pxor %%xmm3, %%xmm3 \n\
|
|
punpckhbw %%xmm7, %%xmm3 \n\
|
|
pmulhuw %%xmm5, %%xmm3 \n\
|
|
paddw %%xmm2, %%xmm0 \n\
|
|
paddw %%xmm6, %%xmm0 \n\
|
|
psrlw $8, %%xmm0 \n\
|
|
paddw %%xmm3, %%xmm1 \n\
|
|
paddw %%xmm6, %%xmm1 \n\
|
|
psrlw $8, %%xmm1 \n\
|
|
packuswb %%xmm1, %%xmm0 \n\
|
|
movdqu %%xmm0, -16("EDI","ECX") \n\
|
|
subl $16, %%ecx \n\
|
|
jnz 0b \n\
|
|
emms"
|
|
: /* no outputs */
|
|
: "S" (src1), "d" (src2), "D" (dest), "c" (bytes & ~15));
|
|
}
|
|
if (UNLIKELY(bytes & 15)) {
|
|
rescale(src1+(bytes & ~15), src2+(bytes & ~15), dest+(bytes & ~15),
|
|
bytes & 15, weight1, weight2);
|
|
}
|
|
}
|
|
|
|
#endif /* HAVE_ASM_SSE2 */
|
|
|
|
/*************************************************************************/
|
|
/*************************************************************************/
|
|
|
|
/* Initialization routine. */
|
|
|
|
int ac_rescale_init(int accel)
|
|
{
|
|
rescale_ptr = rescale;
|
|
|
|
#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
|
|
if (HAS_ACCEL(accel, AC_MMX))
|
|
rescale_ptr = rescale_mmx;
|
|
#endif
|
|
#if (defined(HAVE_ASM_MMXEXT) || defined(HAVE_ASM_SSE)) && defined(ARCH_X86)
|
|
if (HAS_ACCEL(accel, AC_MMXEXT) || HAS_ACCEL(accel, AC_SSE))
|
|
rescale_ptr = rescale_mmxext;
|
|
#endif
|
|
#if defined(HAVE_ASM_SSE2)
|
|
if (HAS_ACCEL(accel, AC_SSE2))
|
|
rescale_ptr = rescale_sse2;
|
|
#endif
|
|
|
|
return 1;
|
|
}
|
|
|
|
/*************************************************************************/
|
|
|
|
/*
|
|
* Local variables:
|
|
* c-file-style: "stroustrup"
|
|
* c-file-offsets: ((case-label . *) (statement-case-intro . *))
|
|
* indent-tabs-mode: nil
|
|
* End:
|
|
*
|
|
* vim: expandtab shiftwidth=4:
|
|
*/
|