/*
* img_yuv_mixed.c - YUV planar<->packed image format conversion routines
* Written by Andrew Church <achurch@achurch.org>
*
* This file is part of transcode, a video stream processing tool.
* transcode is free software, distributable under the terms of the GNU
* General Public License (version 2 or later). See the file COPYING
* for details.
*/
#include "ac.h"
#include "imgconvert.h"
#include "img_internal.h"
/*************************************************************************/
/*************************************************************************/
/* Standard C implementations */
/*************************************************************************/
/* Wrappers for UYVY and YVYU */
/* Note: we rely on YUY2<->{UYVY,YVYU} working for src==dest */
/* FIXME: when converting from UYVY/YVYU, src is destroyed! */
static int uyvy_yvyu_wrapper(uint8_t **src, ImageFormat srcfmt,
uint8_t **dest, ImageFormat destfmt,
int width, int height)
{
if (srcfmt == IMG_UYVY || srcfmt == IMG_YVYU)
return ac_imgconvert(src, srcfmt, src, IMG_YUY2, width, height)
&& ac_imgconvert(src, IMG_YUY2, dest, destfmt, width, height);
else
return ac_imgconvert(src, srcfmt, dest, IMG_YUY2, width, height)
&& ac_imgconvert(dest, IMG_YUY2, dest, destfmt, width, height);
}
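/* The FIXME above could be addressed by bouncing the packed source through a
 * scratch buffer so the caller's data survives.  The sketch below is purely
 * illustrative and is not part of the library: the function name is made up,
 * and it assumes <stdlib.h>/<string.h> for malloc()/memcpy(). */
#if 0
static int uyvy_yvyu_wrapper_nondestructive(uint8_t **src, ImageFormat srcfmt,
                                            uint8_t **dest, ImageFormat destfmt,
                                            int width, int height)
{
    if (srcfmt == IMG_UYVY || srcfmt == IMG_YVYU) {
        /* Copy the packed source (2 bytes/pixel) and convert the copy. */
        uint8_t *tmp = malloc((size_t)width * height * 2);
        uint8_t *tmpplane[1];
        int ok;
        if (!tmp)
            return 0;
        memcpy(tmp, src[0], (size_t)width * height * 2);
        tmpplane[0] = tmp;
        ok = ac_imgconvert(tmpplane, srcfmt, tmpplane, IMG_YUY2, width, height)
          && ac_imgconvert(tmpplane, IMG_YUY2, dest, destfmt, width, height);
        free(tmp);
        return ok;
    }
    return uyvy_yvyu_wrapper(src, srcfmt, dest, destfmt, width, height);
}
#endif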
static int yuv420p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YUV420P, dest, IMG_UYVY, width, height); }
static int yuv420p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YUV420P, dest, IMG_YVYU, width, height); }
static int yuv411p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YUV411P, dest, IMG_UYVY, width, height); }
static int yuv411p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YUV411P, dest, IMG_YVYU, width, height); }
static int yuv422p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YUV422P, dest, IMG_UYVY, width, height); }
static int yuv422p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YUV422P, dest, IMG_YVYU, width, height); }
static int yuv444p_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YUV444P, dest, IMG_UYVY, width, height); }
static int yuv444p_yvyu(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YUV444P, dest, IMG_YVYU, width, height); }
static int uyvy_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV420P, width, height); }
static int yvyu_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV420P, width, height); }
static int uyvy_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV411P, width, height); }
static int yvyu_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV411P, width, height); }
static int uyvy_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV422P, width, height); }
static int yvyu_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV422P, width, height); }
static int uyvy_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_UYVY, dest, IMG_YUV444P, width, height); }
static int yvyu_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{ return uyvy_yvyu_wrapper(src, IMG_YVYU, dest, IMG_YUV444P, width, height); }
/*************************************************************************/
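/* Planar -> YUY2 conversions.  Chroma is packed according to the source
 * sampling: 4:2:0 and 4:1:1 reuse each U/V sample for its whole 2x2 or 1x4
 * luma block, 4:2:2 copies chroma through unchanged, and 4:4:4 averages each
 * horizontal pair of chroma samples down to one. */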
static int yuv420p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
{
int x, y;
for (y = 0; y < (height & ~1); y++) {
for (x = 0; x < (width & ~1); x += 2) {
dest[0][(y*width+x)*2 ] = src[0][y*width+x];
dest[0][(y*width+x)*2+1] = src[1][(y/2)*(width/2)+x/2];
dest[0][(y*width+x)*2+2] = src[0][y*width+x+1];
dest[0][(y*width+x)*2+3] = src[2][(y/2)*(width/2)+x/2];
}
}
return 1;
}
static int yuv411p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
{
int x, y;
for (y = 0; y < height; y++) {
for (x = 0; x < (width & ~1); x += 2) {
dest[0][(y*width+x)*2 ] = src[0][y*width+x];
dest[0][(y*width+x)*2+1] = src[1][y*(width/4)+x/4];
dest[0][(y*width+x)*2+2] = src[0][y*width+x+1];
dest[0][(y*width+x)*2+3] = src[2][y*(width/4)+x/4];
}
}
return 1;
}
static int yuv422p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
{
int i;
for (i = 0; i < (width/2)*height; i++) {
dest[0][i*4 ] = src[0][i*2];
dest[0][i*4+1] = src[1][i];
dest[0][i*4+2] = src[0][i*2+1];
dest[0][i*4+3] = src[2][i];
}
return 1;
}
static int yuv444p_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
{
int i;
for (i = 0; i < (width/2)*height; i++) {
dest[0][i*4 ] = src[0][i*2];
dest[0][i*4+1] = (src[1][i*2] + src[1][i*2+1]) / 2;
dest[0][i*4+2] = src[0][i*2+1];
dest[0][i*4+3] = (src[2][i*2] + src[2][i*2+1]) / 2;
}
return 1;
}
/*************************************************************************/
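/* YUY2 -> planar conversions.  Chroma is downsampled as needed: 4:2:0 output
 * averages the chroma of each row pair (with rounding), 4:1:1 averages the
 * two chroma samples in each group of four pixels, 4:2:2 copies chroma
 * through, and 4:4:4 duplicates each chroma sample to both pixels it covers. */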
static int yuy2_yuv420p(uint8_t **src, uint8_t **dest, int width, int height)
{
int x, y;
for (y = 0; y < (height & ~1); y++) {
for (x = 0; x < (width & ~1); x += 2) {
dest[0][y*width+x ] = src[0][(y*width+x)*2 ];
dest[0][y*width+x+1] = src[0][(y*width+x)*2+2];
if (y%2 == 0) {
dest[1][(y/2)*(width/2)+x/2] = src[0][(y*width+x)*2+1];
dest[2][(y/2)*(width/2)+x/2] = src[0][(y*width+x)*2+3];
} else {
dest[1][(y/2)*(width/2)+x/2] =
(dest[1][(y/2)*(width/2)+x/2] + src[0][(y*width+x)*2+1] + 1) / 2;
dest[2][(y/2)*(width/2)+x/2] =
(dest[2][(y/2)*(width/2)+x/2] + src[0][(y*width+x)*2+3] + 1) / 2;
}
}
}
return 1;
}
static int yuy2_yuv411p(uint8_t **src, uint8_t **dest, int width, int height)
{
int x, y;
for (y = 0; y < height; y++) {
for (x = 0; x < (width & ~3); x += 4) {
dest[0][y*width+x] = src[0][(y*width+x)*2 ];
dest[0][y*width+x+1] = src[0][(y*width+x)*2+2];
dest[0][y*width+x+2] = src[0][(y*width+x)*2+4];
dest[0][y*width+x+3] = src[0][(y*width+x)*2+6];
dest[1][y*(width/4)+x/4] = (src[0][(y*width+x)*2+1]
+ src[0][(y*width+x)*2+5] + 1) / 2;
dest[2][y*(width/4)+x/4] = (src[0][(y*width+x)*2+3]
+ src[0][(y*width+x)*2+7] + 1) / 2;
}
}
return 1;
}
static int yuy2_yuv422p(uint8_t **src, uint8_t **dest, int width, int height)
{
int i;
for (i = 0; i < (width/2)*height; i++) {
dest[0][i*2] = src[0][i*4 ];
dest[1][i] = src[0][i*4+1];
dest[0][i*2+1] = src[0][i*4+2];
dest[2][i] = src[0][i*4+3];
}
return 1;
}
static int yuy2_yuv444p(uint8_t **src, uint8_t **dest, int width, int height)
{
int i;
for (i = 0; i < (width & ~1)*height; i += 2) {
dest[0][i] = src[0][i*2 ];
dest[1][i] = src[0][i*2+1];
dest[1][i+1] = src[0][i*2+1];
dest[0][i+1] = src[0][i*2+2];
dest[2][i] = src[0][i*2+3];
dest[2][i+1] = src[0][i*2+3];
}
return 1;
}
/*************************************************************************/
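/* Y8 (grayscale) conversions.  Packing fills the chroma bytes with the
 * neutral value 128; unpacking extracts the luma bytes and discards chroma. */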
static int y8_yuy2(uint8_t **src, uint8_t **dest, int width, int height)
{
int i;
for (i = 0; i < width*height; i++) {
dest[0][i*2 ] = src[0][i];
dest[0][i*2+1] = 128;
}
return 1;
}
static int y8_uyvy(uint8_t **src, uint8_t **dest, int width, int height)
{
int i;
for (i = 0; i < width*height; i++) {
dest[0][i*2 ] = 128;
dest[0][i*2+1] = src[0][i];
}
return 1;
}
static int yuy2_y8(uint8_t **src, uint8_t **dest, int width, int height)
{
int i;
for (i = 0; i < width*height; i++)
dest[0][i] = src[0][i*2];
return 1;
}
static int uyvy_y8(uint8_t **src, uint8_t **dest, int width, int height)
{
int i;
for (i = 0; i < width*height; i++)
dest[0][i] = src[0][i*2+1];
return 1;
}
/*************************************************************************/
/*************************************************************************/
#if defined(HAVE_ASM_SSE2)
/* SSE2 routines. See comments in img_x86_common.h for why we don't bother
* unrolling the loops. */
/* Common macros/data for x86 code */
#include "img_x86_common.h"
/* YUV420P (1 row) or YUV422P -> YUY2 (unit: 2 pixels) */
#define YUV42XP_YUY2 \
SIMD_LOOP_WRAPPER( \
/* blocksize */ 8, \
/* push_regs */ PUSH(EBX), \
/* pop_regs */ POP(EBX), \
/* small_loop */ \
"movb -1("EDX","ECX"), %%bh \n\
movb -1("ESI","ECX",2), %%bl \n\
shll $16, %%ebx \n\
movb -1("EAX","ECX"), %%bh \n\
movb -2("ESI","ECX",2), %%bl \n\
movl %%ebx, -4("EDI","ECX",4)", \
/* main_loop */ \
"movdqu -16("ESI","ECX",2),%%xmm0 #XM0: YF YE YD ..... Y2 Y1 Y0 \n\
movq -8("EAX","ECX"), %%xmm2 # XMM2: U7 U6 U5 U4 U3 U2 U1 U0 \n\
movq -8("EDX","ECX"), %%xmm3 # XMM3: V7 V6 V5 V4 V3 V2 V1 V0 \n\
punpcklbw %%xmm3, %%xmm2 # XMM2: V7 U7 V6 ..... U1 V0 U0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
punpcklbw %%xmm2, %%xmm0 # XMM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
punpckhbw %%xmm2, %%xmm1 # XMM1: V7 YF U7 ..... Y9 U4 Y8 \n\
movdqu %%xmm0, -32("EDI","ECX",4) \n\
movdqu %%xmm1, -16("EDI","ECX",4)", \
/* emms */ "emms")
/* YUV411P -> YUY2 (unit: 4 pixels) */
#define YUV411P_YUY2 \
SIMD_LOOP_WRAPPER( \
/* blocksize */ 4, \
/* push_regs */ PUSH(EBX), \
/* pop_regs */ POP(EBX), \
/* small_loop */ \
"movb -1("EDX","ECX"), %%bh \n\
movb -1("ESI","ECX",4), %%bl \n\
shll $16, %%ebx \n\
movb -1("EAX","ECX"), %%bh \n\
movb -2("ESI","ECX",4), %%bl \n\
movl %%ebx, -4("EDI","ECX",8) \n\
movb -1("EDX","ECX"), %%bh \n\
movb -3("ESI","ECX",4), %%bl \n\
shll $16, %%ebx \n\
movb -1("EAX","ECX"), %%bh \n\
movb -4("ESI","ECX",4), %%bl \n\
movl %%ebx, -8("EDI","ECX",8)", \
/* main_loop */ \
"movdqu -16("ESI","ECX",4),%%xmm0 #XM0: YF YE YD ..... Y2 Y1 Y0 \n\
movd -4("EAX","ECX"), %%xmm2 # XMM2: U3 U2 U1 U0 \n\
punpcklbw %%xmm2, %%xmm2 # XMM2: U3 U3 U2 U2 U1 U1 U0 U0 \n\
movd -4("EDX","ECX"), %%xmm3 # XMM3: V3 V2 V1 V0 \n\
punpcklbw %%xmm3, %%xmm3 # XMM3: V3 V3 V2 V2 V1 V1 V0 V0 \n\
punpcklbw %%xmm3, %%xmm2 # XMM2: V3 U3 V3 ..... U0 V0 U0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
punpcklbw %%xmm2, %%xmm0 # XMM0: V1 Y7 U1 ..... Y1 U0 Y0 \n\
punpckhbw %%xmm2, %%xmm1 # XMM1: V3 YF U3 ..... Y9 U2 Y8 \n\
movdqu %%xmm0, -32("EDI","ECX",8) \n\
movdqu %%xmm1, -16("EDI","ECX",8)", \
/* emms */ "emms")
/* YUV444P -> YUY2 (unit: 2 pixels) */
#define YUV444P_YUY2 \
/* Load 0x00FF*8 into XMM7 for masking */ \
"pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
SIMD_LOOP_WRAPPER( \
/* blocksize */ 8, \
/* push_regs */ PUSH2(EBX,EBP), \
/* pop_regs */ POP2(EBP,EBX), \
/* small_loop */ \
"movzbl -1("EDX","ECX",2), %%ebx \n\
movzbl -2("EDX","ECX",2), %%ebp \n\
addl %%ebp, %%ebx \n\
shrl $1, %%ebx \n\
movb %%bl, -1("EDI","ECX",4) \n\
movb -1("ESI","ECX",2), %%bl \n\
movb %%bl, -2("EDI","ECX",4) \n\
movzbl -1("EAX","ECX",2), %%ebx \n\
movzbl -2("EAX","ECX",2), %%ebp \n\
addl %%ebp, %%ebx \n\
shrl $1, %%ebx \n\
movb %%bl, -3("EDI","ECX",4) \n\
movb -2("ESI","ECX",2), %%bl \n\
movb %%bl, -4("EDI","ECX",4)", \
/* main_loop */ \
"movdqu -16("ESI","ECX",2),%%xmm0 #XM0: YF YE YD ..... Y2 Y1 Y0 \n\
movdqu -16("EAX","ECX",2), %%xmm2 #XM2: UF UE UD ..... U2 U1 U0 \n\
movdqu -16("EDX","ECX",2), %%xmm3 #XM3: VF VE VD ..... V2 V1 V0 \n\
movdqa %%xmm2, %%xmm4 # XMM4: UF UE UD ..... U2 U1 U0 \n\
pand %%xmm7, %%xmm2 # XMM2: -- UE -- ..... U2 -- U0 \n\
psrlw $8, %%xmm4 # XMM4: -- UF -- ..... U3 -- U1 \n\
pavgw %%xmm4, %%xmm2 # XMM2: -- u7 -- ..... u1 -- u0 \n\
movdqa %%xmm3, %%xmm5 # XMM5: VF VE VD ..... V2 V1 V0 \n\
pand %%xmm7, %%xmm3 # XMM3: -- VE -- ..... V2 -- V0 \n\
psrlw $8, %%xmm5 # XMM5: -- VF -- ..... V3 -- V1 \n\
pavgw %%xmm5, %%xmm3 # XMM3: -- v7 -- ..... v1 -- v0 \n\
psllw $8, %%xmm3 # XMM3: v7 -- v6 ..... -- v0 -- \n\
por %%xmm3, %%xmm2 # XMM2: v7 u7 v6 ..... u1 v0 u0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
punpcklbw %%xmm2, %%xmm0 # XMM0: v3 Y7 u3 ..... Y1 u0 Y0 \n\
punpckhbw %%xmm2, %%xmm1 # XMM1: v7 YF u7 ..... Y9 u4 Y8 \n\
movdqu %%xmm0, -32("EDI","ECX",4) \n\
movdqu %%xmm1, -16("EDI","ECX",4)", \
/* emms */ "emms")
/* YUY2 -> YUV420P (U row) (unit: 2 pixels) */
#define YUY2_YUV420P_U \
/* Load 0x00FF*8 into XMM7 for masking */ \
"pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
SIMD_LOOP_WRAPPER( \
/* blocksize */ 4, \
/* push_regs */ PUSH2(EBX,EBP), \
/* pop_regs */ POP2(EBP,EBX), \
/* small_loop */ \
"movb -4("ESI","ECX",4), %%bl \n\
movb %%bl, -2("EDI","ECX",2) \n\
movb -2("ESI","ECX",4), %%bl \n\
movb %%bl, -1("EDI","ECX",2) \n\
movzbl -3("ESI","ECX",4), %%ebx \n\
movzbl -3("EAX","ECX",4), %%ebp \n\
addl %%ebp, %%ebx \n\
shrl $1, %%ebx \n\
movb %%bl, -1("EDX","ECX")", \
/* main_loop */ \
"movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
movdqu -16("EAX","ECX",4),%%xmm2 #XMM2: Vd Yh Ud ..... Yb Ua Ya \n\
pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\
psrlw $8, %%xmm2 # XMM2: -- Vd -- ..... Va -- Ua \n\
pavgw %%xmm2, %%xmm1 # XMM1: -- v3 -- ..... v0 -- u0 \n\
packuswb %%xmm1, %%xmm1 # XMM1: v3 u3 v2 u2 v1 u1 v0 u0 \n\
pand %%xmm7, %%xmm1 # XMM1: -- u3 -- u2 -- u1 -- u0 \n\
packuswb %%xmm1, %%xmm1 # XMM1: u3 u2 u1 u0 \n\
movq %%xmm0, -8("EDI","ECX",2) \n\
movd %%xmm1, -4("EDX","ECX")", \
/* emms */ "emms")
/* YUY2 -> YUV420P (V row) (unit: 2 pixels) */
#define YUY2_YUV420P_V \
/* Load 0x00FF*8 into XMM7 for masking */ \
"pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
SIMD_LOOP_WRAPPER( \
/* blocksize */ 4, \
/* push_regs */ PUSH2(EBX,EBP), \
/* pop_regs */ POP2(EBP,EBX), \
/* small_loop */ \
"movb -4("ESI","ECX",4), %%bl \n\
movb %%bl, -2("EDI","ECX",2) \n\
movb -2("ESI","ECX",4), %%bl \n\
movb %%bl, -1("EDI","ECX",2) \n\
movzbl -1("ESI","ECX",4), %%ebx \n\
movzbl -1("EAX","ECX",4), %%ebp \n\
addl %%ebp, %%ebx \n\
shrl $1, %%ebx \n\
movb %%bl, -1("EDX","ECX")", \
/* main_loop */ \
"movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
movdqu -16("EAX","ECX",4),%%xmm2 #XMM2: Vd Yh Ud ..... Yb Ua Ya \n\
pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\
psrlw $8, %%xmm2 # XMM2: -- Vd -- ..... Va -- Ua \n\
pavgw %%xmm1, %%xmm2 # XMM2: -- v3 -- ..... v0 -- u0 \n\
packuswb %%xmm2, %%xmm2 # XMM2: v3 u3 v2 u2 v1 u1 v0 u0 \n\
psrlw $8, %%xmm2 # XMM2: -- v3 -- v2 -- v1 -- v0 \n\
packuswb %%xmm2, %%xmm2 # XMM2: v3 v2 v1 v0 \n\
movq %%xmm0, -8("EDI","ECX",2) \n\
movd %%xmm2, -4("EDX","ECX")", \
/* emms */ "emms")
/* YUY2 -> YUV411P (unit: 4 pixels) */
#define YUY2_YUV411P \
/* Load 0x000..000FFFFFFFF into XMM6, 0x00FF*8 into XMM7 for masking */ \
"pcmpeqd %%xmm6, %%xmm6; psrldq $12, %%xmm6;" \
"pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
SIMD_LOOP_WRAPPER( \
/* blocksize */ 2, \
/* push_regs */ PUSH2(EBX,EBP), \
/* pop_regs */ POP2(EBP,EBX), \
/* small_loop */ \
"movb -8("ESI","ECX",8), %%bl \n\
movb %%bl, -4("EDI","ECX",4) \n\
movb -6("ESI","ECX",8), %%bl \n\
movb %%bl, -3("EDI","ECX",4) \n\
movb -4("ESI","ECX",8), %%bl \n\
movb %%bl, -2("EDI","ECX",4) \n\
movb -2("ESI","ECX",8), %%bl \n\
movb %%bl, -1("EDI","ECX",4) \n\
movzbl -7("ESI","ECX",8), %%ebx \n\
movzbl -3("ESI","ECX",8), %%ebp \n\
addl %%ebp, %%ebx \n\
shrl $1, %%ebx \n\
movb %%bl, -1("EAX","ECX") \n\
movzbl -5("ESI","ECX",8), %%ebx \n\
movzbl -1("ESI","ECX",8), %%ebp \n\
addl %%ebp, %%ebx \n\
shrl $1, %%ebx \n\
movb %%bl, -1("EDX","ECX")", \
/* main_loop */ \
"movdqu -16("ESI","ECX",8),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\
packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\
movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\
psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\
packuswb %%xmm1, %%xmm1 # XMM1: U3 U2 U1 U0 \n\
packuswb %%xmm2, %%xmm2 # XMM2: V3 V2 V1 V0 \n\
pand %%xmm6, %%xmm1 # XMM1: -- -- -- -- U3 U2 U1 U0 \n\
psllq $32, %%xmm2 # XMM2: V3 V2 V1 V0 -- -- -- -- \n\
por %%xmm1, %%xmm2 # XMM2: V3 V2 V1 V0 U3 U2 U1 U0 \n\
movdqa %%xmm2, %%xmm1 # XMM1: V3 V2 V1 V0 U3 U2 U1 U0 \n\
pand %%xmm7, %%xmm1 # XMM1: -- V2 -- V0 -- U2 -- U0 \n\
psrlw $8, %%xmm2 # XMM2: -- V3 -- V1 -- U3 -- U1 \n\
pavgw %%xmm2, %%xmm1 # XMM1: -- v1 -- v0 -- u1 -- u0 \n\
packuswb %%xmm1, %%xmm1 # XMM1: v1 v0 u1 u0 \n\
movq %%xmm0, -8("EDI","ECX",4) \n\
movd %%xmm1, %%ebx \n\
movw %%bx, -2("EAX","ECX") \n\
shrl $16, %%ebx; \n\
movw %%bx, -2("EDX","ECX")", \
/* emms */ "emms")
/* YUY2 -> YUV422P (unit: 2 pixels) */
#define YUY2_YUV422P \
/* Load 0x00FF*8 into XMM7 for masking */ \
"pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
SIMD_LOOP_WRAPPER( \
/* blocksize */ 4, \
/* push_regs */ PUSH(EBX), \
/* pop_regs */ POP(EBX), \
/* small_loop */ \
"movb -4("ESI","ECX",4), %%bl \n\
movb %%bl, -2("EDI","ECX",2) \n\
movb -2("ESI","ECX",4), %%bl \n\
movb %%bl, -1("EDI","ECX",2) \n\
movb -3("ESI","ECX",4), %%bl \n\
movb %%bl, -1("EAX","ECX") \n\
movb -1("ESI","ECX",4), %%bl \n\
movb %%bl, -1("EDX","ECX")", \
/* main_loop */ \
"movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\
packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\
movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\
psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\
packuswb %%xmm1, %%xmm1 # XMM1: U3 U2 U1 U0 \n\
packuswb %%xmm2, %%xmm2 # XMM2: V3 V2 V1 V0 \n\
movq %%xmm0, -8("EDI","ECX",2) \n\
movd %%xmm1, -4("EAX","ECX") \n\
movd %%xmm2, -4("EDX","ECX")", \
/* emms */ "emms")
/* YUY2 -> YUV444P (unit: 2 pixels) */
#define YUY2_YUV444P \
/* Load 0x00FF*8 into XMM7 for masking */ \
"pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
SIMD_LOOP_WRAPPER( \
/* blocksize */ 4, \
/* push_regs */ PUSH(EBX), \
/* pop_regs */ POP(EBX), \
/* small_loop */ \
"movb -4("ESI","ECX",4), %%bl \n\
movb %%bl, -2("EDI","ECX",2) \n\
movb -2("ESI","ECX",4), %%bl \n\
movb %%bl, -1("EDI","ECX",2) \n\
movb -3("ESI","ECX",4), %%bl \n\
movb %%bl, -2("EAX","ECX",2) \n\
movb %%bl, -1("EAX","ECX",2) \n\
movb -1("ESI","ECX",4), %%bl \n\
movb %%bl, -2("EDX","ECX",2) \n\
movb %%bl, -1("EDX","ECX",2)", \
/* main_loop */ \
"movdqu -16("ESI","ECX",4),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: V3 Y7 U3 ..... Y1 U0 Y0 \n\
pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
psrlw $8, %%xmm1 # XMM1: -- V3 -- ..... V0 -- U0 \n\
packuswb %%xmm1, %%xmm1 # XMM1: V3 U3 V2 U2 V1 U1 V0 U0 \n\
movdqa %%xmm1, %%xmm2 # XMM2: V3 U3 V2 U2 V1 U1 V0 U0 \n\
pand %%xmm7, %%xmm1 # XMM1: -- U3 -- U2 -- U1 -- U0 \n\
psrlw $8, %%xmm2 # XMM2: -- V3 -- V2 -- V1 -- V0 \n\
movdqa %%xmm1, %%xmm3 # XMM3: -- U3 -- U2 -- U1 -- U0 \n\
psllw $8, %%xmm3 # XMM3: U3 -- U2 -- U1 -- U0 -- \n\
por %%xmm3, %%xmm1 # XMM1: U3 U3 U2 U2 U1 U1 U0 U0 \n\
movdqa %%xmm2, %%xmm3 # XMM3: -- V3 -- V2 -- V1 -- V0 \n\
psllw $8, %%xmm3 # XMM3: V3 -- V2 -- V1 -- V0 -- \n\
por %%xmm3, %%xmm2 # XMM2: V3 V3 V2 V2 V1 V1 V0 V0 \n\
movq %%xmm0, -8("EDI","ECX",2) \n\
movq %%xmm1, -8("EAX","ECX",2) \n\
movq %%xmm2, -8("EDX","ECX",2)", \
/* emms */ "emms")
/* Y8 -> YUY2/YVYU (unit: 1 pixel) */
#define Y8_YUY2 \
/* Load 0x80*16 into XMM7 for interlacing U/V */ \
"pcmpeqd %%xmm7, %%xmm7; psllw $7, %%xmm7; packsswb %%xmm7, %%xmm7;"\
SIMD_LOOP_WRAPPER( \
/* blocksize */ 16, \
/* push_regs */ PUSH(EBX), \
/* pop_regs */ POP(EBX), \
/* small_loop */ \
"movb -1("ESI","ECX"), %%al \n\
movb %%al, -2("EDI","ECX",2) \n\
movb $0x80, -1("EDI","ECX",2)", \
/* main_loop */ \
"movdqu -16("ESI","ECX"),%%xmm0 # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\
movdqa %%xmm0, %%xmm1 # XMM1: YF YE YD ..... Y2 Y1 Y0 \n\
punpcklbw %%xmm7, %%xmm0 # XMM0: 80 Y7 80 ..... Y1 80 Y0 \n\
movdqu %%xmm0, -32("EDI","ECX",2) \n\
punpckhbw %%xmm7, %%xmm1 # XMM1: 80 YF 80 ..... Y9 80 Y8 \n\
movdqu %%xmm1, -16("EDI","ECX",2)", \
/* emms */ "emms")
/* Y8 -> UYVY (unit: 1 pixel) */
#define Y8_UYVY \
/* Load 0x80*16 into XMM7 for interlacing U/V */ \
"pcmpeqd %%xmm7, %%xmm7; psllw $7, %%xmm7; packsswb %%xmm7, %%xmm7;"\
SIMD_LOOP_WRAPPER( \
/* blocksize */ 16, \
/* push_regs */ "", \
/* pop_regs */ "", \
/* small_loop */ \
"movb -1("ESI","ECX"), %%al \n\
movb %%al, -1("EDI","ECX",2) \n\
movb $0x80, -2("EDI","ECX",2)", \
/* main_loop */ \
"movdqu -16("ESI","ECX"),%%xmm0 # XMM0: YF YE YD ..... Y2 Y1 Y0 \n\
movdqa %%xmm7, %%xmm1 # XMM1: 80 80 80 ..... 80 80 80 \n\
punpcklbw %%xmm0, %%xmm1 # XMM1: Y7 80 Y6 ..... 80 Y0 80 \n\
movdqu %%xmm1, -32("EDI","ECX",2) \n\
movdqa %%xmm7, %%xmm2 # XMM2: 80 80 80 ..... 80 80 80 \n\
punpckhbw %%xmm0, %%xmm2 # XMM2: YF 80 YE ..... 80 Y8 80 \n\
movdqu %%xmm2, -16("EDI","ECX",2)", \
/* emms */ "emms")
/* YUY2/YVYU -> Y8 (unit: 1 pixel) */
#define YUY2_Y8 \
/* Load 0x00FF*8 into XMM7 for masking */ \
"pcmpeqd %%xmm7, %%xmm7; psrlw $8, %%xmm7;" \
SIMD_LOOP_WRAPPER( \
/* blocksize */ 8, \
/* push_regs */ "", \
/* pop_regs */ "", \
/* small_loop */ \
"movb -2("ESI","ECX",2), %%al \n\
movb %%al, -1("EDI","ECX")", \
/* main_loop */ \
"movdqu -16("ESI","ECX",2),%%xmm0 #XM0: V3 Y7 U3 ..... Y1 U0 Y0 \n\
pand %%xmm7, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
movq %%xmm0, -8("EDI","ECX")", \
/* emms */ "emms")
/* UYVY -> Y8 (unit: 1 pixel) */
#define UYVY_Y8 \
SIMD_LOOP_WRAPPER( \
/* blocksize */ 8, \
/* push_regs */ "", \
/* pop_regs */ "", \
/* small_loop */ \
"movb -1("ESI","ECX",2), %%al \n\
movb %%al, -1("EDI","ECX")", \
/* main_loop */ \
"movdqu -16("ESI","ECX",2),%%xmm0 #XM0: Y7 V3 Y6 ..... V0 Y0 U0 \n\
psrlw $8, %%xmm0 # XMM0: -- Y7 -- ..... Y1 -- Y0 \n\
packuswb %%xmm0, %%xmm0 # XMM0: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 \n\
movq %%xmm0, -8("EDI","ECX")", \
/* emms */ "emms")
/*************************************************************************/
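/* Driver functions.  Conversions with vertically subsampled chroma always
 * work row by row (or row pair by row pair); the others hand the whole frame
 * to the SIMD loop in a single call when the width is a multiple of the
 * conversion unit, and fall back to one call per row otherwise. */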
static int yuv420p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
int y;
for (y = 0; y < (height & ~1); y++) {
int dummy;
asm volatile(YUV42XP_YUY2
: "=c" (dummy) // Ensure GCC reloads ECX each time through
: "S" (src[0]+y*width), "a" (src[1]+(y/2)*(width/2)),
"d" (src[2]+(y/2)*(width/2)), "D" (dest[0]+y*width*2),
"0" (width/2)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG
#endif
);
}
return 1;
}
static int yuv411p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
if (!(width & 3)) {
asm(YUV411P_YUY2
: /* no outputs */
: "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]),
"c" ((width/4)*height)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG
#endif
);
} else {
int y;
for (y = 0; y < height; y++) {
int dummy;
asm volatile(YUV411P_YUY2
: "=c" (dummy)
: "S" (src[0]+y*width), "a" (src[1]+y*(width/4)),
"d" (src[2]+y*(width/4)), "D" (dest[0]+y*width*2),
"0" (width/4)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG
#endif
);
}
}
return 1;
}
static int yuv422p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
if (!(width & 1)) {
asm(YUV42XP_YUY2
: /* no outputs */
: "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]),
"c" ((width/2)*height)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG
#endif
);
} else {
int y;
for (y = 0; y < height; y++) {
int dummy;
asm volatile(YUV42XP_YUY2
: "=c" (dummy)
: "S" (src[0]+y*width), "a" (src[1]+y*(width/2)),
"d" (src[2]+y*(width/2)), "D" (dest[0]+y*width*2),
"0" (width/2)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG
#endif
);
}
}
return 1;
}
static int yuv444p_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
if (!(width & 1)) {
asm(YUV444P_YUY2
: /* no outputs */
: "S" (src[0]), "a" (src[1]), "d" (src[2]), "D" (dest[0]),
"c" ((width/2)*height)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
);
} else {
int y;
for (y = 0; y < height; y++) {
int dummy;
asm volatile(YUV444P_YUY2
: "=c" (dummy)
: "S" (src[0]+y*width), "a" (src[1]+y*(width/2)),
"d" (src[2]+y*(width/2)), "D" (dest[0]+y*width*2),
"0" (width/2)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
);
}
}
return 1;
}
/*************************************************************************/
static int yuy2_yuv420p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
int y;
for (y = 0; y < (height & ~1); y += 2) {
int dummy;
asm volatile(YUY2_YUV420P_U
: "=c" (dummy)
: "S" (src[0]+y*width*2), "a" (src[0]+(y+1)*width*2),
"D" (dest[0]+y*width), "d" (dest[1]+(y/2)*(width/2)),
"0" (width/2)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
);
asm volatile(YUY2_YUV420P_V
: "=c" (dummy)
: "S" (src[0]+(y+1)*width*2), "a" (src[0]+y*width*2),
"D" (dest[0]+(y+1)*width), "d" (dest[2]+(y/2)*(width/2)),
"0" (width/2)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
);
}
return 1;
}
static int yuy2_yuv411p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
if (!(width & 3)) {
asm(YUY2_YUV411P
: /* no outputs */
: "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]),
"c" ((width/4)*height)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
);
} else {
int y;
for (y = 0; y < height; y++) {
int dummy;
asm volatile(YUY2_YUV411P
: "=c" (dummy)
: "S" (src[0]+y*width*2), "D" (dest[0]+y*width),
"a" (dest[1]+y*(width/4)), "d" (dest[2]+y*(width/4)),
"0" (width/4)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG, FAKE_PUSH_REG_2
#endif
);
}
}
return 1;
}
static int yuy2_yuv422p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
if (!(width & 1)) {
asm(YUY2_YUV422P
: /* no outputs */
: "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]),
"c" ((width/2)*height)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG
#endif
);
} else {
int y;
for (y = 0; y < height; y++) {
int dummy;
asm volatile(YUY2_YUV422P
: "=c" (dummy)
: "S" (src[0]+y*width*2), "D" (dest[0]+y*width),
"a" (dest[1]+y*(width/2)), "d" (dest[2]+y*(width/2)),
"0" (width/2)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG
#endif
);
}
}
return 1;
}
static int yuy2_yuv444p_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
if (!(width & 1)) {
asm(YUY2_YUV444P
: /* no outputs */
: "S" (src[0]), "D" (dest[0]), "a" (dest[1]), "d" (dest[2]),
"c" ((width/2)*height)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG
#endif
);
} else {
int y;
for (y = 0; y < height; y++) {
int dummy;
asm volatile(YUY2_YUV444P
: "=c" (dummy)
: "S" (src[0]+y*width*2), "D" (dest[0]+y*width),
"a" (dest[1]+y*width), "d" (dest[2]+y*width),
"0" (width/2)
#ifdef ARCH_X86_64
: FAKE_PUSH_REG
#endif
);
}
}
return 1;
}
/*************************************************************************/
static int y8_yuy2_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
asm(Y8_YUY2
: /* no outputs */
: "S" (src[0]), "D" (dest[0]), "c" (width*height)
: "eax" COMMA_FAKE_PUSH_REG
);
return 1;
}
static int y8_uyvy_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
asm(Y8_UYVY
: /* no outputs */
: "S" (src[0]), "D" (dest[0]), "c" (width*height)
: "eax");
return 1;
}
static int yuy2_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
asm(YUY2_Y8
: /* no outputs */
: "S" (src[0]), "D" (dest[0]), "c" (width*height)
: "eax");
return 1;
}
static int uyvy_y8_sse2(uint8_t **src, uint8_t **dest, int width, int height)
{
asm(UYVY_Y8
: /* no outputs */
: "S" (src[0]), "D" (dest[0]), "c" (width*height)
: "eax");
return 1;
}
/*************************************************************************/
#endif /* HAVE_ASM_SSE2 */
/*************************************************************************/
/*************************************************************************/
/* Initialization */
int ac_imgconvert_init_yuv_mixed(int accel)
{
if (!register_conversion(IMG_YUV420P, IMG_YUY2, yuv420p_yuy2)
|| !register_conversion(IMG_YUV411P, IMG_YUY2, yuv411p_yuy2)
|| !register_conversion(IMG_YUV422P, IMG_YUY2, yuv422p_yuy2)
|| !register_conversion(IMG_YUV444P, IMG_YUY2, yuv444p_yuy2)
|| !register_conversion(IMG_Y8, IMG_YUY2, y8_yuy2)
|| !register_conversion(IMG_YUV420P, IMG_UYVY, yuv420p_uyvy)
|| !register_conversion(IMG_YUV411P, IMG_UYVY, yuv411p_uyvy)
|| !register_conversion(IMG_YUV422P, IMG_UYVY, yuv422p_uyvy)
|| !register_conversion(IMG_YUV444P, IMG_UYVY, yuv444p_uyvy)
|| !register_conversion(IMG_Y8, IMG_UYVY, y8_uyvy)
|| !register_conversion(IMG_YUV420P, IMG_YVYU, yuv420p_yvyu)
|| !register_conversion(IMG_YUV411P, IMG_YVYU, yuv411p_yvyu)
|| !register_conversion(IMG_YUV422P, IMG_YVYU, yuv422p_yvyu)
|| !register_conversion(IMG_YUV444P, IMG_YVYU, yuv444p_yvyu)
|| !register_conversion(IMG_Y8, IMG_YVYU, y8_yuy2)
|| !register_conversion(IMG_YUY2, IMG_YUV420P, yuy2_yuv420p)
|| !register_conversion(IMG_YUY2, IMG_YUV411P, yuy2_yuv411p)
|| !register_conversion(IMG_YUY2, IMG_YUV422P, yuy2_yuv422p)
|| !register_conversion(IMG_YUY2, IMG_YUV444P, yuy2_yuv444p)
|| !register_conversion(IMG_YUY2, IMG_Y8, yuy2_y8)
|| !register_conversion(IMG_UYVY, IMG_YUV420P, uyvy_yuv420p)
|| !register_conversion(IMG_UYVY, IMG_YUV411P, uyvy_yuv411p)
|| !register_conversion(IMG_UYVY, IMG_YUV422P, uyvy_yuv422p)
|| !register_conversion(IMG_UYVY, IMG_YUV444P, uyvy_yuv444p)
|| !register_conversion(IMG_UYVY, IMG_Y8, uyvy_y8)
|| !register_conversion(IMG_YVYU, IMG_YUV420P, yvyu_yuv420p)
|| !register_conversion(IMG_YVYU, IMG_YUV411P, yvyu_yuv411p)
|| !register_conversion(IMG_YVYU, IMG_YUV422P, yvyu_yuv422p)
|| !register_conversion(IMG_YVYU, IMG_YUV444P, yvyu_yuv444p)
|| !register_conversion(IMG_YVYU, IMG_Y8, yuy2_y8)
) {
return 0;
}
#if defined(HAVE_ASM_SSE2)
if (accel & AC_SSE2) {
if (!register_conversion(IMG_YUV420P, IMG_YUY2, yuv420p_yuy2_sse2)
|| !register_conversion(IMG_YUV411P, IMG_YUY2, yuv411p_yuy2_sse2)
|| !register_conversion(IMG_YUV422P, IMG_YUY2, yuv422p_yuy2_sse2)
|| !register_conversion(IMG_YUV444P, IMG_YUY2, yuv444p_yuy2_sse2)
|| !register_conversion(IMG_Y8, IMG_YUY2, y8_yuy2_sse2)
|| !register_conversion(IMG_Y8, IMG_UYVY, y8_uyvy_sse2)
|| !register_conversion(IMG_Y8, IMG_YVYU, y8_yuy2_sse2)
|| !register_conversion(IMG_YUY2, IMG_YUV420P, yuy2_yuv420p_sse2)
|| !register_conversion(IMG_YUY2, IMG_YUV411P, yuy2_yuv411p_sse2)
|| !register_conversion(IMG_YUY2, IMG_YUV422P, yuy2_yuv422p_sse2)
|| !register_conversion(IMG_YUY2, IMG_YUV444P, yuy2_yuv444p_sse2)
|| !register_conversion(IMG_YUY2, IMG_Y8, yuy2_y8_sse2)
|| !register_conversion(IMG_UYVY, IMG_Y8, uyvy_y8_sse2)
|| !register_conversion(IMG_YVYU, IMG_Y8, yuy2_y8_sse2)
) {
return 0;
}
}
#endif /* HAVE_ASM_SSE2 */
return 1;
}
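/* Usage sketch (illustrative only, kept out of the build): once the
 * conversions above are registered, a caller goes through ac_imgconvert(),
 * whose signature can be seen in the wrappers at the top of this file.  The
 * buffer sizes follow the plane layouts assumed by these routines; the
 * allocation style and function name below are the example's own and assume
 * <stdlib.h>. */
#if 0
static void example_yuv420p_to_yuy2(int width, int height)
{
    /* YUV420P: full-size Y plane plus quarter-size U and V planes. */
    uint8_t *y   = malloc((size_t)width * height);
    uint8_t *u   = malloc((size_t)(width / 2) * (height / 2));
    uint8_t *v   = malloc((size_t)(width / 2) * (height / 2));
    /* YUY2: one packed plane, 2 bytes per pixel. */
    uint8_t *out = malloc((size_t)width * height * 2);
    uint8_t *src[3]  = { y, u, v };
    uint8_t *dest[1] = { out };

    if (y && u && v && out) {
        /* ...fill the source planes, then convert... */
        if (!ac_imgconvert(src, IMG_YUV420P, dest, IMG_YUY2, width, height)) {
            /* no conversion registered for this pair, or it failed */
        }
    }
    free(y); free(u); free(v); free(out);
}
#endif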
/*************************************************************************/
/*
* Local variables:
* c-file-style: "stroustrup"
* c-file-offsets: ((case-label . *) (statement-case-intro . *))
* indent-tabs-mode: nil
* End:
*
* vim: expandtab shiftwidth=4:
*/