Xorg: asm changes

ulab-next-nosound
Jay Sorg 10 years ago
parent 8b400ca0f3
commit 1956bc88e1

@ -1,15 +1,24 @@
; RGB to YUV (rows: Y, U, V; columns: R, G, B)
; 0.299 0.587 0.114
; -0.14713 -0.28886 0.436
; 0.615 -0.51499 -0.10001
; YUV to RGB (rows: R, G, B; columns: Y, U, V)
; 1 0 1.13983
; 1 -0.39465 -0.58060
; 1 2.03211 0
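; shift left 12 (YUV to RGB coefficients multiplied by 4096 and rounded)
; 4096 0 4669
; 4096 -1616 -2378
; 4096 9324 0
; NOTE(review): 2.03211 * 4096 = 8323.5, so the blue/U coefficient would be
; expected to be 8324; 9324 (also used by the c9324 constant) looks like a
; typo and over-weights U in the blue channel - confirm before changing.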
SECTION .data SECTION .data
align 16 align 16
c8 times 4 dd 8 c128 times 8 dw 128
c16 times 4 dd 16 c4669 times 8 dw 4669
c100 times 4 dd 100 c1616 times 8 dw 1616
c128 times 4 dd 128 c2378 times 8 dw 2378
c208 times 4 dd 208 c9324 times 8 dw 9324
c255 times 4 dd 255
c298 times 4 dd 298
c409 times 4 dd 409
c516 times 4 dd 516
SECTION .text SECTION .text
@ -19,136 +28,70 @@ SECTION .text
%1: %1:
%endmacro %endmacro
y1_do4: do8_uv:
; y
movd xmm0, [esi] ; 4 at a time
add esi, 4
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
punpcklwd xmm0, xmm6
movdqa xmm7, [c16]
psubd xmm0, xmm7
; u ; u
movd xmm1, [ebx] ; read 4 but only using 2 movd xmm1, [ebx] ; 4 at a time
add ebx, 2 add ebx, 4
punpcklbw xmm1, xmm1 punpcklbw xmm1, xmm1
pxor xmm6, xmm6
punpcklbw xmm1, xmm6 punpcklbw xmm1, xmm6
punpcklwd xmm1, xmm6
movdqa xmm7, [c128] movdqa xmm7, [c128]
psubd xmm1, xmm7 psubw xmm1, xmm7
psllw xmm1, 4
; v ; v
movd xmm2, [edx] ; read 4 but only using 2 movd xmm2, [edx] ; 4 at a time
add edx, 2 add edx, 4
punpcklbw xmm2, xmm2 punpcklbw xmm2, xmm2
punpcklbw xmm2, xmm6 punpcklbw xmm2, xmm6
punpcklwd xmm2, xmm6 psubw xmm2, xmm7
psubd xmm2, xmm7 psllw xmm2, 4
; t = (298 * c + 409 * e + 128) >> 8;
movdqa xmm3, [c298]
pmulld xmm3, xmm0
movdqa xmm4, [c409]
pmulld xmm4, xmm2
paddd xmm3, xmm4
paddd xmm3, xmm7
psrad xmm3, 8
; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
movdqa xmm4, [c298]
pmulld xmm4, xmm0
movdqa xmm5, [c100]
pmulld xmm5, xmm1
movdqa xmm6, [c208]
pmulld xmm6, xmm2
psubd xmm4, xmm5
psubd xmm4, xmm6
paddd xmm4, xmm7
psrad xmm4, 8
; t = (298 * c + 516 * d + 128) >> 8;
movdqa xmm5, [c298]
pmulld xmm5, xmm0
movdqa xmm6, [c516]
pmulld xmm6, xmm1
paddd xmm5, xmm6
paddd xmm5, xmm7
psrad xmm5, 8
packusdw xmm3, xmm3 ; b
packuswb xmm3, xmm3
packusdw xmm4, xmm4 ; g
packuswb xmm4, xmm4
punpcklbw xmm3, xmm4 ; gb
pxor xmm4, xmm4 ; a do8:
packusdw xmm5, xmm5 ; b
packuswb xmm5, xmm5
punpcklbw xmm5, xmm4 ; ar
punpcklwd xmm3, xmm5 ; argb
movdqu [edi], xmm3
add edi, 16
ret;
y2_do4:
; y ; y
movd xmm0, [esi] ; read 4 but only using 2 movq xmm0, [esi] ; 8 at a time
add esi, 4 add esi, 8
pxor xmm6, xmm6 pxor xmm6, xmm6
punpcklbw xmm0, xmm6 punpcklbw xmm0, xmm6
punpcklwd xmm0, xmm6
movdqa xmm7, [c16]
psubd xmm0, xmm7
movdqa xmm7, [c128]
; t = (298 * c + 409 * e + 128) >> 8; ; r = y + hiword(4669 * (v << 4))
movdqa xmm3, [c298] movdqa xmm4, [c4669]
pmulld xmm3, xmm0 pmulhw xmm4, xmm2
movdqa xmm4, [c409] movdqa xmm3, xmm0
pmulld xmm4, xmm2 paddw xmm3, xmm4
paddd xmm3, xmm4
paddd xmm3, xmm7 ; g = y - hiword(1616 * (u << 4)) - hiword(2378 * (v << 4))
psrad xmm3, 8 movdqa xmm5, [c1616]
pmulhw xmm5, xmm1
; t = (298 * c - 100 * d - 208 * e + 128) >> 8; movdqa xmm6, [c2378]
movdqa xmm4, [c298] pmulhw xmm6, xmm2
pmulld xmm4, xmm0 movdqa xmm4, xmm0
movdqa xmm5, [c100] psubw xmm4, xmm5
pmulld xmm5, xmm1 psubw xmm4, xmm6
movdqa xmm6, [c208]
pmulld xmm6, xmm2 ; b = y + hiword(9324 * (u << 4))
psubd xmm4, xmm5 movdqa xmm6, [c9324]
psubd xmm4, xmm6 pmulhw xmm6, xmm1
paddd xmm4, xmm7 movdqa xmm5, xmm0
psrad xmm4, 8 paddw xmm5, xmm6
; t = (298 * c + 516 * d + 128) >> 8; packuswb xmm3, xmm3 ; b
movdqa xmm5, [c298] packuswb xmm4, xmm4 ; g
pmulld xmm5, xmm0
movdqa xmm6, [c516]
pmulld xmm6, xmm1
paddd xmm5, xmm6
paddd xmm5, xmm7
psrad xmm5, 8
packusdw xmm3, xmm3 ; b
packuswb xmm3, xmm3
packusdw xmm4, xmm4 ; g
packuswb xmm4, xmm4
punpcklbw xmm3, xmm4 ; gb punpcklbw xmm3, xmm4 ; gb
pxor xmm4, xmm4 ; a pxor xmm4, xmm4 ; a
packusdw xmm5, xmm5 ; b packuswb xmm5, xmm5 ; r
packuswb xmm5, xmm5
punpcklbw xmm5, xmm4 ; ar punpcklbw xmm5, xmm4 ; ar
movdqa xmm4, xmm3
punpcklwd xmm3, xmm5 ; argb punpcklwd xmm3, xmm5 ; argb
movdqu [edi], xmm3 movdqu [edi], xmm3
add edi, 16 add edi, 16
punpckhwd xmm4, xmm5 ; argb
movdqu [edi], xmm4
add edi, 16
ret; ret;
@ -201,14 +144,14 @@ PROC yv12_to_rgb32_x86_sse2
loop_y: loop_y:
mov ecx, edx ; width mov ecx, edx ; width
shr ecx, 2 shr ecx, 3
; save edx ; save edx
mov [esp + 24], edx mov [esp + 24], edx
prefetchnta 4096[esp + 0] ; y prefetchnta 4096[esp + 0] ; y
prefetchnta 4096[esp + 8] ; u prefetchnta 1024[esp + 8] ; u
prefetchnta 4096[esp + 12] ; v prefetchnta 1024[esp + 12] ; v
loop_x: loop_x:
@ -218,7 +161,7 @@ loop_x:
mov edi, [esp + 16] ; rgbs1 mov edi, [esp + 16] ; rgbs1
; y1 ; y1
call y1_do4 call do8_uv
mov [esp + 0], esi ; y1 mov [esp + 0], esi ; y1
mov [esp + 16], edi ; rgbs1 mov [esp + 16], edi ; rgbs1
@ -227,7 +170,7 @@ loop_x:
mov edi, [esp + 20] ; rgbs2 mov edi, [esp + 20] ; rgbs2
; y2 ; y2
call y2_do4 call do8
mov [esp + 4], esi ; y2 mov [esp + 4], esi ; y2
mov [esp + 8], ebx ; u mov [esp + 8], ebx ; u

Loading…
Cancel
Save