Xorg: asm changes

ulab-next-nosound
Jay Sorg 10 years ago
parent 8b400ca0f3
commit 1956bc88e1

@ -1,15 +1,24 @@
; RGB to YUV
; 0.299 0.587 0.114
; -0.14713 -0.28886 0.436
; 0.615 -0.51499 -0.10001
; YUV to RGB
; 1 0 1.13983
; 1 -0.39465 -0.58060
; 1 2.03211 0
; shift left 12
; 4096 0 4669
; 4096 -1616 -2378
; 4096 9324 0
SECTION .data
align 16
c8 times 4 dd 8
c16 times 4 dd 16
c100 times 4 dd 100
c128 times 4 dd 128
c208 times 4 dd 208
c255 times 4 dd 255
c298 times 4 dd 298
c409 times 4 dd 409
c516 times 4 dd 516
c128 times 8 dw 128
c4669 times 8 dw 4669
c1616 times 8 dw 1616
c2378 times 8 dw 2378
c9324 times 8 dw 9324
SECTION .text
@ -19,136 +28,70 @@ SECTION .text
%1:
%endmacro
y1_do4:
; y
movd xmm0, [esi] ; 4 at a time
add esi, 4
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
punpcklwd xmm0, xmm6
movdqa xmm7, [c16]
psubd xmm0, xmm7
do8_uv:
; u
movd xmm1, [ebx] ; read 4 but only using 2
add ebx, 2
movd xmm1, [ebx] ; 4 at a time
add ebx, 4
punpcklbw xmm1, xmm1
pxor xmm6, xmm6
punpcklbw xmm1, xmm6
punpcklwd xmm1, xmm6
movdqa xmm7, [c128]
psubd xmm1, xmm7
psubw xmm1, xmm7
psllw xmm1, 4
; v
movd xmm2, [edx] ; read 4 but only using 2
add edx, 2
movd xmm2, [edx] ; 4 at a time
add edx, 4
punpcklbw xmm2, xmm2
punpcklbw xmm2, xmm6
punpcklwd xmm2, xmm6
psubd xmm2, xmm7
; t = (298 * c + 409 * e + 128) >> 8;
movdqa xmm3, [c298]
pmulld xmm3, xmm0
movdqa xmm4, [c409]
pmulld xmm4, xmm2
paddd xmm3, xmm4
paddd xmm3, xmm7
psrad xmm3, 8
; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
movdqa xmm4, [c298]
pmulld xmm4, xmm0
movdqa xmm5, [c100]
pmulld xmm5, xmm1
movdqa xmm6, [c208]
pmulld xmm6, xmm2
psubd xmm4, xmm5
psubd xmm4, xmm6
paddd xmm4, xmm7
psrad xmm4, 8
; t = (298 * c + 516 * d + 128) >> 8;
movdqa xmm5, [c298]
pmulld xmm5, xmm0
movdqa xmm6, [c516]
pmulld xmm6, xmm1
paddd xmm5, xmm6
paddd xmm5, xmm7
psrad xmm5, 8
packusdw xmm3, xmm3 ; b
packuswb xmm3, xmm3
packusdw xmm4, xmm4 ; g
packuswb xmm4, xmm4
punpcklbw xmm3, xmm4 ; gb
psubw xmm2, xmm7
psllw xmm2, 4
pxor xmm4, xmm4 ; a
packusdw xmm5, xmm5 ; b
packuswb xmm5, xmm5
punpcklbw xmm5, xmm4 ; ar
do8:
punpcklwd xmm3, xmm5 ; argb
movdqu [edi], xmm3
add edi, 16
ret;
y2_do4:
; y
movd xmm0, [esi] ; read 4 but only using 2
add esi, 4
movq xmm0, [esi] ; 8 at a time
add esi, 8
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
punpcklwd xmm0, xmm6
movdqa xmm7, [c16]
psubd xmm0, xmm7
movdqa xmm7, [c128]
; t = (298 * c + 409 * e + 128) >> 8;
movdqa xmm3, [c298]
pmulld xmm3, xmm0
movdqa xmm4, [c409]
pmulld xmm4, xmm2
paddd xmm3, xmm4
paddd xmm3, xmm7
psrad xmm3, 8
; t = (298 * c - 100 * d - 208 * e + 128) >> 8;
movdqa xmm4, [c298]
pmulld xmm4, xmm0
movdqa xmm5, [c100]
pmulld xmm5, xmm1
movdqa xmm6, [c208]
pmulld xmm6, xmm2
psubd xmm4, xmm5
psubd xmm4, xmm6
paddd xmm4, xmm7
psrad xmm4, 8
; t = (298 * c + 516 * d + 128) >> 8;
movdqa xmm5, [c298]
pmulld xmm5, xmm0
movdqa xmm6, [c516]
pmulld xmm6, xmm1
paddd xmm5, xmm6
paddd xmm5, xmm7
psrad xmm5, 8
packusdw xmm3, xmm3 ; b
packuswb xmm3, xmm3
packusdw xmm4, xmm4 ; g
packuswb xmm4, xmm4
; r = y + hiword(4669 * (v << 4))
movdqa xmm4, [c4669]
pmulhw xmm4, xmm2
movdqa xmm3, xmm0
paddw xmm3, xmm4
; g = y - hiword(1616 * (u << 4)) - hiword(2378 * (v << 4))
movdqa xmm5, [c1616]
pmulhw xmm5, xmm1
movdqa xmm6, [c2378]
pmulhw xmm6, xmm2
movdqa xmm4, xmm0
psubw xmm4, xmm5
psubw xmm4, xmm6
; b = y + hiword(9324 * (u << 4))
movdqa xmm6, [c9324]
pmulhw xmm6, xmm1
movdqa xmm5, xmm0
paddw xmm5, xmm6
packuswb xmm3, xmm3 ; b
packuswb xmm4, xmm4 ; g
punpcklbw xmm3, xmm4 ; gb
pxor xmm4, xmm4 ; a
packusdw xmm5, xmm5 ; b
packuswb xmm5, xmm5
packuswb xmm5, xmm5 ; r
punpcklbw xmm5, xmm4 ; ar
movdqa xmm4, xmm3
punpcklwd xmm3, xmm5 ; argb
movdqu [edi], xmm3
add edi, 16
punpckhwd xmm4, xmm5 ; argb
movdqu [edi], xmm4
add edi, 16
ret;
@ -201,14 +144,14 @@ PROC yv12_to_rgb32_x86_sse2
loop_y:
mov ecx, edx ; width
shr ecx, 2
shr ecx, 3
; save edx
mov [esp + 24], edx
prefetchnta 4096[esp + 0] ; y
prefetchnta 4096[esp + 8] ; u
prefetchnta 4096[esp + 12] ; v
prefetchnta 1024[esp + 8] ; u
prefetchnta 1024[esp + 12] ; v
loop_x:
@ -218,7 +161,7 @@ loop_x:
mov edi, [esp + 16] ; rgbs1
; y1
call y1_do4
call do8_uv
mov [esp + 0], esi ; y1
mov [esp + 16], edi ; rgbs1
@ -227,7 +170,7 @@ loop_x:
mov edi, [esp + 20] ; rgbs2
; y2
call y2_do4
call do8
mov [esp + 4], esi ; y2
mov [esp + 8], ebx ; u

Loading…
Cancel
Save