Xorg: amd64 sse2 yv12, i420 to rgb working now, other minor changes

ulab-next-nosound
Jay Sorg 10 years ago
parent 62481510f7
commit c3236e48a4

@ -17,7 +17,7 @@ OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
amd64 asm files amd64 asm functions
*/ */

@ -1,3 +1,47 @@
;
;Copyright 2014 Jay Sorg
;
;Permission to use, copy, modify, distribute, and sell this software and its
;documentation for any purpose is hereby granted without fee, provided that
;the above copyright notice appear in all copies and that both that
;copyright notice and this permission notice appear in supporting
;documentation.
;
;The above copyright notice and this permission notice shall be included in
;all copies or substantial portions of the Software.
;
;THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
;IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
;FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
;OPEN GROUP BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
;AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
;CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
;
;I420 to RGB32
;amd64 SSE2 32 bit
;
; RGB to YUV
; 0.299 0.587 0.114
; -0.14713 -0.28886 0.436
; 0.615 -0.51499 -0.10001
; YUV to RGB
; 1 0 1.13983
; 1 -0.39465 -0.58060
; 1 2.03211 0
; shift left 12
; 4096 0 4669
; 4096 -1616 -2378
; 4096 9324 0
SECTION .data
align 16
c128 times 8 dw 128
c4669 times 8 dw 4669
c1616 times 8 dw 1616
c2378 times 8 dw 2378
c9324 times 8 dw 9324
SECTION .text
%macro PROC 1 %macro PROC 1
align 16 align 16
@ -5,13 +49,200 @@
%1: %1:
%endmacro %endmacro
do8_uv:
; v
movd xmm1, [rbx] ; 4 at a time
lea rbx, [rbx + 4]
punpcklbw xmm1, xmm1
pxor xmm6, xmm6
punpcklbw xmm1, xmm6
movdqa xmm7, [rel c128]
psubw xmm1, xmm7
psllw xmm1, 4
; v
movd xmm2, [rdx] ; 4 at a time
lea rdx, [rdx + 4]
punpcklbw xmm2, xmm2
punpcklbw xmm2, xmm6
psubw xmm2, xmm7
psllw xmm2, 4
do8:
; y
movq xmm0, [rsi] ; 8 at a time
lea rsi, [rsi + 8]
pxor xmm6, xmm6
punpcklbw xmm0, xmm6
; r = y + hiword(4669 * (v << 4))
movdqa xmm4, [rel c4669]
pmulhw xmm4, xmm1
movdqa xmm3, xmm0
paddw xmm3, xmm4
; g = y - hiword(1616 * (u << 4)) - hiword(2378 * (v << 4))
movdqa xmm5, [rel c1616]
pmulhw xmm5, xmm2
movdqa xmm6, [rel c2378]
pmulhw xmm6, xmm1
movdqa xmm4, xmm0
psubw xmm4, xmm5
psubw xmm4, xmm6
; b = y + hiword(9324 * (u << 4))
movdqa xmm6, [rel c9324]
pmulhw xmm6, xmm2
movdqa xmm5, xmm0
paddw xmm5, xmm6
packuswb xmm3, xmm3 ; b
packuswb xmm4, xmm4 ; g
punpcklbw xmm3, xmm4 ; gb
pxor xmm4, xmm4 ; a
packuswb xmm5, xmm5 ; r
punpcklbw xmm5, xmm4 ; ar
movdqa xmm4, xmm3
punpcklwd xmm3, xmm5 ; argb
movdqa [rdi], xmm3
lea rdi, [rdi + 16]
punpckhwd xmm4, xmm5 ; argb
movdqa [rdi], xmm4
lea rdi, [rdi + 16]
ret;
;The first six integer or pointer arguments are passed in registers
; RDI, RSI, RDX, RCX, R8, and R9
;int ;int
;i420_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs) ;i420_to_rgb32_amd64_sse2(unsigned char *yuvs, int width, int height, int *rgbs)
PROC i420_to_rgb32_amd64_sse2 PROC i420_to_rgb32_amd64_sse2
push rbx push rbx
push rsi
push rdi
push rbp
push rdi
push rdx
mov rdi, rcx ; rgbs
mov rcx, rsi ; width
mov rdx, rcx
pop rbp ; height
mov rax, rbp
shr rbp, 1
imul rax, rcx ; rax = width * height
pop rsi ; y
mov rbx, rsi ; u = y + width * height
add rbx, rax
; local vars
; char* yptr1
; char* yptr2
; char* uptr
; char* vptr
; int* rgbs1
; int* rgbs2
; int width
sub rsp, 56 ; local vars, 56 bytes
mov [rsp + 0], rsi ; save y1
add rsi, rdx
mov [rsp + 8], rsi ; save y2
mov [rsp + 16], rbx ; save u
shr rax, 2
add rbx, rax ; v = u + (width * height / 4)
mov [rsp + 24], rbx ; save v
mov [rsp + 32], rdi ; save rgbs1
mov rax, rdx
shl rax, 2
add rdi, rax
mov [rsp + 40], rdi ; save rgbs2
loop_y:
mov rcx, rdx ; width
shr rcx, 3
; save rdx
mov [rsp + 48], rdx
prefetchnta 4096[rsp + 0] ; y
prefetchnta 1024[rsp + 16] ; u
prefetchnta 1024[rsp + 24] ; v
loop_x:
mov rsi, [rsp + 0] ; y1
mov rbx, [rsp + 16] ; u
mov rdx, [rsp + 24] ; v
mov rdi, [rsp + 32] ; rgbs1
; y1
call do8_uv
mov [rsp + 0], rsi ; y1
mov [rsp + 32], rdi ; rgbs1
mov rsi, [rsp + 8] ; y2
mov rdi, [rsp + 40] ; rgbs2
; y2
call do8
mov [rsp + 8], rsi ; y2
mov [rsp + 16], rbx ; u
mov [rsp + 24], rdx ; v
mov [rsp + 40], rdi ; rgbs2
dec rcx ; width
jnz loop_x
; restore rdx
mov rdx, [rsp + 48]
; update y1 and 2
mov rax, [rsp + 0]
mov rbx, rdx
add rax, rbx
mov [rsp + 0], rax
mov rax, [rsp + 8]
add rax, rbx
mov [rsp + 8], rax
; update rgb1 and 2
mov rax, [rsp + 32]
mov rbx, rdx
shl rbx, 2
add rax, rbx
mov [rsp + 32], rax
mov rax, [rsp + 40]
add rax, rbx
mov [rsp + 40], rax
mov rcx, rbp
dec rcx ; height
mov rbp, rcx
jnz loop_y
add rsp, 56
mov rax, 0 mov rax, 0
pop rbp
pop rdi
pop rsi
pop rbx pop rbx
ret ret
align 16 align 16

@ -129,11 +129,12 @@ PROC yv12_to_rgb32_amd64_sse2
push rbp push rbp
push rdi push rdi
push rdx
mov rdi, rcx ; rgbs mov rdi, rcx ; rgbs
mov rcx, rsi ; width mov rcx, rsi ; width
mov rdx, rcx mov rdx, rcx
mov rbp, rdx ; height pop rbp ; height
mov rax, rbp mov rax, rbp
shr rbp, 1 shr rbp, 1
imul rax, rcx ; rax = width * height imul rax, rcx ; rax = width * height

@ -423,8 +423,8 @@ stretch_RGB32_RGB32(int *src, int src_width, int src_height,
{ {
ih -= 1 << 16; ih -= 1 << 16;
src32++; src32++;
pix = *src32;
} }
pix = *src32;
ih += oh; ih += oh;
dst32++; dst32++;
} }
@ -445,11 +445,11 @@ stretch_RGB32_RGB32(int *src, int src_width, int src_height,
/******************************************************************************/ /******************************************************************************/
/* returns error */ /* returns error */
static CARD32 static CARD32
rdpDeferredXv(OsTimerPtr timer, CARD32 now, pointer arg) rdpDeferredXvCleanup(OsTimerPtr timer, CARD32 now, pointer arg)
{ {
rdpPtr dev; rdpPtr dev;
LLOGLN(0, ("rdpDeferredXv:")); LLOGLN(0, ("rdpDeferredXvCleanup:"));
dev = (rdpPtr) arg; dev = (rdpPtr) arg;
dev->xv_timer_schedualed = 0; dev->xv_timer_schedualed = 0;
dev->xv_data_bytes = 0; dev->xv_data_bytes = 0;
@ -476,19 +476,21 @@ xrdpVidPutImage(ScrnInfoPtr pScrn,
int error; int error;
GCPtr tempGC; GCPtr tempGC;
LLOGLN(10, ("xrdpVidPutImage:")); LLOGLN(10, ("xrdpVidPutImage: format 0x%8.8x", format));
LLOGLN(10, ("xrdpVidPutImage: src_x %d srcy_y %d", src_x, src_y)); LLOGLN(10, ("xrdpVidPutImage: src_x %d srcy_y %d", src_x, src_y));
dev = XRDPPTR(pScrn); dev = XRDPPTR(pScrn);
if (dev->xv_timer_schedualed) if (dev->xv_timer_schedualed)
{ {
TimerCancel(dev->xv_timer); TimerCancel(dev->xv_timer);
dev->xv_timer = TimerSet(dev->xv_timer, 0, 2000, rdpDeferredXv, dev); dev->xv_timer = TimerSet(dev->xv_timer, 0, 2000,
rdpDeferredXvCleanup, dev);
} }
else else
{ {
dev->xv_timer_schedualed = 1; dev->xv_timer_schedualed = 1;
dev->xv_timer = TimerSet(dev->xv_timer, 0, 2000, rdpDeferredXv, dev); dev->xv_timer = TimerSet(dev->xv_timer, 0, 2000,
rdpDeferredXvCleanup, dev);
} }
index = width * height * 4 + drw_w * drw_h * 4 + 64; index = width * height * 4 + drw_w * drw_h * 4 + 64;

@ -140,12 +140,12 @@ PROC i420_to_rgb32_x86_sse2
add ebx, eax add ebx, eax
; local vars ; local vars
; char* yptr1; ; char* yptr1
; char* yptr2; ; char* yptr2
; char* uptr; ; char* uptr
; char* vptr; ; char* vptr
; int* rgbs1; ; int* rgbs1
; int* rgbs2; ; int* rgbs2
; int width ; int width
sub esp, 28 ; local vars, 28 bytes sub esp, 28 ; local vars, 28 bytes
mov [esp + 0], esi ; save y1 mov [esp + 0], esi ; save y1

Loading…
Cancel
Save