segment .data public class=DATA use32 flat

; Constant operands for the MMX kernels in this file. Each entry is one
; 64-bit MMX memory operand written as two dwords.
;
; Colour factors are signed 16-bit words replicated four times.
; _yuv2rgb_proc shifts its inputs left by 3 and multiplies with pmulhw
; (which keeps the upper 16 bits of each product), so a factor c acts as a
; gain of c * 8 / 65536 = c / 8192.

mmx_U_green  dd 0x0f37df37d, 0x0f37df37d   ; 0xf37d = -3203 -> about -0.391 (U into green)
mmx_V_green  dd 0x0e5fce5fc, 0x0e5fce5fc   ; 0xe5fc = -6660 -> about -0.813 (V into green)
mmx_U_blue   dd 0x040934093, 0x040934093   ; 0x4093 = 16531 -> about  2.018 (U into blue)
mmx_V_red    dd 0x033123312, 0x033123312   ; 0x3312 = 13074 -> about  1.596 (V into red)
mmx_00ffw    dd 0x000ff00ff, 0x000ff00ff   ; keeps the low byte of every word
mmx_Y_coeff  dd 0x0253f253f, 0x0253f253f   ; 0x253f =  9535 -> about  1.164 (luma gain)
mmx_80w      dd 0x000800080, 0x000800080   ; 128 per word, subtracted from U and V

; Per-byte luma offsets applied with psubusb / paddusb. These two entries are
; NOT constant: _yuv2rgb_set_gamma_mmx rewrites both at run time. The initial
; values subtract 16 from every luma byte and add nothing.
mmx_subYw    dd 0x010101010, 0x010101010
mmx_addYw    dd 0x000000000, 0x000000000

; Per-byte masks used by the 16-bit packers: top 5 bits and top 6 bits.
mmx_redmask  dd 0xf8f8f8f8, 0xf8f8f8f8
mmx_grnmask  dd 0xfcfcfcfc, 0xfcfcfcfc

; Byte-select masks used by _yuv2rgb_24_mmx2 (byte 0 = lowest address).
M24A         dd 0xff0000ff,  0x00ff0000    ; bytes 0, 3, 6
M24B         dd 0x0000ff00,  0xff0000ff    ; bytes 1, 4, 7
M24C         dd 0x00ff0000,  0x0000ff00    ; bytes 2, 5

segment .text public class=CODE use32 flat

; YUV -> RGB helpers. _yuv2rgb_pxor, _yuv2rgb_load and _yuv2rgb_proc pass
; their data in MMX registers rather than through memory, so they only make
; sense when called back to back with one of the _yuv2rgb_16_* / _yuv2rgb_24_*
; packers. _emms must be called before any x87 code runs afterwards.
; NOTE(review): the leading underscore suggests these are C symbols on a
; target that prefixes them; the callers are not part of this file.
global _yuv2rgb_pxor
global _yuv2rgb_load
global _yuv2rgb_proc
global _emms
global _yuv2rgb_16_mmx
global _yuv2rgb_16_mmx2
global _yuv2rgb_24_mmx
global _yuv2rgb_24_mmx2

; Self-contained planar -> packed converters (loop inside the routine).
global _yv12toyuy2_proc_mmx
global _yv12toyuy2_proc_mmx2

; Rewrites the luma offset tables mmx_subYw / mmx_addYw.
global _yuv2rgb_set_gamma_mmx


; Stack-argument offsets, relative to ebp after the usual prologue:
;
;     push ebp
;     mov  ebp, esp
;
; At that point [ebp+0] is the saved ebp and [ebp+4] the return address, so
; the dword arguments start at [ebp+8]:
;
;     [ebp+8]    argument 0
;     [ebp+0Ch]  argument 1
;     [ebp+10h]  argument 2
;     [ebp+14h]  argument 3
;     [ebp+18h]  argument 4
;
; The matching epilogue is "pop ebp" followed by "ret"; no routine in this
; file removes its arguments from the stack.

arg0 EQU 8
arg1 EQU 12
arg2 EQU 16
arg3 EQU 20
arg4 EQU 24
arg5 EQU 28
arg6 EQU 32

align 16

; _yuv2rgb_pxor
; Clears mm4. _yuv2rgb_proc uses mm4 as the zero operand when it widens the
; chroma bytes to words, so mm4 has to be zero before the first call to it.
; Takes no arguments and changes no other register.
_yuv2rgb_pxor:

    pxor mm4, mm4

    ret

align 16

; _yuv2rgb_load
; Fetches the first group of samples into the registers _yuv2rgb_proc reads.
;
; Stack arguments (dword pointers, caller cleans up):
;   arg0  source of 8 bytes -> mm6 (the register _yuv2rgb_proc scales as luma)
;   arg1  source of 4 bytes -> low dword of mm0 (treated as U by _yuv2rgb_proc)
;   arg2  source of 4 bytes -> low dword of mm1 (treated as V by _yuv2rgb_proc)
;
; movd clears the upper dword of mm0 and mm1. All general-purpose registers
; are preserved.
_yuv2rgb_load:

    push eax                    ; eax is the only scratch register used

    ; No frame pointer: with exactly one dword pushed, esp+argN addresses the
    ; same stack slot that ebp+argN would address after "push ebp / mov ebp, esp"
    ; ([esp] = saved eax, [esp+4] = return address, [esp+8] = argument 0).
    mov eax, [esp+arg0]
    movq mm6, [eax]
    mov eax, [esp+arg1]
    movd mm0, [eax]
    mov eax, [esp+arg2]
    movd mm1, [eax]

    pop eax

    ret

align 16

; _yuv2rgb_proc
; Converts one group of 8 pixels from YUV to three byte vectors. There are no
; stack arguments; everything is passed in MMX registers.
;
; In:   mm0 = 4 U bytes in the low dword (upper dword is ignored)
;       mm1 = 4 V bytes in the low dword (upper dword is ignored)
;       mm6 = 8 luma bytes
;       mm4 = 0
; Out:  mm0 = 8 blue bytes, mm1 = 8 red bytes, mm2 = 8 green bytes,
;       each in pixel order (byte 0 = first pixel)
; Clobbers: mm3, mm4, mm5, mm6, mm7. mm4 is NOT zero on return, so it must be
;       cleared again before the next call.
;
; Every chroma sample is used for two neighbouring pixels: chroma word i is
; added to luma bytes 2i and 2i+1.
_yuv2rgb_proc:

    punpcklbw mm0, mm4              ; U bytes -> 4 words (zero-extended via mm4 = 0)
    punpcklbw mm1, mm4              ; V bytes -> 4 words
    psubsw mm0, [mmx_80w]           ; U - 128
    psubsw mm1, [mmx_80w]           ; V - 128
    psllw mm0, 3                    ; pre-scale by 8 so pmulhw leaves factor / 8192
    psllw mm1, 3
    movq mm2, mm0                   ; mm2 = U copy for the green term
    movq mm3, mm1                   ; mm3 = V copy for the green term
    pmulhw mm2, [mmx_U_green]
    pmulhw mm3, [mmx_V_green]
    pmulhw mm0, [mmx_U_blue]        ; mm0 = blue chroma term
    pmulhw mm1, [mmx_V_red]         ; mm1 = red chroma term
    paddsw mm2, mm3                 ; mm2 = green chroma term (U part + V part)
    psubusb mm6, [mmx_subYw]        ; luma offset, saturating at 0 ...
    paddusb mm6, [mmx_addYw]        ; ... and at 255 (tables set by _yuv2rgb_set_gamma_mmx)
    movq mm7, mm6
    pand mm6, [mmx_00ffw]           ; mm6 = luma of pixels 0,2,4,6 as words
    psrlw mm7, 8                    ; mm7 = luma of pixels 1,3,5,7 as words
    psllw mm6, 3
    psllw mm7, 3
    pmulhw mm6, [mmx_Y_coeff]       ; scaled luma, even pixels
    pmulhw mm7, [mmx_Y_coeff]       ; scaled luma, odd pixels
    movq mm3, mm0                   ; duplicate each chroma term: one copy for the
    movq mm4, mm1                   ; even pixels, one for the odd pixels
    movq mm5, mm2                   ; (this is where mm4 stops being zero)
    paddsw mm0, mm6                 ; blue,  even pixels
    paddsw mm3, mm7                 ; blue,  odd pixels
    paddsw mm1, mm6                 ; red,   even pixels
    paddsw mm4, mm7                 ; red,   odd pixels
    paddsw mm2, mm6                 ; green, even pixels
    paddsw mm5, mm7                 ; green, odd pixels
    packuswb mm0, mm0               ; clamp every word to 0..255 and narrow to bytes
    packuswb mm1, mm1
    packuswb mm2, mm2
    packuswb mm3, mm3
    packuswb mm4, mm4
    packuswb mm5, mm5
    punpcklbw mm0, mm3              ; interleave even/odd -> 8 blue bytes in pixel order
    punpcklbw mm1, mm4              ; 8 red bytes
    punpcklbw mm2, mm5              ; 8 green bytes

    ret

align 16

; _yuv2rgb_16_mmx
; Packs the 8 pixels left in mm0/mm1/mm2 by _yuv2rgb_proc into eight 16-bit
; words, stores them, and fetches the next group of source samples.
;
; Register input:  mm0 = 8 blue bytes, mm1 = 8 red bytes, mm2 = 8 green bytes
; Stack arguments: arg0 = luma pointer, arg1 = U pointer, arg2 = V pointer,
;                  arg3 = destination pointer (16 bytes are written)
; Register output: mm6 = 8 bytes from [arg0+8], mm0 = 4 bytes from [arg1+4],
;                  mm1 = 4 bytes from [arg2+4], mm4 = 0
; Clobbers: mm2, mm5, mm7. General-purpose registers are preserved.
;
; Each output word is (red & F8h) << 8 | (green & FCh) << 3 | blue >> 3.
; The pointers are only read, never advanced; stepping them is up to the caller.
_yuv2rgb_16_mmx:

    push ebp
    mov ebp, esp
    push eax

    ; Drop the low bits of every channel: 5 bits blue/red, 6 bits green.
    pand mm0, [mmx_redmask]
    pand mm1, [mmx_redmask]
    pand mm2, [mmx_grnmask]
    psrlw mm0, 3                    ; masked low bits are 0, so this is a per-byte >> 3
    pxor mm4, mm4                   ; zero for widening green; stays 0 for the next proc call

    ; Keep blue and green copies for pixels 4-7.
    movq mm5, mm0
    movq mm7, mm2

    ; Pixels 0-3 -> mm0
    punpcklbw mm0, mm1              ; word = red : blue
    punpcklbw mm2, mm4              ; green bytes -> words
    psllw mm2, 3                    ; green into bits 5..10
    por mm0, mm2

    ; Pixels 4-7 -> mm5
    punpckhbw mm5, mm1
    punpckhbw mm7, mm4
    psllw mm7, 3
    por mm5, mm7

    ; Memory traffic: load luma, store low half, load U, load V, store high half.
    mov eax, [ebp+arg0]
    movq mm6, [eax+8]
    mov eax, [ebp+arg3]
    movq [eax], mm0
    mov eax, [ebp+arg1]
    movd mm0, [eax+4]
    mov eax, [ebp+arg2]
    movd mm1, [eax+4]
    mov eax, [ebp+arg3]
    movq [eax+8], mm5

    pop eax
    pop ebp

    ret

align 16

; _yuv2rgb_16_mmx2
; Same job as the plain-MMX 16-bit packer, but the two 8-byte results are
; written with movntq (non-temporal store) instead of movq.
;
; Register input:  mm0 = 8 blue bytes, mm1 = 8 red bytes, mm2 = 8 green bytes
; Stack arguments: arg0 = luma pointer, arg1 = U pointer, arg2 = V pointer,
;                  arg3 = destination pointer (16 bytes are written)
; Register output: mm6 = 8 bytes from [arg0+8], mm0 = 4 bytes from [arg1+4],
;                  mm1 = 4 bytes from [arg2+4], mm4 = 0
; Clobbers: mm2, mm5, mm7. General-purpose registers are preserved.
;
; Each output word is (red & F8h) << 8 | (green & FCh) << 3 | blue >> 3.
; NOTE(review): no sfence is issued here; if the stores must be ordered before
; later accesses, that has to happen in the caller - confirm.
_yuv2rgb_16_mmx2:

    push ebp
    mov ebp, esp
    push eax

    ; Drop the low bits of every channel: 5 bits blue/red, 6 bits green.
    pand mm0, [mmx_redmask]
    pand mm1, [mmx_redmask]
    pand mm2, [mmx_grnmask]
    psrlw mm0, 3                    ; masked low bits are 0, so this is a per-byte >> 3
    pxor mm4, mm4                   ; zero for widening green; stays 0 for the next proc call

    ; Keep blue and green copies for pixels 4-7.
    movq mm5, mm0
    movq mm7, mm2

    ; Pixels 0-3 -> mm0
    punpcklbw mm0, mm1              ; word = red : blue
    punpcklbw mm2, mm4              ; green bytes -> words
    psllw mm2, 3                    ; green into bits 5..10
    por mm0, mm2

    ; Pixels 4-7 -> mm5
    punpckhbw mm5, mm1
    punpckhbw mm7, mm4
    psllw mm7, 3
    por mm5, mm7

    ; Memory traffic: load luma, store low half, load U, load V, store high half.
    mov eax, [ebp+arg0]
    movq mm6, [eax+8]
    mov eax, [ebp+arg3]
    movntq [eax], mm0
    mov eax, [ebp+arg1]
    movd mm0, [eax+4]
    mov eax, [ebp+arg2]
    movd mm1, [eax+4]
    mov eax, [ebp+arg3]
    movntq [eax+8], mm5

    pop eax
    pop ebp

    ret

align 16

; _yuv2rgb_24_mmx
; Packs the 8 pixels left in mm0/mm1/mm2 by _yuv2rgb_proc into 24 bytes of
; 3-byte pixels, stores them, and fetches the next group of source samples.
;
; Register input:  mm0 = 8 blue bytes, mm1 = 8 red bytes, mm2 = 8 green bytes
; Stack arguments: arg0 = luma pointer, arg1 = U pointer, arg2 = V pointer,
;                  arg3 = destination pointer (24 bytes are written)
; Register output: mm6 = 8 bytes from [arg0+8], mm0 = 4 bytes from [arg1+4],
;                  mm1 = 4 bytes from [arg2+4], mm4 = 0
; Clobbers: mm2, mm3, mm5, mm7. General-purpose registers are preserved.
;
; Byte order in memory is blue, green, red for every pixel. In the layout
; comments below, byte 0 (lowest address) is written first.
_yuv2rgb_24_mmx:

    push ebp
    mov ebp, esp
    push eax

    pxor mm4, mm4                   ; zero filler for the fourth byte of each pixel
    movq mm5, mm0                   ; blue copy for pixels 4-7
    movq mm6, mm1                   ; red copy for pixels 4-7
    punpcklbw mm0, mm2              ; B0 G0 B1 G1 B2 G2 B3 G3
    punpcklbw mm1, mm4              ; R0 0  R1 0  R2 0  R3 0
    punpckhbw mm5, mm2              ; B4 G4 ... B7 G7
    punpckhbw mm6, mm4              ; R4 0  ... R7 0
    movq mm7, mm0
    movq mm3, mm5
    punpcklwd mm7, mm1              ; mm7 = B0 G0 R0 0 B1 G1 R1 0  (pixels 0,1)
    punpckhwd mm0, mm1              ; mm0 = pixels 2,3
    punpcklwd mm5, mm6              ; mm5 = pixels 4,5
    punpckhwd mm3, mm6              ; mm3 = pixels 6,7
    movq mm2, mm7
    movq mm6, mm0
    movq mm1, mm5
    movq mm4, mm3
    psllq mm7, 28h                  ; shift + punpckhdq squeezes the filler byte out
    psllq mm0, 28h                  ; from between the two pixels of each register:
    psllq mm5, 28h
    psllq mm3, 28h
    punpckhdq mm7, mm2              ; each register is now 0 B G R B G R 0
    punpckhdq mm0, mm6
    punpckhdq mm5, mm1
    punpckhdq mm3, mm4
    psrlq mm7, 8                    ; B0 G0 R0 B1 G1 R1 0 0
    movq mm6, mm0                   ; keep pixels 2,3 for the second store
    psllq mm0, 28h                  ; 0 0 0 0 0 0 B2 G2
    por	mm7, mm0                    ; output bytes 0-7
    mov eax, [ebp+arg3]
    movq [eax], mm7
    mov eax, [ebp+arg1]
    movd mm0, [eax+4]               ; mm0 is free now: preload the next 4 U bytes
    psrlq mm6, 18h                  ; R2 B3 G3 R3 0 0 0 0
    movq mm1, mm5                   ; keep pixels 4,5 for the third store
    psllq mm5, 18h                  ; 0 0 0 0 B4 G4 R4 B5
    por	mm6, mm5                    ; output bytes 8-15
    mov eax, [ebp+arg3]
    movq [eax+8], mm6
    mov eax, [ebp+arg0]
    movq mm6, [eax+8]               ; preload the next 8 luma bytes
    psrlq mm1, 28h                  ; G5 R5 0 0 0 0 0 0
    psllq mm3, 8                    ; 0 0 B6 G6 R6 B7 G7 R7
    por	mm1, mm3                    ; output bytes 16-23
    mov eax, [ebp+arg3]
    movq [eax+10h], mm1
    mov eax, [ebp+arg2]
    movd mm1, [eax+4]               ; preload the next 4 V bytes
    pxor mm4, mm4                   ; leave mm4 = 0 for the next _yuv2rgb_proc call

    pop eax
    pop ebp

    ret

align 16

; _yuv2rgb_24_mmx2
; Packs the 8 pixels left in mm0/mm1/mm2 by _yuv2rgb_proc into 24 bytes of
; 3-byte pixels using pshufw plus the M24A/M24B/M24C byte masks, writes them
; with movntq (non-temporal store), and fetches the next group of samples.
; pshufw and movntq are not part of the base MMX instruction set.
;
; Register input:  mm0 = 8 blue bytes, mm1 = 8 red bytes, mm2 = 8 green bytes
; Stack arguments: arg0 = luma pointer, arg1 = U pointer, arg2 = V pointer,
;                  arg3 = destination pointer (24 bytes are written)
; Register output: mm6 = 8 bytes from [arg0+8], mm0 = 4 bytes from [arg1+4],
;                  mm1 = 4 bytes from [arg2+4], mm4 = 0
; Clobbers: mm2, mm3, mm5, mm7. General-purpose registers are preserved.
;
; Byte order in memory is blue, green, red for every pixel. Each output qword
; is built by copying the needed source words into place with pshufw, keeping
; one byte position set per channel with a mask, and OR-ing the three together.
; NOTE(review): no sfence is issued here; if the stores must be ordered before
; later accesses, that has to happen in the caller - confirm.
_yuv2rgb_24_mmx2:

    push ebp
    mov ebp, esp
    push eax

    movq mm4, [M24A]                ; mm4 = mask for bytes 0,3,6
    movq mm7, [M24C]                ; mm7 = mask for bytes 2,5
    pshufw mm5, mm0, 50h            ; B0 B1 B0 B1 B2 B3 B2 B3
    pshufw mm3, mm2, 50h            ; G0 G1 G0 G1 G2 G3 G2 G3
    pshufw mm6, mm1, 0              ; R0 R1 repeated four times
    pand mm5, mm4                   ; B0 at byte 0, B1 at 3, B2 at 6
    pand mm3, mm4                   ; G0 at byte 0, G1 at 3, G2 at 6 ...
    pand mm6, mm7                   ; R0 at byte 2, R1 at 5
    psllq mm3, 8                    ; ... moved to bytes 1, 4, 7
    por mm6, mm5
    por mm6, mm3                    ; B0 G0 R0 B1 G1 R1 B2 G2
    mov eax, [ebp+arg3]
    movntq [eax], mm6               ; output bytes 0-7
    psrlq mm2, 8                    ; green shifted down one byte: G1 .. G7 0
    pshufw mm5, mm0, 0A5h           ; B2 B3 B2 B3 B4 B5 B4 B5
    pshufw mm3, mm2, 55h            ; G3 G4 repeated four times
    pshufw mm6, mm1, 0A5h           ; R2 R3 R2 R3 R4 R5 R4 R5
    pand mm5, [M24B]                ; B3 at byte 1, B4 at 4, B5 at 7
    pand mm3, mm7                   ; G3 at byte 2, G4 at 5
    pand mm6, mm4                   ; R2 at byte 0, R3 at 3, R4 at 6
    por mm3, mm5
    por mm6, mm3                    ; R2 B3 G3 R3 B4 G4 R4 B5
    movntq [eax+8], mm6             ; output bytes 8-15 (eax still holds arg3)
    pshufw mm5, mm0, 0FFh           ; B6 B7 repeated four times
    pshufw mm3, mm2, 0FAh           ; G5 G6 G5 G6 G7 0 G7 0  (from the shifted green)
    pshufw mm6, mm1, 0FAh           ; R4 R5 R4 R5 R6 R7 R6 R7
    mov eax, [ebp+arg1]
    movd mm0, [eax+4]               ; blue is no longer needed: preload the next 4 U bytes
    pand mm5, mm7                   ; B6 at byte 2, B7 at 5
    pand mm3, mm4                   ; G5 at byte 0, G6 at 3, G7 at 6
    pand mm6, [M24B]                ; R5 at byte 1, R6 at 4, R7 at 7
    mov eax, [ebp+arg2]
    movd mm1, [eax+4]               ; red is now in mm6: preload the next 4 V bytes
    por mm3, mm5
    por mm6, mm3                    ; G5 R5 B6 G6 R6 B7 G7 R7
    mov eax, [ebp+arg3]
    movntq [eax+10h], mm6           ; output bytes 16-23
    mov eax, [ebp+arg0]
    movq mm6, [eax+8]               ; preload the next 8 luma bytes
    pxor mm4, mm4                   ; leave mm4 = 0 for the next _yuv2rgb_proc call

    pop eax
    pop ebp

    ret

align 16

; _yv12toyuy2_proc_mmx
; Interleaves one run of planar luma and chroma samples into packed
; 4-bytes-per-2-pixels output (Y0 C0 Y1 C1, with C0 from arg2 and C1 from arg3).
;
; Stack arguments (caller cleans up):
;   arg0  destination pointer, 4 bytes written per chroma index
;   arg1  luma source pointer, 2 bytes read per chroma index
;   arg2  first chroma plane  (ends up in the byte after Y0)
;   arg3  second chroma plane (ends up in the byte after Y1)
;   arg4  number of chroma samples to convert, compared as unsigned
;
; Work is done in groups of 8 chroma samples (16 luma bytes in, 32 bytes out),
; so a count that is not a multiple of 8 is rounded UP: the buffers must be
; large enough for the last full group. A count of 0 converts nothing.
;
; Before interleaving, every luma byte gets the offset held in mmx_subYw /
; mmx_addYw (saturating subtract, then saturating add).
;
; Clobbers mm0-mm6 and the flags; all general-purpose registers are preserved.
; No emms is executed here.
_yv12toyuy2_proc_mmx:

    push ebp
    mov ebp, esp
    push eax
    push ebx
    push ecx
    push edx
    push edi

    ; Nothing to do for an empty run. Without this check the do-while loop
    ; below would still read and write one full group.
    cmp dword [ebp+arg4], 0
    je .done

    ; The plane pointers never change inside the loop, so fetch them once.
    mov edi, [ebp+arg0]             ; edi = destination
    mov ebx, [ebp+arg1]             ; ebx = luma
    mov ecx, [ebp+arg2]             ; ecx = first chroma plane
    mov edx, [ebp+arg3]             ; edx = second chroma plane
    xor eax, eax                    ; eax = chroma index, 0, 8, 16, ...

align 16

.next8:

    ; Interleave the two chroma planes: C0[0] C1[0] C0[1] C1[1] ...
    movq mm0, [ecx+eax]
    movq mm2, mm0
    movq mm1, [edx+eax]
    punpcklbw mm0, mm1              ; chroma pairs for luma bytes 0-7
    punpckhbw mm2, mm1              ; chroma pairs for luma bytes 8-15

    ; 16 luma bytes with the brightness offset applied.
    movq mm3, [ebx+eax*2]
    psubusb mm3, [mmx_subYw]
    paddusb mm3, [mmx_addYw]
    movq mm5, [ebx+eax*2+8]
    psubusb mm5, [mmx_subYw]
    paddusb mm5, [mmx_addYw]

    ; Interleave luma with the chroma pairs.
    movq mm4, mm3
    movq mm6, mm5
    punpcklbw mm3, mm0
    punpckhbw mm4, mm0
    punpcklbw mm5, mm2
    punpckhbw mm6, mm2

    movq [edi+eax*4], mm3
    movq [edi+eax*4+8], mm4
    movq [edi+eax*4+10h], mm5
    movq [edi+eax*4+18h], mm6

    add eax, 8
    cmp eax, [ebp+arg4]
    jb .next8                       ; unsigned: continue while index < count

.done:

    pop edi
    pop edx
    pop ecx
    pop ebx
    pop eax
    pop ebp

    ret

align 16

; _yv12toyuy2_proc_mmx2
; Interleaves one run of planar luma and chroma samples into packed
; 4-bytes-per-2-pixels output (Y0 C0 Y1 C1, with C0 from arg2 and C1 from arg3).
; Differs from the plain-MMX routine by prefetching the three source planes
; with prefetchnta and writing the result with movntq (non-temporal stores).
;
; Stack arguments (caller cleans up):
;   arg0  destination pointer, 4 bytes written per chroma index
;   arg1  luma source pointer, 2 bytes read per chroma index
;   arg2  first chroma plane  (ends up in the byte after Y0)
;   arg3  second chroma plane (ends up in the byte after Y1)
;   arg4  number of chroma samples to convert, compared as unsigned
;
; Work is done in groups of 8 chroma samples (16 luma bytes in, 32 bytes out),
; so a count that is not a multiple of 8 is rounded UP: the buffers must be
; large enough for the last full group. A count of 0 converts nothing.
;
; Before interleaving, every luma byte gets the offset held in mmx_subYw /
; mmx_addYw (saturating subtract, then saturating add).
;
; Clobbers mm0-mm6 and the flags; all general-purpose registers are preserved.
; No emms is executed here.
; NOTE(review): no sfence follows the movntq stores; if they must be ordered
; before later accesses, that has to happen in the caller - confirm.
_yv12toyuy2_proc_mmx2:

    push ebp
    mov ebp, esp
    push eax
    push ebx
    push ecx
    push edx
    push edi

    ; Nothing to do for an empty run. Without this check the do-while loop
    ; below would still read and write one full group.
    cmp dword [ebp+arg4], 0
    je .done

    ; One register per plane for the whole loop. This keeps the prefetch
    ; addresses tied to the source planes on every iteration; reusing a single
    ; scratch register for all pointers would leave the destination pointer in
    ; it when the luma prefetch is issued.
    mov edi, [ebp+arg0]             ; edi = destination
    mov ebx, [ebp+arg1]             ; ebx = luma
    mov ecx, [ebp+arg2]             ; ecx = first chroma plane
    mov edx, [ebp+arg3]             ; edx = second chroma plane
    xor eax, eax                    ; eax = chroma index, 0, 8, 16, ...

align 16

.next8:

    ; Hint the data 32 bytes ahead of the current read position of each plane.
    prefetchnta [ebx+eax*2+32]
    prefetchnta [ecx+eax+32]
    prefetchnta [edx+eax+32]

    ; Interleave the two chroma planes: C0[0] C1[0] C0[1] C1[1] ...
    movq mm0, [ecx+eax]
    movq mm2, mm0
    movq mm1, [edx+eax]
    punpcklbw mm0, mm1              ; chroma pairs for luma bytes 0-7
    punpckhbw mm2, mm1              ; chroma pairs for luma bytes 8-15

    ; 16 luma bytes with the brightness offset applied.
    movq mm3, [ebx+eax*2]
    psubusb mm3, [mmx_subYw]
    paddusb mm3, [mmx_addYw]
    movq mm5, [ebx+eax*2+8]
    psubusb mm5, [mmx_subYw]
    paddusb mm5, [mmx_addYw]

    ; Interleave luma with the chroma pairs.
    movq mm4, mm3
    movq mm6, mm5
    punpcklbw mm3, mm0
    punpckhbw mm4, mm0
    punpcklbw mm5, mm2
    punpckhbw mm6, mm2

    movntq [edi+eax*4], mm3
    movntq [edi+eax*4+8], mm4
    movntq [edi+eax*4+10h], mm5
    movntq [edi+eax*4+18h], mm6

    add eax, 8
    cmp eax, [ebp+arg4]
    jb .next8                       ; unsigned: continue while index < count

.done:

    pop edi
    pop edx
    pop ecx
    pop ebx
    pop eax
    pop ebp

    ret

align 16

; _emms
; Executes emms, which marks the MMX registers as empty so that x87
; floating-point code can use the shared register file again. None of the
; other routines in this file does this, so it has to be called once the MMX
; work is finished. Takes no arguments.
_emms:

    emms

    ret

align 16

; _yuv2rgb_set_gamma_mmx
; Rewrites the two per-byte luma offset tables, mmx_subYw and mmx_addYw, that
; the conversion kernels apply with psubusb / paddusb.
;
; Stack argument (caller cleans up):
;   arg0  signed level; despite the routine's name it is used as an additive
;         luma offset centred on 16
;
;   level <= 16 : mmx_subYw = (16 - level) in every byte, mmx_addYw = 0
;   level >  16 : mmx_addYw = (level - 16) in every byte, mmx_subYw = 0
;
; Only the low byte of the difference is replicated. The result is a clean
; byte pattern as long as the difference fits in 16 bits.
;
; The tables are plain globals, so kernels running concurrently would see the
; change. All general-purpose registers are preserved.
_yuv2rgb_set_gamma_mmx:

    push ebp
    mov ebp, esp
    push ebx

    mov ebx, [ebp+arg0]
    sub ebx, 16                     ; ebx = level - 16; flags as for "cmp level, 16"
    jg .brighter

    ; level <= 16: subtract (16 - level) from every luma byte, add nothing.
    neg ebx                         ; ebx = 16 - level
    mov bh, bl                      ; copy the low byte into byte 1 ...
    imul ebx, ebx, 0x10001          ; ... then ebx + (ebx << 16) fills the upper word
    mov [mmx_subYw], ebx
    mov [mmx_subYw+4], ebx
    mov dword [mmx_addYw], 0
    mov dword [mmx_addYw+4], 0
    jmp short .done

.brighter:

    ; level > 16: add (level - 16) to every luma byte, subtract nothing.
    mov bh, bl
    imul ebx, ebx, 0x10001
    mov [mmx_addYw], ebx
    mov [mmx_addYw+4], ebx
    mov dword [mmx_subYw], 0
    mov dword [mmx_subYw+4], 0

.done:

    pop ebx
    pop ebp

    ret
