;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised set of sse yuv to rgb converters
; does 4 pixels per loop

; inputer:
; reads 128 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in mm0
; the u values converted to 16 bit and duplicated into mm1
; the v values converted to 16 bit and duplicated into mm2

; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values
; r values in mm3
; g values in mm4
; b values in mm5

; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha

; mm6 used for scratch
; mm7 used for scratch

%macro cglobal 1
	global _%1
	%define %1 _%1
	align 16
%1:
%endmacro

; conversion code
%macro yuv2rgbsse 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
	movq mm7, [Const16]	; loads a constant using data cache (slower on first fetch but then cached)
;	psubsw mm0,mm7	; y = y - 16
; subtract 128 from u and v
	movq mm7, [Const128]	; loads a constant using data cache (slower on first fetch but then cached)
	psubsw mm1,mm7	; u = u - 128
	psubsw mm2,mm7	; v = v - 128
; load r,g,b with y
	movq mm3,mm0	; r = y
	pshufw mm5,mm0, 0xE4	; b = y

; r = r + v + v >> 2 + v >> 3 + v >> 5
	paddsw mm3, mm2	; add v to r
	movq mm7, mm1	; move u to scratch
	pshufw mm6, mm2, 0xE4	; move v to scratch

	psraw mm6,2	; divide v by 4
	paddsw mm3, mm6	; and add to r
	psraw mm6,1	; divide v by 2
	paddsw mm3, mm6	; and add to r
	psraw mm6,2	; divide v by 4
	paddsw mm3, mm6	; and add to r

; b = y + u + u >> 1 + u >> 2 + u >> 6
	paddsw mm5, mm1	; add u to b
	psraw mm7,1	; divide u by 2
	paddsw mm5, mm7	; and add to b
	psraw mm7,1	; divide u by 2
	paddsw mm5, mm7	; and add to b
	psraw mm7,4	; divide u by 16
	paddsw mm5, mm7	; and add to b

; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
	movq mm7,mm2	; move v to scratch
	pshufw mm6,mm1, 0xE4	; move u to scratch
	movq mm4,mm0	; g = y

	psraw mm6,2	; divide u by 4
	psubsw mm4,mm6	; subtract from g
	psraw mm6,2	; divide u by 4
	psubsw mm4,mm6	; subtract from g
	psraw mm6,1	; divide u by 2
	psubsw mm4,mm6	; subtract from g

	psraw mm7,1	; divide v by 2
	psubsw mm4,mm7	; subtract from g
	psraw mm7,2	; divide v by 4
	psubsw mm4,mm7	; subtract from g
	psraw mm7,1	; divide v by 2
	psubsw mm4,mm7	; subtract from g
	psraw mm7,1	; divide v by 2
	psubsw mm4,mm7	; subtract from g
%endmacro
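
; For reference: the shift/add sequences above appear to approximate the usual
; BT.601-style coefficients (the target matrix is an assumption, the source
; does not state it): R = Y + 1.402*V, G = Y - 0.344*U - 0.714*V, B = Y + 1.772*U
;   v + v>>2 + v>>3 + v>>5     = 1.40625  * v   (~ 1.402)
;   u>>2 + u>>4 + u>>5         = 0.34375  * u   (~ 0.344)
;   v>>1 + v>>3 + v>>4 + v>>5  = 0.71875  * v   (~ 0.714)
;   u + u>>1 + u>>2 + u>>6     = 1.765625 * u   (~ 1.772)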

; outputer
%macro rgba32sseoutput 0
; clamp values
	pxor mm7,mm7
	packuswb mm3,mm7	; clamp to 0,255 and pack R to 8 bit per pixel
	packuswb mm4,mm7	; clamp to 0,255 and pack G to 8 bit per pixel
	packuswb mm5,mm7	; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
	punpcklbw mm5,mm4	; bgbgbgbgbgbgbgbg
	movq mm0, mm5	; save bg values
	punpcklbw mm3,mm7	; r0r0r0r0
	punpcklwd mm5,mm3	; lower half bgr0bgr0
	punpckhwd mm0,mm3	; upper half bgr0bgr0
; write to output ptr
	movq [edi], mm5	; output first 2 pixels
	movq [edi+8], mm0	; output second 2 pixels
%endmacro

SECTION .data align=16

Const16		dw 16
		dw 16
		dw 16
		dw 16
		dw 16
		dw 16
		dw 16
		dw 16

Const128	dw 128
		dw 128
		dw 128
		dw 128
		dw 128
		dw 128
		dw 128
		dw 128

; Packed Convert
; void Convert_YUV422_RGBA32_SSE(void *fromPtr, void *toPtr, int width)
width		equ ebp+16
toPtr		equ ebp+12
fromPtr		equ ebp+8

; Planar Convert
; void Convert_YUV420P_RGBA32_SSE(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
width1		equ ebp+24
toPtr1		equ ebp+20
fromVPtr	equ ebp+16
fromUPtr	equ ebp+12
fromYPtr	equ ebp+8

SECTION .text align=16

; YUY2 FOURCC
cglobal Convert_YUV422_RGBA32_SSE
; reserve variables
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx

	mov esi, [fromPtr]
	mov ecx, [width]
	mov edi, [toPtr]
; loop width / 4 times
	shr ecx,2
	test ecx,ecx
	jng ENDLOOP2
REPEATLOOP2:	; loop over width / 4

; YUV422 packed inputer
	movq mm0, [esi]	; should have yuyv yuyv
	pshufw mm1, mm0, 0xE4	; copy to mm1
	movq mm2, mm0	; copy to mm2
; extract y
	pxor mm7,mm7	; 0000000000000000
	pcmpeqb mm6,mm6	; ffffffffffffffff
	punpckhbw mm6,mm7	; interleave mm7 into mm6 ff00ff00ff00ff00
	pand mm0, mm6	; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
	psrld mm6,8	; 00ff0000 00ff0000
	pand mm1, mm6	; clear all yv values leaving 0u00 etc
	psrld mm1,8	; shift u down to get u000
	pshufw mm1,mm1, 0xA0	; copy u values to get u0u0 (SSE not MMX)
; extract v
	pslld mm6,16	; 000000ff000000ff
	pand mm2, mm6	; clear all yu values leaving 000v etc
	psrld mm2,8	; shift v down to get 00v0
	pshufw mm2,mm2, 0xF5	; copy v values to get v0v0 (SSE not MMX)

yuv2rgbsse

rgba32sseoutput

; endloop
	add edi,16
	add esi,8
	sub ecx, 1	; apparently sub is better than dec
	jnz REPEATLOOP2
ENDLOOP2:
; Cleanup
	emms	; reset mmx regs back to float
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

cglobal Convert_YUV420P_RGBA32_SSE
; reserve variables
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx
	push eax
	push ebx

	mov esi, [fromYPtr]
	mov eax, [fromUPtr]
	mov ebx, [fromVPtr]
	mov edi, [toPtr1]
	mov ecx, [width1]
; loop width / 4 times
	shr ecx,2
	test ecx,ecx
	jng ENDLOOP3
REPEATLOOP3:	; loop over width / 4
; YUV420 Planar inputer
	movq mm0, [esi]	; fetch 4 y values (8 bit) yyyy0000
	movd mm1, [eax]	; fetch 2 u values (8 bit) uu000000
	movd mm2, [ebx]	; fetch 2 v values (8 bit) vv000000

; extract y
	pxor mm7,mm7	; 0000000000000000
	punpcklbw mm0,mm7	; interleave mm7 into mm0 y0y0y0y0
; extract u and duplicate so each becomes 0u0u
	punpcklbw mm1,mm7	; interleave mm7 into mm1 u0u00000
	punpcklwd mm1,mm7	; interleave again u000u000
	pshufw mm1,mm1, 0xA0	; copy u values to get u0u0
; extract v
	punpcklbw mm2,mm7	; interleave mm7 into mm2 v0v00000
	punpcklwd mm2,mm7	; interleave again v000v000
	pshufw mm2,mm2, 0xA0	; copy v values to get v0v0

yuv2rgbsse

rgba32sseoutput

; endloop
	add edi,16
	add esi,4
	add eax,2
	add ebx,2
	sub ecx, 1	; apparently sub is better than dec
	jnz REPEATLOOP3
ENDLOOP3:
; Cleanup
	emms
	pop ebx
	pop eax
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

SECTION .note.GNU-stack noalloc noexec nowrite progbits
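
; A hypothetical calling example, for illustration only (the buffer names and
; the multiple-of-4 width are assumptions; the loops simply skip any leftover
; pixels). The packed converter reads 2 bytes per pixel of YUY2 input and
; writes 4 bytes per pixel in B,G,R,0 order, so a C caller might look like:
;
;	uint8_t  yuyv[WIDTH * 2];	/* packed YUY2: Y0 U0 Y1 V0 ... */
;	uint32_t out[WIDTH];		/* one 32 bit pixel per source pixel */
;	Convert_YUV422_RGBA32_SSE(yuyv, out, WIDTH);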