;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised set of sse2 yuv to rgb converters
; does 8 pixels per loop
;
; NASM syntax, IA-32 cdecl: all arguments are on the stack, the caller
; cleans up.  Each routine saves/restores the general registers it uses;
; xmm0-xmm7 are clobbered.

; inputer:
; reads 128 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in xmm0 (8 words y0..y7)
; the u and v values converted to 16 bit and interleaved into xmm3
; as u0,v0,u1,v1,... - one u/v pair shared by two adjacent pixels
; (NOTE(review): the old header said u -> xmm1, v -> xmm2; the code
; below actually builds the combined u0v0 layout in xmm3)

; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5

; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha
; (byte order in memory is b,g,r,0 - see rgba32sse2output)

; xmm6 used for scratch
; xmm7 used for scratch

; Declare an exported, 16-byte-aligned entry point.  The label gets a
; leading underscore to match C symbol mangling on platforms that expect
; it; the bare name is %define'd to the prefixed one so the rest of the
; file can keep using the unprefixed name.
%macro cglobal 1
	global _%1
	%define %1 _%1
	align 16
%1:
%endmacro

SECTION .data align=16

; 8 words of 16 - bias for the (currently disabled) "y = y - 16" step
; in yuv2rgbsse2.
Const16	dw	16
	dw	16
	dw	16
	dw	16
	dw	16
	dw	16
	dw	16
	dw	16

; 8 words of 128 - subtracted from the interleaved u/v words to re-center
; chroma around zero.
Const128	dw	128
	dw	128
	dw	128
	dw	128
	dw	128
	dw	128
	dw	128
	dw	128

; Chroma coefficients in 4.12 fixed point (value = coefficient * 4096),
; laid out as (u-coef, v-coef) word pairs so that one pmaddwd against the
; u0,v0,u1,v1,... register yields one 32-bit offset per u/v pair:
;   R offset = 0*u + 1.402*v   -> (0, 5743)
RConst	dw	0
	dw	5743
	dw	0
	dw	5743
	dw	0
	dw	5743
	dw	0
	dw	5743

;   G offset = -0.344*u + -0.714*v -> (-1409, -2925)
GConst	dw	-1409
	dw	-2925
	dw	-1409
	dw	-2925
	dw	-1409
	dw	-2925
	dw	-1409
	dw	-2925

;   B offset = 1.772*u + 0*v   -> (7258, 0)
BConst	dw	7258
	dw	0
	dw	7258
	dw	0
	dw	7258
	dw	0
	dw	7258
	dw	0

; pshufb control used by Test_SSE2: duplicates the low word of each dword
; into both word lanes (byte pattern 00 01 00 01, 04 05 04 05, ...).
; NOTE(review): pshufb is SSSE3, not SSE2 - see Test_SSE2 below.
shuffconst	db	0x0
	db	0x01
	db	0x00
	db	0x01
	db	0x04
	db	0x05
	db	0x04
	db	0x05
	db	0x08
	db	0x09
	db	0x08
	db	0x09
	db	0x0c
	db	0x0d
	db	0x0c
	db	0x0d

; pshufb control selecting the even bytes (the y's of a yuyv stream) and
; zeroing the odd lanes (0x80 => lane cleared).
; NOTE(review): not referenced by any routine in this file - presumably
; kept for other converters; confirm before removing.
YMask	db	0x00
	db	0x80
	db	0x02
	db	0x80
	db	0x04
	db	0x80
	db	0x06
	db	0x80
	db	0x08
	db	0x80
	db	0x0a
	db	0x80
	db	0x0c
	db	0x80
	db	0x0e
	db	0x80

; pshufb control selecting the odd bytes (the u/v's of a yuyv stream).
; NOTE(review): also unreferenced in this file.
UVMask	db	0x01
	db	0x80
	db	0x03
	db	0x80
	db	0x05
	db	0x80
	db	0x07
	db	0x80
	db	0x09
	db	0x80
	db	0x0b
	db	0x80
	db	0x0d
	db	0x80
	db	0x0f
	db	0x80

; conversion code
;
; In:  xmm0 = 8 y values as words
;      xmm3 = interleaved chroma words u0,v0,u1,v1,u2,v2,u3,v3
; Out: xmm3/xmm4/xmm5 = r/g/b as 8 words each (saturated, may still
;      exceed 0..255 until the outputer packs them)
; Clobbers xmm4, xmm5.
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + 0 * u + 1.402 * v
; g = y + -0.344 * u + -0.714 * v
; b = y + 1.772 * u + 0 * v
; subtract 16 from y
;	psubsw xmm0, [Const16]		; y = y - 16 (disabled: full-range y assumed)
; subtract 128 from u and v
	psubsw xmm3, [Const128]		; u = u - 128, v = v - 128

	movdqa xmm4, xmm3		; duplicate u/v for the G term
	pshufd xmm5, xmm3, 0xE4		; duplicate u/v for the B term (0xE4 = identity shuffle)

	pmaddwd xmm3, [RConst]		; multiply and add the (u,v) pairs
	pmaddwd xmm4, [GConst]		; to get one 32-bit R/G/B offset
	pmaddwd xmm5, [BConst]		; per u/v pair

	psrad xmm3, 12			; scale back from 4.12 fixed point
	psrad xmm4, 12			;
	psrad xmm5, 12			;

	pshuflw xmm3, xmm3, 0xa0	; broadcast each dword's low word to both
	pshufhw xmm3, xmm3, 0xa0	; word lanes: every chroma offset is shared
	pshuflw xmm4, xmm4, 0xa0	; by the two pixels its u/v pair covers,
	pshufhw xmm4, xmm4, 0xa0	; turning 4 offsets into 8 words that line
	pshuflw xmm5, xmm5, 0xa0	; up with the 8 y words
	pshufhw xmm5, xmm5, 0xa0

	paddsw xmm3, xmm0		; add each offset to y (signed saturate)
	paddsw xmm4, xmm0		;
	paddsw xmm5, xmm0		;
%endmacro

; outputer
;
; In:  xmm3/xmm4/xmm5 = r/g/b as 8 words each, edi = output pointer
;      (must be 16-byte aligned for movntdq)
; Out: 32 bytes at [edi], 8 pixels stored as b,g,r,0 per pixel.
; Clobbers xmm0, xmm7.
; NOTE(review): movntdq stores are weakly ordered and no sfence is issued
; before returning - confirm callers do not rely on store ordering.
%macro rgba32sse2output 0
; clamp values
	pxor xmm7,xmm7
	packuswb xmm3,xmm7		; clamp to 0,255 and pack R to 8 bit per pixel
	packuswb xmm4,xmm7		; clamp to 0,255 and pack G to 8 bit per pixel
	packuswb xmm5,xmm7		; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
	punpcklbw xmm5,xmm4		; bgbgbgbgbgbgbgbg
	movdqa xmm0, xmm5		; save bg values
	punpcklbw xmm3,xmm7		; r0r0r0r0r0r0r0r0
	punpcklwd xmm5,xmm3		; lower half bgr0bgr0bgr0bgr0
	punpckhwd xmm0,xmm3		; upper half bgr0bgr0bgr0bgr0
; write to output ptr
	movntdq [edi], xmm5		; output first 4 pixels bypassing cache
	movntdq [edi+16], xmm0		; output second 4 pixels bypassing cache
%endmacro

; Stack-argument offsets (relative to ebp after the standard prologue).
; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
%define width ebp+16
%define toPtr ebp+12
%define fromPtr ebp+8

; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define width1 ebp+24
%define toPtr1 ebp+20
%define fromVPtr ebp+16
%define fromUPtr ebp+12
%define fromYPtr ebp+8

SECTION .text align=16

; Convert packed YUV422 (yuyv) to 32-bit pixels, 8 pixels per iteration.
; fromPtr must be 16-byte aligned (movdqa load) and toPtr 16-byte aligned
; (movntdq store).  width is truncated to a multiple of 8; any remaining
; 1-7 pixels are NOT converted.  width is treated as signed (jng).
cglobal Convert_YUV422_RGBA32_SSE2
; reserve variables
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx

	mov esi, [fromPtr]
	mov edi, [toPtr]
	mov ecx, [width]
; loop width / 8 times
	shr ecx,3
	test ecx,ecx
	jng ENDLOOP			; skip loop entirely for width < 8
REPEATLOOP:				; loop over width / 8
	prefetchnta [esi+256]		; prefetch a couple of iterations ahead
; YUV422 packed inputer
	movdqa xmm0, [esi]		; should have yuyv yuyv yuyv yuyv
	pshufd xmm3, xmm0, 0xE4		; copy to xmm3 (0xE4 = identity shuffle)
; extract y
	pxor xmm7, xmm7			; 00000000000000000000000000000000
	pcmpeqd xmm6, xmm6		; ffffffffffffffffffffffffffffffff
	punpcklbw xmm6, xmm7		; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
	pand xmm0, xmm6			; clear all but y values leaving y0y0 etc
; extract u and v
	psllw xmm6, 8			; 00ff00ff00ff00ff00ff00ff00ff00ff
	pand xmm3, xmm6			; extract uv values 0u0v0u0v0u0v0u0v0u0v
	psrlw xmm3, 8			; convert to 16bit u0v0u0v0u0v0u0v0u0v0

yuv2rgbsse2

rgba32sse2output

; endloop
	add edi,32			; 8 pixels * 4 bytes out
	add esi,16			; 8 pixels * 2 bytes in
	sub ecx, 1			; sub rather than dec: writes all flags,
					; avoiding dec's partial-flag update
	jnz REPEATLOOP
ENDLOOP:
; Cleanup
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

; Convert planar YUV420 to 32-bit pixels, 8 pixels per iteration: reads
; 8 y, 4 u and 4 v bytes, i.e. chroma is 2:1 subsampled horizontally.
; Vertical (row) subsampling is not handled here - presumably the caller
; passes the same u/v rows for two consecutive y rows; confirm at call
; sites.  toPtr1 must be 16-byte aligned (movntdq); the movq/movd input
; loads have no alignment requirement.  width is truncated to a multiple
; of 8 as above.
cglobal Convert_YUV420P_RGBA32_SSE2
; reserve variables
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx
	push eax
	push ebx

	mov esi, [fromYPtr]
	mov eax, [fromUPtr]
	mov ebx, [fromVPtr]
	mov edi, [toPtr1]
	mov ecx, [width1]
; loop width / 8 times
	shr ecx,3
	test ecx,ecx
	jng ENDLOOP1			; skip loop entirely for width < 8
REPEATLOOP1:				; loop over width / 8
; YUV420 Planar inputer
	movq xmm0, [esi]		; fetch 8 y values (8 bit) yyyyyyyy00000000
	movd xmm3, [eax]		; fetch 4 u values (8 bit) uuuu000000000000
	movd xmm1, [ebx]		; fetch 4 v values (8 bit) vvvv000000000000

; extract y
	pxor xmm7, xmm7			; 00000000000000000000000000000000
	punpcklbw xmm0, xmm7		; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0

; combine u and v
	punpcklbw xmm3, xmm1		; uvuvuvuv00000000
	punpcklbw xmm3, xmm7		; u0v0u0v0u0v0u0v0

yuv2rgbsse2

rgba32sse2output

; endloop
	add edi,32			; 8 pixels * 4 bytes out
	add esi,8			; 8 y bytes
	add eax,4			; 4 u bytes
	add ebx,4			; 4 v bytes
	sub ecx, 1			; sub rather than dec (full flag write)
	jnz REPEATLOOP1
ENDLOOP1:
; Cleanup
	pop ebx
	pop eax
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

; Scratch/debug routine: processes a single 16-byte yuyv block using the
; fromPtr/toPtr offsets of the 422 signature, computes only the RConst
; chroma term (the y add and the pack are commented out) and streams the
; raw 32-bit intermediates to [edi].  Not a usable converter as-is.
; NOTE(review): despite the name, pshufb below is SSSE3, not SSE2 -
; running this on a pre-SSSE3 CPU would #UD; confirm intended targets.
cglobal Test_SSE2
; reserve variables
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx
	push eax
	push ebx

	mov esi, [fromPtr]
	mov edi, [toPtr]

	movdqa xmm0, [esi]		; should have yuyv yuyv yuyv yuyv
	pshufd xmm1, xmm0, 0xE4		; copy to xmm1
	movdqa xmm3, xmm0		; copy to xmm3
; extract y
	pxor xmm7,xmm7			; 00000000000000000000000000000000
	pcmpeqd xmm6,xmm6		; ffffffffffffffffffffffffffffffff
	punpcklbw xmm6,xmm7		; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
	pand xmm0, xmm6			; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
	psrld xmm6,8			; 00ff0000 00ff0000 00ff0000 00ff0000
	pand xmm1, xmm6			; clear all yv values leaving 0u00 etc
	psrld xmm1,8			; shift u down to the low byte: u000
; extract v
	pslld xmm6,16			; 000000ff000000ff 000000ff000000ff
	pand xmm3, xmm6			; clear all yu values leaving 000v etc
	psrld xmm3,8			; shift v down one byte: 00v0
	por xmm3, xmm1			; combine: u and v words per dword

	psubsw xmm3, [Const128]		; u = u - 128, v = v - 128

	pmaddwd xmm3, [RConst]		; multiply and add
	psrad xmm3, 12			; scale back from 4.12 fixed point

	pshufb xmm3, [shuffconst]	; duplicate results (SSSE3!)
;	paddsw xmm3, xmm0		; add to y

;	pxor xmm7,xmm7
;	packuswb xmm3,xmm7		; clamp to 0,255 and pack R to 8 bit per pixel

	movntdq [edi], xmm3		; output first 4 pixels bypassing cache

; Cleanup
	pop ebx
	pop eax
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

; Mark the stack non-executable for GNU toolchains.
SECTION .note.GNU-stack noalloc noexec nowrite progbits