;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised set of yuv to rgb converters
; does 8 pixels at a time (the SSE/MMX versions do 4)

; inputer:
; reads 128 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in xmm0
; the u values converted to 16 bit and duplicated into xmm1
; the v values converted to 16 bit and duplicated into xmm2

; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5

; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha

; xmm6 used for scratch
; xmm7 used for scratch

%macro cglobal 1
	global _%1
	%define %1 _%1
	align 16
%1:
%endmacro

; conversion code
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
	movdqa xmm7, [Const16]		; loads a constant using data cache (slower on first fetch but then cached)
	psubsw xmm0,xmm7		; y = y - 16
; subtract 128 from u and v
	movdqa xmm7, [Const128]		; loads a constant using data cache (slower on first fetch but then cached)
	psubsw xmm1,xmm7		; u = u - 128
	psubsw xmm2,xmm7		; v = v - 128
; load r,b with y
	movdqa xmm3,xmm0		; r = y
	pshufd xmm5,xmm0, 0xE4		; b = y

; r = y + v + v >> 2 + v >> 3 + v >> 5
	paddsw xmm3, xmm2		; add v to r
	movdqa xmm7, xmm1		; move u to scratch
	pshufd xmm6, xmm2, 0xE4		; move v to scratch

	psraw xmm6,2			; divide v by 4 (v >> 2)
	paddsw xmm3, xmm6		; and add to r
	psraw xmm6,1			; divide by 2 again (v >> 3)
	paddsw xmm3, xmm6		; and add to r
	psraw xmm6,2			; divide by 4 again (v >> 5)
	paddsw xmm3, xmm6		; and add to r

; b = y + u + u >> 1 + u >> 2 + u >> 6
	paddsw xmm5, xmm1		; add u to b
	psraw xmm7,1			; divide u by 2 (u >> 1)
	paddsw xmm5, xmm7		; and add to b
	psraw xmm7,1			; divide by 2 again (u >> 2)
	paddsw xmm5, xmm7		; and add to b
	psraw xmm7,4			; divide by 16 again (u >> 6)
	paddsw xmm5, xmm7		; and add to b

; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
	movdqa xmm7,xmm2		; move v to scratch
	pshufd xmm6,xmm1, 0xE4		; move u to scratch
	movdqa xmm4,xmm0		; g = y

	psraw xmm6,2			; divide u by 4 (u >> 2)
	psubsw xmm4,xmm6		; subtract from g
	psraw xmm6,2			; divide by 4 again (u >> 4)
	psubsw xmm4,xmm6		; subtract from g
	psraw xmm6,1			; divide by 2 again (u >> 5)
	psubsw xmm4,xmm6		; subtract from g

	psraw xmm7,1			; divide v by 2 (v >> 1)
	psubsw xmm4,xmm7		; subtract from g
	psraw xmm7,2			; divide by 4 again (v >> 3)
	psubsw xmm4,xmm7		; subtract from g
	psraw xmm7,1			; divide by 2 again (v >> 4)
	psubsw xmm4,xmm7		; subtract from g
	psraw xmm7,1			; divide by 2 again (v >> 5)
	psubsw xmm4,xmm7		; subtract from g
%endmacro
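; For reference (added commentary, not part of the original comments): the
; shift/add chains above implement these effective fixed-point multipliers,
; which approximate the usual full-range YCbCr -> RGB coefficients:
;   r: v * (1 + 1/4 + 1/8 + 1/32)    = 1.40625  * v   (~1.402)
;   g: u * (1/4 + 1/16 + 1/32)       = 0.34375  * u   (~0.344)
;      v * (1/2 + 1/8 + 1/16 + 1/32) = 0.71875  * v   (~0.714)
;   b: u * (1 + 1/2 + 1/4 + 1/64)    = 1.765625 * u   (~1.772)
; i.e. r = y + 1.40625 v, g = y - 0.34375 u - 0.71875 v, b = y + 1.765625 u
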
; conversion code
%macro yuv2rgbsse 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
	movq mm7, [Const16]	; loads a constant using data cache (slower on first fetch but then cached)
	psubsw mm0,mm7		; y = y - 16
; subtract 128 from u and v
	movq mm7, [Const128]	; loads a constant using data cache (slower on first fetch but then cached)
	psubsw mm1,mm7		; u = u - 128
	psubsw mm2,mm7		; v = v - 128
; load r,b with y
	movq mm3,mm0		; r = y
	pshufw mm5,mm0, 0xE4	; b = y

; r = y + v + v >> 2 + v >> 3 + v >> 5
	paddsw mm3, mm2		; add v to r
	movq mm7, mm1		; move u to scratch
	pshufw mm6, mm2, 0xE4	; move v to scratch

	psraw mm6,2		; divide v by 4 (v >> 2)
	paddsw mm3, mm6		; and add to r
	psraw mm6,1		; divide by 2 again (v >> 3)
	paddsw mm3, mm6		; and add to r
	psraw mm6,2		; divide by 4 again (v >> 5)
	paddsw mm3, mm6		; and add to r

; b = y + u + u >> 1 + u >> 2 + u >> 6
	paddsw mm5, mm1		; add u to b
	psraw mm7,1		; divide u by 2 (u >> 1)
	paddsw mm5, mm7		; and add to b
	psraw mm7,1		; divide by 2 again (u >> 2)
	paddsw mm5, mm7		; and add to b
	psraw mm7,4		; divide by 16 again (u >> 6)
	paddsw mm5, mm7		; and add to b

; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
	movq mm7,mm2		; move v to scratch
	pshufw mm6,mm1, 0xE4	; move u to scratch
	movq mm4,mm0		; g = y

	psraw mm6,2		; divide u by 4 (u >> 2)
	psubsw mm4,mm6		; subtract from g
	psraw mm6,2		; divide by 4 again (u >> 4)
	psubsw mm4,mm6		; subtract from g
	psraw mm6,1		; divide by 2 again (u >> 5)
	psubsw mm4,mm6		; subtract from g

	psraw mm7,1		; divide v by 2 (v >> 1)
	psubsw mm4,mm7		; subtract from g
	psraw mm7,2		; divide by 4 again (v >> 3)
	psubsw mm4,mm7		; subtract from g
	psraw mm7,1		; divide by 2 again (v >> 4)
	psubsw mm4,mm7		; subtract from g
	psraw mm7,1		; divide by 2 again (v >> 5)
	psubsw mm4,mm7		; subtract from g
%endmacro

; outputer
%macro rgba32sse2output 0
; clamp values
	pxor xmm7,xmm7
	packuswb xmm3,xmm7	; clamp to 0,255 and pack R to 8 bit per pixel
	packuswb xmm4,xmm7	; clamp to 0,255 and pack G to 8 bit per pixel
	packuswb xmm5,xmm7	; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
	punpcklbw xmm5,xmm4	; bgbgbgbgbgbgbgbg
	movdqa xmm0, xmm5	; save bg values
	punpcklbw xmm3,xmm7	; r0r0r0r0r0r0r0r0
	punpcklwd xmm5,xmm3	; lower half bgr0bgr0bgr0bgr0
	punpckhwd xmm0,xmm3	; upper half bgr0bgr0bgr0bgr0
; write to output ptr
	movntdq [edi], xmm5	; output first 4 pixels bypassing cache
	movntdq [edi+16], xmm0	; output second 4 pixels bypassing cache
%endmacro

; outputer
%macro rgba32sseoutput 0
; clamp values
	pxor mm7,mm7
	packuswb mm3,mm7	; clamp to 0,255 and pack R to 8 bit per pixel
	packuswb mm4,mm7	; clamp to 0,255 and pack G to 8 bit per pixel
	packuswb mm5,mm7	; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
	punpcklbw mm5,mm4	; bgbgbgbg
	movq mm0, mm5		; save bg values
	punpcklbw mm3,mm7	; r0r0r0r0
	punpcklwd mm5,mm3	; lower half bgr0bgr0
	punpckhwd mm0,mm3	; upper half bgr0bgr0
; write to output ptr
	movq [edi], mm5		; output first 2 pixels
	movq [edi+8], mm0	; output second 2 pixels
%endmacro

SECTION .data align=16

Const16	dw 16
	dw 16
	dw 16
	dw 16
	dw 16
	dw 16
	dw 16
	dw 16

Const128	dw 128
	dw 128
	dw 128
	dw 128
	dw 128
	dw 128
	dw 128
	dw 128

; void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width)
width equ ebp+16
toPtr equ ebp+12
fromPtr equ ebp+8

; void Convert_YUV420P_RGBA32_SSE2(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
width1 equ ebp+24
toPtr1 equ ebp+20
fromVPtr equ ebp+16
fromUPtr equ ebp+12
fromYPtr equ ebp+8
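
; Note (added commentary, not in the original): arguments are read from the
; stack relative to ebp (cdecl-style), hence the equ offsets above.  width is
; rounded down to a multiple of 8 pixels (SSE2 routines) or 4 pixels (SSE/MMX
; routines); any leftover pixels are left unconverted.  The SSE2 routines load
; packed YUV422 data with movdqa and store with movntdq, both of which require
; 16-byte aligned pointers, so fromPtr and toPtr/toPtr1 are assumed to be
; 16-byte aligned.
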

SECTION .text align=16

cglobal Convert_YUV422_RGBA32_SSE2
; reserve variables
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx

	mov esi, [fromPtr]
	mov edi, [toPtr]
	mov ecx, [width]
; loop width / 8 times
	shr ecx,3
	test ecx,ecx
	jng ENDLOOP
REPEATLOOP:			; loop over width / 8
; YUV422 packed inputer
	movdqa xmm0, [esi]	; should have yuyv yuyv yuyv yuyv
	pshufd xmm1, xmm0, 0xE4	; copy to xmm1
	movdqa xmm2, xmm0	; copy to xmm2
; extract y
	pxor xmm7,xmm7		; 00000000000000000000000000000000
	pcmpeqd xmm6,xmm6	; ffffffffffffffffffffffffffffffff
	punpcklbw xmm6,xmm7	; interleave xmm7 into xmm6 ff00ff00ff00ff00ff00ff00ff00ff00
	pand xmm0, xmm6		; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
	psrld xmm6,8		; 00ff0000 00ff0000 00ff0000 00ff0000
	pand xmm1, xmm6		; clear all yv values leaving 0u00 etc
	psrld xmm1,8		; shift u down to get u000
	pshuflw xmm1,xmm1, 0xA0	; copy u values
	pshufhw xmm1,xmm1, 0xA0	; to get u0u0
; extract v
	pslld xmm6,16		; 000000ff000000ff 000000ff000000ff
	pand xmm2, xmm6		; clear all yu values leaving 000v etc
	psrld xmm2,8		; shift v down to get 00v0
	pshuflw xmm2,xmm2, 0xF5	; copy v values
	pshufhw xmm2,xmm2, 0xF5	; to get v0v0

yuv2rgbsse2

rgba32sse2output

; endloop
	add edi,32
	add esi,16
	sub ecx, 1		; apparently sub is better than dec
	jnz REPEATLOOP
ENDLOOP:
; Cleanup
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

cglobal Convert_YUV420P_RGBA32_SSE2
; reserve variables
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx
	push eax
	push ebx

	mov esi, [fromYPtr]
	mov eax, [fromUPtr]
	mov ebx, [fromVPtr]
	mov edi, [toPtr1]
	mov ecx, [width1]
; loop width / 8 times
	shr ecx,3
	test ecx,ecx
	jng ENDLOOP1
REPEATLOOP1:			; loop over width / 8
; YUV420 Planar inputer
	movq xmm0, [esi]	; fetch 8 y values (8 bit) yyyyyyyy00000000
	movd xmm1, [eax]	; fetch 4 u values (8 bit) uuuu000000000000
	movd xmm2, [ebx]	; fetch 4 v values (8 bit) vvvv000000000000

; extract y
	pxor xmm7,xmm7		; 00000000000000000000000000000000
	punpcklbw xmm0,xmm7	; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; extract u and duplicate so each becomes 0u0u
	punpcklbw xmm1,xmm7	; interleave xmm7 into xmm1 u0u0u0u000000000
	punpcklwd xmm1,xmm7	; interleave again u000u000u000u000
	pshuflw xmm1,xmm1, 0xA0	; copy u values
	pshufhw xmm1,xmm1, 0xA0	; to get u0u0
; extract v
	punpcklbw xmm2,xmm7	; interleave xmm7 into xmm2 v0v0v0v000000000
	punpcklwd xmm2,xmm7	; interleave again v000v000v000v000
	pshuflw xmm2,xmm2, 0xA0	; copy v values
	pshufhw xmm2,xmm2, 0xA0	; to get v0v0

yuv2rgbsse2

rgba32sse2output

; endloop
	add edi,32
	add esi,8
	add eax,4
	add ebx,4
	sub ecx, 1		; apparently sub is better than dec
	jnz REPEATLOOP1
ENDLOOP1:
; Cleanup
	pop ebx
	pop eax
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret
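
; Note (added commentary, not in the original): the shuffle immediates used in
; this file encode four 2-bit source element indices (dwords for pshufd, 16-bit
; words for pshufw/pshuflw/pshufhw), listed here for destination elements 3..0:
;   0xE4 (11 10 01 00) selects 3,2,1,0 - a plain register copy
;   0xA0 (10 10 00 00) selects 2,2,0,0 - duplicates the even words
;   0xF5 (11 11 01 01) selects 3,3,1,1 - duplicates the odd words
; the last two are how each u and v value gets broadcast to a pair of pixels.
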

cglobal Convert_YUV422_RGBA32_SSE
; reserve variables
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx

	mov esi, [fromPtr]
	mov ecx, [width]
	mov edi, [toPtr]
; loop width / 4 times
	shr ecx,2
	test ecx,ecx
	jng ENDLOOP2
REPEATLOOP2:			; loop over width / 4

; YUV422 packed inputer
	movq mm0, [esi]		; should have yuyv yuyv
	pshufw mm1, mm0, 0xE4	; copy to mm1
	movq mm2, mm0		; copy to mm2
; extract y
	pxor mm7,mm7		; 0000000000000000
	pcmpeqb mm6,mm6		; ffffffffffffffff
	punpckhbw mm6,mm7	; interleave mm7 into mm6 ff00ff00ff00ff00
	pand mm0, mm6		; clear all but y values leaving y0y0 etc
; extract u and duplicate so each u in yuyv becomes 0u0u
	psrld mm6,8		; 00ff0000 00ff0000
	pand mm1, mm6		; clear all yv values leaving 0u00 etc
	psrld mm1,8		; shift u down to get u000
	pshufw mm1,mm1, 0xA0	; copy u values to get u0u0 (SSE not MMX)
; extract v
	pslld mm6,16		; 000000ff000000ff
	pand mm2, mm6		; clear all yu values leaving 000v etc
	psrld mm2,8		; shift v down to get 00v0
	pshufw mm2,mm2, 0xF5	; copy v values to get v0v0 (SSE not MMX)

yuv2rgbsse

rgba32sseoutput

; endloop
	add edi,16
	add esi,8
	sub ecx, 1		; apparently sub is better than dec
	jnz REPEATLOOP2
ENDLOOP2:
; Cleanup
	emms			; empty MMX state so the FPU can be used again
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

cglobal Convert_YUV420P_RGBA32_SSE
; reserve variables
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx
	push eax
	push ebx

	mov esi, [fromYPtr]
	mov eax, [fromUPtr]
	mov ebx, [fromVPtr]
	mov edi, [toPtr1]
	mov ecx, [width1]
; loop width / 4 times
	shr ecx,2
	test ecx,ecx
	jng ENDLOOP3
REPEATLOOP3:			; loop over width / 4
; YUV420 Planar inputer
	movq mm0, [esi]		; fetch 8 y values (8 bit), only the low 4 are used
	movd mm1, [eax]		; fetch 4 u values (8 bit), only the low 2 are used
	movd mm2, [ebx]		; fetch 4 v values (8 bit), only the low 2 are used

; extract y
	pxor mm7,mm7		; 0000000000000000
	punpcklbw mm0,mm7	; interleave mm7 into mm0 y0y0y0y0
; extract u and duplicate so each becomes 0u0u
	punpcklbw mm1,mm7	; interleave mm7 into mm1 u0u0u0u0
	punpcklwd mm1,mm7	; interleave again u000u000
	pshufw mm1,mm1, 0xA0	; copy u values to get u0u0
; extract v
	punpcklbw mm2,mm7	; interleave mm7 into mm2 v0v0v0v0
	punpcklwd mm2,mm7	; interleave again v000v000
	pshufw mm2,mm2, 0xA0	; copy v values to get v0v0

yuv2rgbsse

rgba32sseoutput

; endloop
	add edi,16
	add esi,4
	add eax,2
	add ebx,2
	sub ecx, 1		; apparently sub is better than dec
	jnz REPEATLOOP3
ENDLOOP3:
; Cleanup
	emms			; empty MMX state so the FPU can be used again
	pop ebx
	pop eax
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

SECTION .note.GNU-stack noalloc noexec nowrite progbits
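
; Example C-side usage (added sketch, not part of the original file; assumes the
; leading underscore added by the cglobal macro matches the compiler's own C
; symbol decoration, as it does on typical 32-bit x86 targets):
;
;   extern void Convert_YUV422_RGBA32_SSE2(void *fromPtr, void *toPtr, int width);
;
;   /* convert one row of packed yuyv into 32-bit pixels laid out b,g,r,0 */
;   /* in memory, as produced by the rgba32sse2output macro above          */
;   Convert_YUV422_RGBA32_SSE2(yuyv_row, out_row, width);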