;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised set of ssse3 yuv to rgb converters
; does 8 pixels per loop
;
; Syntax: NASM (Intel operand order). 32-bit x86 code; all arguments are
; read from the stack relative to ebp (first argument at [ebp+8]).
;
; inputer (inlined in each function):
;       reads 128 bits of yuv 8 bit data (8 pixels) and puts
;       the 8 y values zero-extended to 16 bit in xmm0
;       the 4 u/v pairs zero-extended to 16 bit, interleaved as
;       u0 v0 u1 v1 u2 v2 u3 v3, in xmm3
;
; conversion (yuv2rgbssse3):
;       does the yuv to rgb conversion using 16 bit fixed point and
;       leaves one signed 16 bit value per pixel in
;       r values in xmm3
;       g values in xmm4
;       b values in xmm5
;
; outputer (rgba32ssse3output):
;       clamps r, g, b to 0..255 and writes 8 pixels (32 bytes) to [edi]
;       as bytes b, g, r, 0 per pixel (0 for alpha)
;
; xmm7 is used as an all-zero register
; xmm0 is reused as scratch by the outputer, xmm1 by the planar inputer

; cglobal name
; Exports "name" with a leading underscore, makes the plain name an alias
; for the underscored symbol, aligns to 16 bytes and emits the entry label.
%macro  cglobal 1
        global  _%1
        %define %1 _%1
        align 16
%1:
%endmacro

SECTION .data align=16

; Every table below is exactly 16 bytes, so with the section aligned to 16
; each one stays 16-byte aligned, which the SSE memory operands that
; reference them (psubsw / pmaddwd / pshufb xmm, [mem]) require.

Const16         dw      16, 16, 16, 16, 16, 16, 16, 16          ; only used by the disabled "y - 16" step

Const128        dw      128, 128, 128, 128, 128, 128, 128, 128  ; chroma bias removed from u and v

; pshufb masks: each byte selects a source byte index, 0x80 produces a zero byte.

; bytes 1,5,9,13 each widened to 16 bit and duplicated (not referenced by the code in this file)
UMask           db      0x01, 0x80, 0x01, 0x80, 0x05, 0x80, 0x05, 0x80
                db      0x09, 0x80, 0x09, 0x80, 0x0d, 0x80, 0x0d, 0x80

; bytes 3,7,11,15 each widened to 16 bit and duplicated (not referenced by the code in this file)
VMask           db      0x03, 0x80, 0x03, 0x80, 0x07, 0x80, 0x07, 0x80
                db      0x0b, 0x80, 0x0b, 0x80, 0x0f, 0x80, 0x0f, 0x80

; even bytes (the y samples of packed yuyv) widened to 16 bit
YMask           db      0x00, 0x80, 0x02, 0x80, 0x04, 0x80, 0x06, 0x80
                db      0x08, 0x80, 0x0a, 0x80, 0x0c, 0x80, 0x0e, 0x80

; odd bytes (the u and v samples of packed yuyv) widened to 16 bit
UVMask          db      0x01, 0x80, 0x03, 0x80, 0x05, 0x80, 0x07, 0x80
                db      0x09, 0x80, 0x0b, 0x80, 0x0d, 0x80, 0x0f, 0x80

; copies the low 16 bits of every dword into both words of that dword
shuffconst      db      0x0, 0x01, 0x00, 0x01, 0x04, 0x05, 0x04, 0x05
                db      0x08, 0x09, 0x08, 0x09, 0x0c, 0x0d, 0x0c, 0x0d

; Coefficient pairs (u multiplier, v multiplier), scaled by 4096 to match
; the "psrad 12" in the conversion macro.
RConst          dw      0, 5743, 0, 5743, 0, 5743, 0, 5743

GConst          dw      -1409, -2925, -1409, -2925, -1409, -2925, -1409, -2925

BConst          dw      7258, 0, 7258, 0, 7258, 0, 7258, 0

; conversion code
;
; In:    xmm0 = 8 y values, 16 bit each
;        xmm3 = u0 v0 u1 v1 u2 v2 u3 v3, 16 bit each
; Out:   xmm3 = r, xmm4 = g, xmm5 = b (8 signed 16 bit values each, unclamped)
; Keeps: xmm0
%macro yuv2rgbssse3 0
; u = u - 128
; v = v - 128
; r = y + 0 * u + 1.403 * v
; g = y + -0.344 * u + -0.714 * v
; b = y + 1.773 * u + 0 * v
; subtract 128 from u and v
        psubsw  xmm3, [Const128]        ; u = u - 128, v = v - 128

        pshufd  xmm5, xmm3, 0xE4        ; duplicate (0xE4 is the identity shuffle)
        movdqa  xmm4, xmm3              ; duplicate

; subtract 16 from y (disabled)
;       psubsw  xmm0, [Const16]         ; y = y - 16

        pmaddwd xmm3, [RConst]          ; per u/v pair: cu * u + cv * v as 32 bit
        pmaddwd xmm4, [GConst]          ; to get RGB offsets to Y
        pmaddwd xmm5, [BConst]          ;

        psrad   xmm3, 12                ; Scale back to original range
        psrad   xmm4, 12                ;
        psrad   xmm5, 12                ;

        pshufb  xmm3, [shuffconst]      ; duplicate results:
        pshufb  xmm4, [shuffconst]      ; each chroma offset serves 2 y values
        pshufb  xmm5, [shuffconst]      ;

        paddsw  xmm3, xmm0              ; and add to y
        paddsw  xmm4, xmm0              ;
        paddsw  xmm5, xmm0              ;
%endmacro

; outputer
;
; In:    xmm3 = r, xmm4 = g, xmm5 = b (8 signed 16 bit values each)
;        edi  = destination; movntdq faults unless it is 16-byte aligned
; Out:   32 bytes at [edi]: b g r 0 for each of the 8 pixels
; Clobb: xmm0, xmm3, xmm4, xmm5, xmm7
%macro rgba32ssse3output 0
; clamp values
        pxor            xmm7, xmm7
        packuswb        xmm3, xmm7      ; clamp to 0,255 and pack R to 8 bit per pixel
        packuswb        xmm4, xmm7      ; clamp to 0,255 and pack G to 8 bit per pixel
        packuswb        xmm5, xmm7      ; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
        punpcklbw       xmm5, xmm4      ; bgbgbgbgbgbgbgbg
        movdqa          xmm0, xmm5      ; save bg values
        punpcklbw       xmm3, xmm7      ; r0r0r0r0r0r0r0r0
        punpcklwd       xmm5, xmm3      ; lower half bgr0bgr0bgr0bgr0
        punpckhwd       xmm0, xmm3      ; upper half bgr0bgr0bgr0bgr0
; write to output ptr
        movntdq         [edi], xmm5     ; output first 4 pixels bypassing cache
        movntdq         [edi+16], xmm0  ; output second 4 pixels bypassing cache
%endmacro


; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
%define width   ebp+16
%define toPtr   ebp+12
%define fromPtr ebp+8

; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
%define width1   ebp+24
%define toPtr1   ebp+20
%define fromVPtr ebp+16
%define fromUPtr ebp+12
%define fromYPtr ebp+8

SECTION .text align=16

;-----------------------------------------------------------------------
; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
;
; Converts one row of packed yuyv data to 32 bit b,g,r,0 pixels.
;
; In:    fromPtr = packed source, 16 bytes consumed per iteration; read
;                  with movdqa, so it must be 16-byte aligned
;        toPtr   = destination, 32 bytes written per iteration; written
;                  with movntdq, so it must be 16-byte aligned
;        width   = pixel count; width / 8 iterations are run, so a
;                  remainder of up to 7 pixels is left unconverted;
;                  width <= 0 does nothing
; Regs:  esi = source cursor, edi = destination cursor,
;        ecx = iterations remaining (all three restored on exit)
; Clobb: xmm0, xmm3, xmm4, xmm5, xmm7, flags
; NOTE(review): the non-temporal stores are not followed by an sfence;
; confirm whether callers that hand the buffer to another thread need one.
;-----------------------------------------------------------------------
cglobal Convert_YUV422_RGBA32_SSSE3
; reserve variables
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi
        push    ecx

        mov     esi, [fromPtr]
        mov     edi, [toPtr]
        mov     ecx, [width]
; loop width / 8 times
; Arithmetic shift keeps the sign of width, so the signed test below
; really rejects negative widths (a logical shift would turn them into a
; huge positive iteration count).
        sar     ecx, 3
        test    ecx, ecx
        jle     .done
.loop:                                  ; loop over width / 8
        prefetchnta [esi+256]
; YUV422 packed inputer
        movdqa  xmm0, [esi]             ; should have yuyv yuyv yuyv yuyv
        pshufd  xmm3, xmm0, 0xE4        ; copy to xmm3
; extract both y giving y0y0
        pshufb  xmm0, [YMask]
; extract u and v to have u0v0
        pshufb  xmm3, [UVMask]

        yuv2rgbssse3

        rgba32ssse3output

; endloop
        add     edi, 32
        add     esi, 16
        sub     ecx, 1                  ; apparently sub is better than dec
        jnz     .loop
.done:
; Cleanup
        pop     ecx
        pop     esi
        pop     edi
        mov     esp, ebp
        pop     ebp
        ret

;-----------------------------------------------------------------------
; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr,
;                                   void *fromVPtr, void *toPtr, int width)
;
; Converts one row of planar y, u, v data to 32 bit b,g,r,0 pixels.
; Each u and v sample is shared by two horizontally adjacent pixels.
;
; In:    fromYPtr = y plane row, 8 bytes consumed per iteration
;        fromUPtr = u plane row, 4 bytes consumed per iteration
;        fromVPtr = v plane row, 4 bytes consumed per iteration
;        toPtr    = destination, 32 bytes written per iteration; written
;                   with movntdq, so it must be 16-byte aligned
;        width    = pixel count; width / 8 iterations are run, so a
;                   remainder of up to 7 pixels is left unconverted;
;                   width <= 0 does nothing
; Regs:  esi = y cursor, eax = u cursor, ebx = v cursor,
;        edi = destination cursor, ecx = iterations remaining
;        (all five restored on exit)
; Clobb: xmm0, xmm1, xmm3, xmm4, xmm5, xmm7, flags
; NOTE(review): the non-temporal stores are not followed by an sfence;
; confirm whether callers that hand the buffer to another thread need one.
;-----------------------------------------------------------------------
cglobal Convert_YUV420P_RGBA32_SSSE3
; reserve variables
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi
        push    ecx
        push    eax
        push    ebx

        mov     esi, [fromYPtr]
        mov     eax, [fromUPtr]
        mov     ebx, [fromVPtr]
        mov     edi, [toPtr1]
        mov     ecx, [width1]
; loop width / 8 times
; Arithmetic shift keeps the sign of width, so the signed test below
; really rejects negative widths (a logical shift would turn them into a
; huge positive iteration count).
        sar     ecx, 3
        test    ecx, ecx
        jle     .done
.loop:                                  ; loop over width / 8
        prefetchnta [esi+256]
        prefetchnta [eax+128]
        prefetchnta [ebx+128]

; YUV420 Planar inputer
        movq    xmm0, [esi]             ; fetch 8 y values (8 bit) yyyyyyyy00000000
        movd    xmm3, [eax]             ; fetch 4 u values (8 bit) uuuu000000000000
        movd    xmm1, [ebx]             ; fetch 4 v values (8 bit) vvvv000000000000

; convert y to 16 bit
        pxor            xmm7, xmm7      ; 00000000000000000000000000000000
        punpcklbw       xmm0, xmm7      ; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0

; combine u and v
        punpcklbw       xmm3, xmm1      ; uvuvuvuv00000000
        punpcklbw       xmm3, xmm7      ; u0v0u0v0u0v0u0v0

        yuv2rgbssse3

        rgba32ssse3output

; endloop
        add     edi, 32
        add     esi, 8
        add     eax, 4
        add     ebx, 4
        sub     ecx, 1                  ; apparently sub is better than dec
        jnz     .loop
.done:
; Cleanup
        pop     ebx
        pop     eax
        pop     ecx
        pop     esi
        pop     edi
        mov     esp, ebp
        pop     ebp
        ret

SECTION .note.GNU-stack noalloc noexec nowrite progbits