;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised set of ssse3 yuv to rgb converters
; does 8 pixels per loop

; inputer:
; reads 128 bits of yuv 8 bit data and puts
; the y values, converted to 16 bit, in xmm0
; the u values, converted to 16 bit and duplicated, in xmm1
; the v values, converted to 16 bit and duplicated, in xmm2

; conversion:
; does the yuv to rgb conversion using 16 bit fixed point; the
; results are placed in the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5

; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha

; xmm6 used for scratch
; xmm7 used for scratch

%macro cglobal 1
	global _%1
	%define %1 _%1
	align 16
%1:
%endmacro

; conversion code
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
; subtract 16 from y
	movdqa xmm7, [Const16]		; load constant via the data cache (slow on first fetch, cached afterwards)
	psubsw xmm0, xmm7		; y = y - 16
; subtract 128 from u and v
	movdqa xmm7, [Const128]		; load constant via the data cache (slow on first fetch, cached afterwards)
	psubsw xmm1, xmm7		; u = u - 128
	psubsw xmm2, xmm7		; v = v - 128
; load r,b with y
	movdqa xmm3, xmm0		; r = y
	pshufd xmm5, xmm0, 0xE4		; b = y (0xE4 is the identity shuffle, used as a copy)

; r = y + v + v >> 2 + v >> 3 + v >> 5
	paddsw xmm3, xmm2		; r += v
	movdqa xmm7, xmm1		; copy u to scratch
	pshufd xmm6, xmm2, 0xE4		; copy v to scratch

	psraw xmm6, 2			; v >> 2
	paddsw xmm3, xmm6		; r += v >> 2
	psraw xmm6, 1			; v >> 3
	paddsw xmm3, xmm6		; r += v >> 3
	psraw xmm6, 2			; v >> 5
	paddsw xmm3, xmm6		; r += v >> 5

; b = y + u + u >> 1 + u >> 2 + u >> 6
	paddsw xmm5, xmm1		; b += u
	psraw xmm7, 1			; u >> 1
	paddsw xmm5, xmm7		; b += u >> 1
	psraw xmm7, 1			; u >> 2
	paddsw xmm5, xmm7		; b += u >> 2
	psraw xmm7, 4			; u >> 6
	paddsw xmm5, xmm7		; b += u >> 6

; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
	movdqa xmm7, xmm2		; copy v to scratch
	pshufd xmm6, xmm1, 0xE4		; copy u to scratch
	movdqa xmm4, xmm0		; g = y

	psraw xmm6, 2			; u >> 2
	psubsw xmm4, xmm6		; g -= u >> 2
	psraw xmm6, 2			; u >> 4
	psubsw xmm4, xmm6		; g -= u >> 4
	psraw xmm6, 1			; u >> 5
	psubsw xmm4, xmm6		; g -= u >> 5

	psraw xmm7, 1			; v >> 1
	psubsw xmm4, xmm7		; g -= v >> 1
	psraw xmm7, 2			; v >> 3
	psubsw xmm4, xmm7		; g -= v >> 3
	psraw xmm7, 1			; v >> 4
	psubsw xmm4, xmm7		; g -= v >> 4
	psraw xmm7, 1			; v >> 5
	psubsw xmm4, xmm7		; g -= v >> 5
%endmacro

; outputer
%macro rgba32sse2output 0
; clamp values
	pxor xmm7, xmm7
	packuswb xmm3, xmm7		; clamp to 0..255 and pack R to 8 bit per pixel
	packuswb xmm4, xmm7		; clamp to 0..255 and pack G to 8 bit per pixel
	packuswb xmm5, xmm7		; clamp to 0..255 and pack B to 8 bit per pixel
; convert to bgra32 packed
	punpcklbw xmm5, xmm4		; bgbgbgbgbgbgbgbg
	movdqa xmm0, xmm5		; save bg values
	punpcklbw xmm3, xmm7		; r0r0r0r0r0r0r0r0
	punpcklwd xmm5, xmm3		; lower half bgr0bgr0bgr0bgr0
	punpckhwd xmm0, xmm3		; upper half bgr0bgr0bgr0bgr0
; write to output ptr
	movntdq [edi], xmm5		; output first 4 pixels bypassing the cache
	movntdq [edi+16], xmm0		; output second 4 pixels bypassing the cache
%endmacro
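; For reference, the shift-and-add cascades in yuv2rgbsse2 implement fixed point
; coefficients of 1 + 1/4 + 1/8 + 1/32 = 1.40625 for v in r,
; 1/4 + 1/16 + 1/32 = 0.34375 (u) and 1/2 + 1/8 + 1/16 + 1/32 = 0.71875 (v) in g,
; and 1 + 1/2 + 1/4 + 1/64 = 1.765625 for u in b, close to the usual 1.402,
; 0.344, 0.714 and 1.772. The scalar C sketch below is not part of the build;
; it only illustrates, per pixel, what yuv2rgbsse2 and rgba32sse2output compute
; (ignoring the 16 bit saturation of paddsw/psubsw), and the helper name and
; signature are illustrative only.
;
;   /* assumes right shifts of negative ints are arithmetic, as psraw is */
;   static void yuv_to_bgra_pixel(int y, int u, int v, unsigned char out[4])
;   {
;       int r, g, b;
;       y -= 16;                              /* same bias as Const16 */
;       u -= 128;                             /* same bias as Const128 */
;       v -= 128;
;       r = y + v + (v >> 2) + (v >> 3) + (v >> 5);
;       g = y - ((u >> 2) + (u >> 4) + (u >> 5))
;             - ((v >> 1) + (v >> 3) + (v >> 4) + (v >> 5));
;       b = y + u + (u >> 1) + (u >> 2) + (u >> 6);
;       /* packuswb equivalent: clamp to 0..255, store as b,g,r,0 */
;       out[0] = b < 0 ? 0 : b > 255 ? 255 : b;
;       out[1] = g < 0 ? 0 : g > 255 ? 255 : g;
;       out[2] = r < 0 ? 0 : r > 255 ? 255 : r;
;       out[3] = 0;                           /* alpha is written as 0 */
;   }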
SECTION .data align=16

Const16		dw 16
		dw 16
		dw 16
		dw 16
		dw 16
		dw 16
		dw 16
		dw 16

Const128	dw 128
		dw 128
		dw 128
		dw 128
		dw 128
		dw 128
		dw 128
		dw 128

; pshufb masks: pick the u, v or y bytes out of packed yuyv and widen them to
; 16 bit words (a mask byte of 0x80 zeroes the destination byte)
UMask		db 0x01
		db 0x80
		db 0x01
		db 0x80
		db 0x05
		db 0x80
		db 0x05
		db 0x80
		db 0x09
		db 0x80
		db 0x09
		db 0x80
		db 0x0d
		db 0x80
		db 0x0d
		db 0x80

VMask		db 0x03
		db 0x80
		db 0x03
		db 0x80
		db 0x07
		db 0x80
		db 0x07
		db 0x80
		db 0x0b
		db 0x80
		db 0x0b
		db 0x80
		db 0x0f
		db 0x80
		db 0x0f
		db 0x80

YMask		db 0x00
		db 0x80
		db 0x02
		db 0x80
		db 0x04
		db 0x80
		db 0x06
		db 0x80
		db 0x08
		db 0x80
		db 0x0a
		db 0x80
		db 0x0c
		db 0x80
		db 0x0e
		db 0x80


; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
width		equ ebp+16
toPtr		equ ebp+12
fromPtr		equ ebp+8

; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
width1		equ ebp+24
toPtr1		equ ebp+20
fromVPtr	equ ebp+16
fromUPtr	equ ebp+12
fromYPtr	equ ebp+8

SECTION .text align=16

cglobal Convert_YUV422_RGBA32_SSSE3
; save registers and set up the stack frame
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx

	mov esi, [fromPtr]
	mov edi, [toPtr]
	mov ecx, [width]
; loop width / 8 times
	shr ecx, 3
	test ecx, ecx
	jng ENDLOOP
REPEATLOOP:				; loop over width / 8
; YUV422 packed inputer
	movdqa xmm0, [esi]		; should have yuyv yuyv yuyv yuyv
	pshufd xmm1, xmm0, 0xE4		; copy to xmm1
	movdqa xmm2, xmm0		; copy to xmm2
; extract the y values and widen each to 16 bit, giving y0y0
	pshufb xmm0, [YMask]
; extract u and duplicate so each u in yuyv becomes u0u0
	pshufb xmm1, [UMask]
; extract v and duplicate so each v in yuyv becomes v0v0
	pshufb xmm2, [VMask]

yuv2rgbsse2

rgba32sse2output

; endloop: advance pointers and count down
	add edi, 32
	add esi, 16
	sub ecx, 1			; apparently sub is better than dec
	jnz REPEATLOOP
ENDLOOP:
; Cleanup
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret
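; A hypothetical C caller for the packed converter above, only to illustrate the
; assumptions this file makes: the yuyv source must be 16 byte aligned (movdqa),
; the destination must be 16 byte aligned for the movntdq stores, and only
; width / 8 * 8 pixels are converted per call. The wrapper name and stride
; parameters are illustrative, not part of this file.
;
;   #include <stdint.h>
;
;   extern void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width);
;
;   /* convert a whole frame, one row per call; strides are in bytes */
;   static void convert_frame_yuv422(const uint8_t *yuyv, uint8_t *dst,
;                                    int width, int height,
;                                    int srcStride, int dstStride)
;   {
;       for (int row = 0; row < height; row++) {
;           Convert_YUV422_RGBA32_SSSE3((void *)(yuyv + row * srcStride),
;                                       dst + row * dstStride, width);
;       }
;   }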
cglobal Convert_YUV420P_RGBA32_SSSE3
; save registers and set up the stack frame
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx
	push eax
	push ebx

	mov esi, [fromYPtr]
	mov eax, [fromUPtr]
	mov ebx, [fromVPtr]
	mov edi, [toPtr1]
	mov ecx, [width1]
; loop width / 8 times
	shr ecx, 3
	test ecx, ecx
	jng ENDLOOP1
REPEATLOOP1:				; loop over width / 8
; YUV420 planar inputer
	movq xmm0, [esi]		; fetch 8 y values (8 bit)  yyyyyyyy00000000
	movd xmm1, [eax]		; fetch 4 u values (8 bit)  uuuu000000000000
	movd xmm2, [ebx]		; fetch 4 v values (8 bit)  vvvv000000000000

; extract y
	pxor xmm7, xmm7			; 00000000000000000000000000000000
	punpcklbw xmm0, xmm7		; interleave xmm7 into xmm0  y0y0y0y0y0y0y0y0
; extract u and duplicate so each becomes u0u0
	punpcklbw xmm1, xmm7		; interleave xmm7 into xmm1  u0u0u0u000000000
	punpcklwd xmm1, xmm7		; interleave again           u000u000u000u000
	pshuflw xmm1, xmm1, 0xA0	; copy u values
	pshufhw xmm1, xmm1, 0xA0	; to get u0u0
; extract v and duplicate so each becomes v0v0
	punpcklbw xmm2, xmm7		; interleave xmm7 into xmm2  v0v0v0v000000000
	punpcklwd xmm2, xmm7		; interleave again           v000v000v000v000
	pshuflw xmm2, xmm2, 0xA0	; copy v values
	pshufhw xmm2, xmm2, 0xA0	; to get v0v0

yuv2rgbsse2

rgba32sse2output

; endloop: advance pointers and count down
	add edi, 32
	add esi, 8
	add eax, 4
	add ebx, 4
	sub ecx, 1			; apparently sub is better than dec
	jnz REPEATLOOP1
ENDLOOP1:
; Cleanup
	pop ebx
	pop eax
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

SECTION .note.GNU-stack noalloc noexec nowrite progbits
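; A hypothetical caller sketch for the planar converter, again only to document
; what the loop above assumes: one call converts a single row of
; width / 8 * 8 pixels, the destination must be 16 byte aligned for the movntdq
; stores, and the u and v pointers advance half as fast as y, so for 4:2:0 data
; the caller would normally pass the same chroma row for two consecutive luma
; rows. The wrapper name and stride parameters are illustrative.
;
;   #include <stdint.h>
;
;   extern void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr,
;                                            void *fromVPtr, void *toPtr, int width);
;
;   static void convert_frame_yuv420p(const uint8_t *y, const uint8_t *u,
;                                     const uint8_t *v, uint8_t *dst,
;                                     int width, int height,
;                                     int yStride, int cStride, int dstStride)
;   {
;       for (int row = 0; row < height; row++) {
;           Convert_YUV420P_RGBA32_SSSE3((void *)(y + row * yStride),
;                                        (void *)(u + (row / 2) * cStride),
;                                        (void *)(v + (row / 2) * cStride),
;                                        dst + row * dstStride, width);
;       }
;   }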