;
; Copyright (C) 2009-2010 David McPaul
;
; All rights reserved. Distributed under the terms of the MIT License.
;

; A rather unoptimised set of ssse3 yuv to rgb converters
; does 8 pixels per loop

; inputer:
; reads 128 bits of yuv 8 bit data and puts
; the y values converted to 16 bit in xmm0
; the u values converted to 16 bit and duplicated into xmm1
; the v values converted to 16 bit and duplicated into xmm2

; conversion:
; does the yuv to rgb conversion using 16 bit fixed point and the
; results are placed into the following registers as 8 bit clamped values
; r values in xmm3
; g values in xmm4
; b values in xmm5

; outputer:
; writes out the rgba pixels as 8 bit values with 0 for alpha

; xmm6 used for scratch
; xmm7 used for scratch
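;
; roughly, each pixel goes through the following scalar calculation
; (a sketch of the per-pixel maths only; the code below does the same thing
; for 8 pixels at once using saturating 16 bit sse2 arithmetic):
;   y = y - 16;  u = u - 128;  v = v - 128;
;   r = clamp(y + v + (v >> 2) + (v >> 3) + (v >> 5));
;   g = clamp(y - (u >> 2) - (u >> 4) - (u >> 5)
;               - (v >> 1) - (v >> 3) - (v >> 4) - (v >> 5));
;   b = clamp(y + u + (u >> 1) + (u >> 2) + (u >> 6));
; where clamp() limits the result to 0..255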

%macro  cglobal 1
	global  _%1
	%define %1 _%1
	align 16
%1:
%endmacro
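; (the macro above exports each routine with a leading underscore while letting
; the rest of this file refer to it by its plain name)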

; conversion code
%macro yuv2rgbsse2 0
; u = u - 128
; v = v - 128
; r = y + v + v >> 2 + v >> 3 + v >> 5
; g = y - (u >> 2 + u >> 4 + u >> 5) - (v >> 1 + v >> 3 + v >> 4 + v >> 5)
; b = y + u + u >> 1 + u >> 2 + u >> 6
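; the shift sums approximate the usual BT.601-style coefficients:
;   r: 1 + 1/4 + 1/8 + 1/32     = 1.40625  (~1.402 * v)
;   g: 1/4 + 1/16 + 1/32        = 0.34375  (~0.344 * u)
;      1/2 + 1/8 + 1/16 + 1/32  = 0.71875  (~0.714 * v)
;   b: 1 + 1/2 + 1/4 + 1/64     = 1.765625 (~1.772 * u)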
; subtract 16 from y
	movdqa xmm7, [Const16]			; loads a constant using data cache (slower on first fetch but then cached)
	psubsw xmm0,xmm7				; y = y - 16
; subtract 128 from u and v
	movdqa xmm7, [Const128]			; loads a constant using data cache (slower on first fetch but then cached)
	psubsw xmm1,xmm7				; u = u - 128
	psubsw xmm2,xmm7				; v = v - 128
; load r,b with y
	movdqa xmm3,xmm0				; r = y
	pshufd xmm5,xmm0, 0xE4			; b = y

; r = y + v + v >> 2 + v >> 3 + v >> 5
	paddsw xmm3, xmm2				; add v to r
	movdqa xmm7, xmm1				; move u to scratch
	pshufd xmm6, xmm2, 0xE4			; move v to scratch

	psraw  xmm6,2					; divide v by 4
	paddsw xmm3, xmm6				; and add to r
	psraw  xmm6,1					; divide v by 2
	paddsw xmm3, xmm6				; and add to r
	psraw  xmm6,2					; divide v by 4
	paddsw xmm3, xmm6				; and add to r

; b = y + u + u >> 1 + u >> 2 + u >> 6
	paddsw xmm5, xmm1				; add u to b
	psraw  xmm7,1					; divide u by 2
	paddsw xmm5, xmm7				; and add to b
	psraw  xmm7,1					; divide u by 2
	paddsw xmm5, xmm7				; and add to b
	psraw  xmm7,4					; divide u by 16 (giving u/64)
	paddsw xmm5, xmm7				; and add to b

; g = y - u >> 2 - u >> 4 - u >> 5 - v >> 1 - v >> 3 - v >> 4 - v >> 5
	movdqa xmm7,xmm2				; move v to scratch
	pshufd xmm6,xmm1, 0xE4			; move u to scratch
	movdqa xmm4,xmm0				; g = y

	psraw  xmm6,2					; divide u by 4
	psubsw xmm4,xmm6				; subtract from g
	psraw  xmm6,2					; divide u by 4
	psubsw xmm4,xmm6				; subtract from g
	psraw  xmm6,1					; divide u by 2
	psubsw xmm4,xmm6				; subtract from g

	psraw  xmm7,1					; divide v by 2
	psubsw xmm4,xmm7				; subtract from g
	psraw  xmm7,2					; divide v by 4
	psubsw xmm4,xmm7				; subtract from g
	psraw  xmm7,1					; divide v by 2
	psubsw xmm4,xmm7				; subtract from g
	psraw  xmm7,1					; divide v by 2
	psubsw xmm4,xmm7				; subtract from g
%endmacro

; outputer
%macro rgba32sse2output 0
; clamp values
	pxor xmm7,xmm7
	packuswb xmm3,xmm7				; clamp to 0,255 and pack R to 8 bit per pixel
	packuswb xmm4,xmm7				; clamp to 0,255 and pack G to 8 bit per pixel
	packuswb xmm5,xmm7				; clamp to 0,255 and pack B to 8 bit per pixel
; convert to bgra32 packed
	punpcklbw xmm5,xmm4				; bgbgbgbgbgbgbgbg
	movdqa xmm0, xmm5				; save bg values
	punpcklbw xmm3,xmm7				; r0r0r0r0r0r0r0r0
	punpcklwd xmm5,xmm3				; lower half bgr0bgr0bgr0bgr0
	punpckhwd xmm0,xmm3				; upper half bgr0bgr0bgr0bgr0
; write to output ptr
	movntdq [edi], xmm5				; output first 4 pixels bypassing cache
	movntdq [edi+16], xmm0			; output second 4 pixels bypassing cache
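; note: movntdq needs a 16 byte aligned destination, so the output pointer in
; edi must stay 16 byte aligned (it advances by 32 bytes per loop)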
%endmacro

SECTION .data align=16

Const16		times 8 dw 16

Const128	times 8 dw 128

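; pshufb shuffle masks for the packed yuyv inputer below: each mask byte
; selects a source byte by index and 0x80 (top bit set) zeroes the destination
; byte. YMask picks the eight y bytes (0,2,4,...,14) into the low byte of each
; word, while UMask and VMask pick each u/v byte twice so one chroma sample
; covers a pair of pixels.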
UMask	db	0x01, 0x80, 0x01, 0x80, 0x05, 0x80, 0x05, 0x80
	db	0x09, 0x80, 0x09, 0x80, 0x0d, 0x80, 0x0d, 0x80

VMask	db	0x03, 0x80, 0x03, 0x80, 0x07, 0x80, 0x07, 0x80
	db	0x0b, 0x80, 0x0b, 0x80, 0x0f, 0x80, 0x0f, 0x80

YMask	db	0x00, 0x80, 0x02, 0x80, 0x04, 0x80, 0x06, 0x80
	db	0x08, 0x80, 0x0a, 0x80, 0x0c, 0x80, 0x0e, 0x80

; void Convert_YUV422_RGBA32_SSSE3(void *fromPtr, void *toPtr, int width)
width    equ	ebp+16
toPtr    equ	ebp+12
fromPtr  equ	ebp+8
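; notes: the loop below converts 8 pixels per iteration, so only the first
; width & ~7 pixels of a row are converted; fromPtr is read with movdqa and
; toPtr is written with movntdq, so both must be 16 byte aligned; 4 bytes are
; written per pixel. a typical per-row call (illustrative names only) would be
;   Convert_YUV422_RGBA32_SSSE3(srcRow, dstRow, rowWidthInPixels);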

; void Convert_YUV420P_RGBA32_SSSE3(void *fromYPtr, void *fromUPtr, void *fromVPtr, void *toPtr, int width)
width1    equ	ebp+24
toPtr1    equ	ebp+20
fromVPtr  equ	ebp+16
fromUPtr  equ	ebp+12
fromYPtr  equ	ebp+8
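; notes: the planar loop below reads 8 y, 4 u and 4 v samples per iteration and
; converts a single row; for 4:2:0 sources the caller presumably passes the
; same u/v row for two successive y rows. only the destination pointer needs
; 16 byte alignment (movntdq); the y/u/v loads use movq/movd and can be
; unaligned.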

SECTION .text align=16

cglobal Convert_YUV422_RGBA32_SSSE3
; save registers
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx

	mov esi, [fromPtr]
	mov edi, [toPtr]
	mov ecx, [width]
; loop width / 8 times
	shr ecx,3
	test ecx,ecx
	jng ENDLOOP
REPEATLOOP:							; loop over width / 8
; YUV422 packed inputer
	movdqa xmm0, [esi]				; should have yuyv yuyv yuyv yuyv
	pshufd xmm1, xmm0, 0xE4			; copy to xmm1
	movdqa xmm2, xmm0				; copy to xmm2
; extract the y values, giving y0y0 in each word
	pshufb xmm0, [YMask]
; extract u and duplicate so each u in yuyv becomes u0u0
	pshufb xmm1, [UMask]
; extract v and duplicate so each v in yuyv becomes v0v0
	pshufb xmm2, [VMask]

yuv2rgbsse2

rgba32sse2output

; endloop
	add edi,32
	add esi,16
	sub ecx, 1				; apparently sub is better than dec
	jnz REPEATLOOP
ENDLOOP:
; Cleanup
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

cglobal Convert_YUV420P_RGBA32_SSSE3
; save registers
	push ebp
	mov ebp, esp
	push edi
	push esi
	push ecx
	push eax
	push ebx

	mov esi, [fromYPtr]
	mov eax, [fromUPtr]
	mov ebx, [fromVPtr]
	mov edi, [toPtr1]
	mov ecx, [width1]
; loop width / 8 times
	shr ecx,3
	test ecx,ecx
	jng ENDLOOP1
REPEATLOOP1:						; loop over width / 8
; YUV420 Planar inputer
	movq xmm0, [esi]				; fetch 8 y values (8 bit) yyyyyyyy00000000
	movd xmm1, [eax]				; fetch 4 u values (8 bit) uuuu000000000000
	movd xmm2, [ebx]				; fetch 4 v values (8 bit) vvvv000000000000

; extract y
	pxor xmm7,xmm7					; 00000000000000000000000000000000
	punpcklbw xmm0,xmm7				; interleave xmm7 into xmm0 y0y0y0y0y0y0y0y0
; extract u and duplicate so each becomes u0u0
	punpcklbw xmm1,xmm7				; interleave xmm7 into xmm1 u0u0u0u000000000
	punpcklwd xmm1,xmm7				; interleave again u000u000u000u000
	pshuflw xmm1,xmm1, 0xA0			; copy u values
	pshufhw xmm1,xmm1, 0xA0			; to get u0u0
; extract v
	punpcklbw xmm2,xmm7				; interleave xmm7 into xmm2 v0v0v0v000000000
	punpcklwd xmm2,xmm7				; interleave again v000v000v000v000
	pshuflw xmm2,xmm2, 0xA0			; copy v values
	pshufhw xmm2,xmm2, 0xA0			; to get v0v0
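; xmm0 now holds y0..y7 and xmm1/xmm2 hold u0,u0,u1,u1,u2,u2,u3,u3 and
; v0,v0,v1,v1,v2,v2,v3,v3 as 16 bit words, the layout yuv2rgbsse2 expects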

yuv2rgbsse2

rgba32sse2output

; endloop
	add edi,32
	add esi,8
	add eax,4
	add ebx,4
	sub ecx, 1				; apparently sub is better than dec
	jnz REPEATLOOP1
ENDLOOP1:
; Cleanup
	pop ebx
	pop eax
	pop ecx
	pop esi
	pop edi
	mov esp, ebp
	pop ebp
	ret

SECTION .note.GNU-stack noalloc noexec nowrite progbits
