xref: /haiku/src/add-ons/media/media-add-ons/radeon/yuv_converter.h (revision e81a954787e50e56a7f06f72705b7859b6ab06d1)
1 /*
2 	Copyright (c) 2004, Thomas Kurschel
3 
4 
5 	Part of Radeon In add-on
6 
7 	YUV converter
8 
	The Rage Theatre always provides YUV422 data and, starting with Radeon, ATI
	moved the colour converter from the 2D to the 3D unit, so if you need another
	format you must convert it manually, unless you get 3D working (which is,
	starting with r300, hopeless anyway as there is no spec).
13 
14 	Colour temperature is according to BT.601; for YCbCr format see also GraphicsDefs.h
15 
	This header is included from VideoIn.cpp, with one of several defines set to
	select the desired output format (RGB15, RGB16 or RGB32).
18 
19 	Things to improve:
20 	- colour components should be interpolated for odd pixels
21 */
22 
23 	static const int8 c_offs[8] =
24 		{ 128, 128, 128, 128, 128, 128, 128, 128 };
25 
26 	static const int16 y_offs[4] =
27 		{ 16*128, 16*128, 16*128, 16*128 };
28 
29 	static const uint16 masks[2][4] = {
30 		// high byte mask
31 		{ 0xff00, 0xff00, 0xff00, 0xff00 },
32 		// low byte mask
33 		{ 0x00ff, 0x00ff, 0x00ff, 0x00ff },
34 	};
35 
36 	static const int16 scale[5][4] = {
37 		// Y pre-scale
38 		{ (int16)(1.1678 * 512), (int16)(1.1678 * 512), (int16)(1.1678 * 512), (int16)(1.1678 * 512) },
39 		// CbG CrG CbG CrG
40 		{ (int16)(-0.3929 * 256), (int16)(-0.8154 * 256), (int16)(-0.3929 * 256), (int16)(-0.8154 * 256) },
41 		// CbB CrR CbB CrR
42 		{ (int16)(2.0232 * 256), (int16)(1.6007 * 256), (int16)(2.0232 * 256), (int16)(1.6007 * 256) },
43 	};
44 
45 	static const int8 masks_8bit[2][8] = {
46 		// r/b 16 bit mask and r/g/b 15 bit mask
47 		{ 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8, 0xf8 },
48 		// g 16 bit mask
49 		{ 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc, 0xfc },
50 	};
51 
52 	asm volatile(
53 	"2:\n"
54 		"pxor		%%mm7,%%mm7\n"
55 
56 	"1:\n"
57 		"movq		(%0),%%mm0\n"		// mm0 = Cr2'Y3' Cb2'Y2' Cr0'Y1' Cb0'Y0'
58 		"movq		%%mm0,%%mm1\n"		// mm1 = Cr2'Y3' Cb2'Y2' Cr0'Y1' Cb0'Y0'
59 
60 		// Y is in 16..235
61 		// we need to substract 16 and scale to full range
62 		// as standard MMX has a _signed_ integer multiply only, the highest bit must
63 		// be zero before scaling, i.e. we use signed format 9.7
64 		"pand		8+%4,%%mm0\n"		// mm2 =     Y3'     Y2'     Y1'     Y0'
65 		"psllw		$7,%%mm0\n"
66 		"psubusw	%3,%%mm0\n"
67 		"pmulhw		%5,%%mm0\n"			// mm0 =      Y3      Y2      Y1      Y0
68 
69 		// Cb and Cr is biased; compensate that
70 		"psubb		%2,%%mm1\n"			// mm1 = Cr2 xxx Cb2 xxx Cr0 xxx Cb0 xxx
71 		"pand		%4,%%mm1\n"			// mm1 = Cr2     Cb2     Cr0     Cb0
72 
73 		// transform Cb and Cr to green component
74 		"movq		%%mm1,%%mm2\n"
75 		"pmaddwd	8+%5,%%mm1\n"		// mm1 =  CbCrG2 xxxxxxx  CbCrG0 xxxxxxx
76 		"psrad		$16,%%mm1\n"		// mm1 =          CbCrG2          CbCrG0
77 		"packssdw	%%mm1,%%mm1\n"		// mm1 =  CbCrG2  CbCrG0  CbCrG2  CbCrG0
78 		"punpcklwd	%%mm1,%%mm1\n"		// mm1 =  CbCrG2  CbCrG2  CbCrG0  CbCrG0
79 
80 		// transform Cb to blue and Cr to red component
81 		"pmulhw		16+%5,%%mm2\n"		// mm2 =    CrR2    CbB2    CrR0    CbB0
82 
83 		// nasty shuffling to separate and duplicate components
84 		"movq		%%mm2,%%mm3\n"
85 		"punpcklwd	%%mm3,%%mm3\n"		// mm3 =    CrR0    CrR0    CbB0    CbB0
86 		"punpckhwd	%%mm2,%%mm2\n"		// mm2 =    CrR2    CrR2    CbB2    CbB2
87 
88 		"movq		%%mm3,%%mm4\n"
89 		"punpckldq	%%mm2,%%mm3\n"		// mm3 =    CbB2    CbB2    CbB0    CbB0
90 		"punpckhdq	%%mm2,%%mm4\n"		// mm4 =    CrR2    CrR2    CrR0    CrR0
91 
92 		// add Y to get final RGB
93 		"paddsw		%%mm0,%%mm1\n"		// mm1 =      G3      G2      G1      G0
94 		"paddsw		%%mm0,%%mm3\n"		// mm3 =      B3      B2      B1      B0
95 		"paddsw		%%mm0,%%mm4\n"		// mm4 =      R3      R2      R1      R0
96 
97 		// now, RBG can be converted to 8 bits each
98 		"packuswb	%%mm0,%%mm1\n"		// mm1 =  Y3  Y2  Y1  Y0  G3  G2  G1  G0
99 		"packuswb	%%mm4,%%mm3\n"		// mm3 =  R3  R2  R1  R0  B3  B2  B1  B0
100 
101 #ifdef RGB32
102 		// convertion to RGB32
103 		"movq		%%mm3,%%mm2\n"
104 		"punpckhbw	%%mm1,%%mm3\n"		// mm3 =  Y3  R3  Y2  R2  Y1  R1  Y0  R0
105 		"punpcklbw	%%mm1,%%mm2\n"		// mm2 =  G3  B3  G2  B2  G1  B1  G0  B0
106 
107 		"movq		%%mm2,%%mm1\n"
108 		"punpcklwd	%%mm3,%%mm2\n"
109 		"movq		%%mm2,0x00(%1)\n"	// dst =  Y1  R1  G1  B1  Y0  R0  G0  B0
110 
111 		"punpckhwd	%%mm3,%%mm1\n"
112 		"movq		%%mm1,0x08(%1)\n"	// dst =  Y3  R3  G3  B3  Y2  R2  G2  B2
113 
114 		"addl		$0x08,%0\n"			// source += 8
115 		"addl		$0x10,%1\n"			// destination += 16
116 		"subl		$0x10,%7\n"			// next pixels
117 #endif
118 
119 #ifdef RGB16
120 		// convertion to RGB16
121 		// things would be much easier if Intel had added a RGB32->RGB16 instruction
122 		"pand		%6,%%mm3\n"			//  mm3 -  R3  R2  R1  R0  B3  B2  B1  B0 (masked)
123 		"pand		8+%6,%%mm1\n"		//  mm1 -  Y3  Y2  Y1  Y0  G3  G2  G1  G0 (masked)
124 
125 		"punpcklbw	%%mm7,%%mm1\n"		//  mm1 -      G3      G2      G1      G0
126 		"movq		%%mm7,%%mm2\n"
127 		"punpckhbw 	%%mm3,%%mm2\n"		//  mm2 -  R3      R2      R1      R0
128 		"punpcklbw 	%%mm7,%%mm3\n"		//  mm3 -      B3      B2      B1      B0
129 
130 		"psllw		$3,%%mm1\n"			//  mm1 -    G3      G2      G1      G0
131 		"psrlw		$3,%%mm3\n"			//  mm3 -      B3      B2      B1      B0
132 
133 		"por		%%mm2,%%mm1\n"
134 		"por		%%mm3,%%mm1\n"
135 		"movq		%%mm1,(%1)\n"
136 
137 		"addl		$0x08,%0\n"			// source += 8
138 		"addl		$0x08,%1\n"			// destination += 8
139 		"subl		$0x08,%7\n"			// next pixels
140 #endif
141 
142 #ifdef RGB15
143 		// convertion to RGB15
144 		// same problem as before
145 		"pand		%6,%%mm3\n"			//  mm3 -  R3  R2  R1  R0  B3  B2  B1  B0 (masked)
146 		"pand		%6,%%mm1\n"			//  mm1 -  Y3  Y2  Y1  Y0  G3  G2  G1  G0 (masked)
147 
148 		"punpcklbw	%%mm7,%%mm1\n"		//  mm1 -      G3      G2      G1      G0
149 		"movq		%%mm7,%%mm2\n"
150 		"punpckhbw 	%%mm3,%%mm2\n"		//  mm2 -  R3      R2      R1      R0
151 		"punpcklbw 	%%mm7,%%mm3\n"		//  mm3 -      B3      B2      B1      B0
152 
153 		"psllw		$2,%%mm1\n"			//  mm1 -    G3      G2      G1      G0
154 		"psrlw		$1,%%mm2\n"			//  mm2 -  R3      R2      R1      R0
155 		"psrlw		$3,%%mm3\n"			//  mm3 -      B3      B2      B1      B0
156 
157 		"por		%%mm2,%%mm1\n"
158 		"por		%%mm3,%%mm1\n"
159 		"movq		%%mm1,(%1)\n"
160 
161 		"addl		$0x08,%0\n"			// source += 8
162 		"addl		$0x08,%1\n"			// destination += 8
163 		"subl		$0x08,%7\n"			// next pixels
164 #endif
165 
166 		// next
167 		"jg			1b\n"
168 
169 		"movl		%9,%7\n"
170 		"subl		%7,%8\n"
171 
172 		"jg			2b\n"
173 		"emms\n"
174 		:
175 		: "a" (convert_buffer), "d" (bits),
176 		  "g" (c_offs), "g" (y_offs), "g" (masks), "g" (scale), "g" (masks_8bit),
177 		  "c" (bytesPerRow), "S" (bitsLength), "D" (bytesPerRow));
178