xref: /haiku/src/add-ons/accelerants/radeon/overlay.c (revision 1acbe440b8dd798953bec31d18ee589aa3f71b73)
1 /*
2 	Copyright (c) 2002-2004, Thomas Kurschel
3 
4 
5 	Part of Radeon accelerant
6 
7 	Hardware access routines for overlays
8 */
9 
10 #include "GlobalData.h"
11 #include "radeon_interface.h"
12 #include "mmio.h"
13 #include "overlay_regs.h"
14 #include "pll_regs.h"
15 #include "capture_regs.h"
16 #include "utils.h"
17 #include "pll_access.h"
18 #include <math.h>
19 #include <string.h>
20 #include "CP.h"
21 
22 
23 void Radeon_TempHideOverlay( accelerator_info *ai );
24 
25 // standard (linear) gamma
26 static struct {
27     uint16 reg;
28     bool r200_or_above;
29     uint32 slope;
30     uint32 offset;
31 } std_gamma[] = {
32     { RADEON_OV0_GAMMA_0_F, false, 0x100, 0x0000 },
33     { RADEON_OV0_GAMMA_10_1F, false, 0x100, 0x0020 },
34     { RADEON_OV0_GAMMA_20_3F, false, 0x100, 0x0040 },
35     { RADEON_OV0_GAMMA_40_7F, false, 0x100, 0x0080 },
36     { RADEON_OV0_GAMMA_80_BF, true, 0x100, 0x0100 },
37     { RADEON_OV0_GAMMA_C0_FF, true, 0x100, 0x0100 },
38     { RADEON_OV0_GAMMA_100_13F, true, 0x100, 0x0200 },
39     { RADEON_OV0_GAMMA_140_17F, true, 0x100, 0x0200 },
40     { RADEON_OV0_GAMMA_180_1BF, true, 0x100, 0x0300 },
41     { RADEON_OV0_GAMMA_1C0_1FF, true, 0x100, 0x0300 },
42     { RADEON_OV0_GAMMA_200_23F, true, 0x100, 0x0400 },
43     { RADEON_OV0_GAMMA_240_27F, true, 0x100, 0x0400 },
44     { RADEON_OV0_GAMMA_280_2BF, true, 0x100, 0x0500 },
45     { RADEON_OV0_GAMMA_2C0_2FF, true, 0x100, 0x0500 },
46     { RADEON_OV0_GAMMA_300_33F, true, 0x100, 0x0600 },
47     { RADEON_OV0_GAMMA_340_37F, true, 0x100, 0x0600 },
48     { RADEON_OV0_GAMMA_380_3BF, false, 0x100, 0x0700 },
49     { RADEON_OV0_GAMMA_3C0_3FF, false, 0x100, 0x0700 }
50 };
51 
52 
53 // setup overlay unit before first use
54 void Radeon_InitOverlay(
55 	accelerator_info *ai, int crtc_idx )
56 {
57 	vuint8 *regs = ai->regs;
58 	shared_info *si = ai->si;
59 	uint i;
60 	uint32 ecp_div;
61 
62 	SHOW_FLOW0( 0, "" );
63 
64 	// make sure we really write this value as the "toggle" bit
65 	// contained in it (which is zero initially) is edge-sensitive!
66 	// for capturing, we need to select "software" video port
67 	si->overlay_mgr.auto_flip_reg = RADEON_OV0_VID_PORT_SELECT_SOFTWARE;
68 
69 	OUTREG( regs, RADEON_OV0_SCALE_CNTL, RADEON_SCALER_SOFT_RESET );
70 	OUTREG( regs, RADEON_OV0_AUTO_FLIP_CNTRL, si->overlay_mgr.auto_flip_reg );
71 	OUTREG( regs, RADEON_OV0_FILTER_CNTL, 			// use fixed filter coefficients
72 		RADEON_OV0_HC_COEF_ON_HORZ_Y |
73 		RADEON_OV0_HC_COEF_ON_HORZ_UV |
74 		RADEON_OV0_HC_COEF_ON_VERT_Y |
75 		RADEON_OV0_HC_COEF_ON_VERT_UV );
76 	OUTREG( regs, RADEON_OV0_KEY_CNTL, RADEON_GRAPHIC_KEY_FN_EQ |
77 		RADEON_VIDEO_KEY_FN_FALSE |
78 		RADEON_CMP_MIX_OR );
79 	OUTREG( regs, RADEON_OV0_TEST, 0 );
80 //	OUTREG( regs, RADEON_FCP_CNTL, RADEON_FCP_CNTL_GND );	// disable capture clock
81 //	OUTREG( regs, RADEON_CAP0_TRIG_CNTL, 0 );				// disable capturing
82 	OUTREG( regs, RADEON_OV0_REG_LOAD_CNTL, 0 );
83 	// tell deinterlacer to always show recent field
84 	OUTREG( regs, RADEON_OV0_DEINTERLACE_PATTERN,
85 		0xaaaaa | (9 << RADEON_OV0_DEINT_PAT_LEN_M1_SHIFT) );
86 
87 	// set gamma
88 	for( i = 0; i < sizeof( std_gamma ) / sizeof( std_gamma[0] ); ++i ) {
89 		if( !std_gamma[i].r200_or_above || si->asic >= rt_r200 ) {
90 			OUTREG( regs, std_gamma[i].reg,
91 				(std_gamma[i].slope << 16) | std_gamma[i].offset );
92 		}
93 	}
94 
95 	// overlay unit can only handle up to 175 MHz, if pixel clock is higher,
96 	// only every second pixel is handled
97 	if( si->crtc[crtc_idx].mode.timing.pixel_clock < 175000 )
98 		ecp_div = 0;
99 	else
100 		ecp_div = 1;
101 
102 	Radeon_OUTPLLP( regs, si->asic, RADEON_VCLK_ECP_CNTL,
103 		ecp_div << RADEON_ECP_DIV_SHIFT, ~RADEON_ECP_DIV_MASK );
104 
105 	// Force the overlay clock on for integrated chips
106 	if ((si->asic == rt_rs100) ||
107 	(si->asic == rt_rs200) ||
108 	(si->asic == rt_rs300)) {
109 		Radeon_OUTPLL( regs, si->asic, RADEON_VCLK_ECP_CNTL,
110         	(Radeon_INPLL( regs, si->asic, RADEON_VCLK_ECP_CNTL) | (1<<18)));
111     }
112 
113 	si->active_overlay.crtc_idx = si->pending_overlay.crtc_idx;
114 
115 	// invalidate active colour space
116 	si->active_overlay.ob.space = -1;
117 
118 	// invalidate position/scaling
119 	si->active_overlay.ob.width = -1;
120 }
121 
// Colour space transformation matrix (overlay colour space -> RGB).
// Field order matters: the tables below are initialised positionally.
typedef struct space_transform
{
	float RefLuma;	// scaling of luma to use full RGB range

	// contributions to red
	float RefRCb;	// b/u -> r
	float RefRY;	// g/y -> r
	float RefRCr;	// r/v -> r

	// contributions to green
	float RefGCb;	// b/u -> g
	float RefGY;	// g/y -> g
	float RefGCr;	// r/v -> g

	// contributions to blue
	float RefBCb;	// b/u -> b
	float RefBY;	// g/y -> b
	float RefBCr;	// r/v -> b
} space_transform;
136 
137 
138 // Parameters for ITU-R BT.601 and ITU-R BT.709 colour spaces
139 space_transform trans_yuv[2] =
140 {
141     { 1.1678, 0.0, 1, 1.6007, -0.3929, 1, -0.8154, 2.0232, 1, 0.0 }, /* BT.601 */
142     { 1.1678, 0.0, 1, 1.7980, -0.2139, 1, -0.5345, 2.1186, 1, 0.0 }  /* BT.709 */
143 };
144 
145 
146 // RGB is a pass through
147 space_transform trans_rgb =
148 	{ 1, 0, 0, 1, 0, 1, 0, 1, 0, 0 };
149 
150 
151 // set overlay colour space transformation matrix
152 static void Radeon_SetTransform(
153 	accelerator_info *ai,
154 	float	    bright,
155 	float	    cont,
156 	float	    sat,
157 	float	    hue,
158 	float	    red_intensity,
159 	float	    green_intensity,
160 	float	    blue_intensity,
161 	uint	    ref)
162 {
163 	vuint8 *regs = ai->regs;
164 	shared_info *si = ai->si;
165 	float	    OvHueSin, OvHueCos;
166 	float	    CAdjOff;
167 	float		CAdjRY, CAdjGY, CAdjBY;
168 	float	    CAdjRCb, CAdjRCr;
169 	float	    CAdjGCb, CAdjGCr;
170 	float	    CAdjBCb, CAdjBCr;
171 	float	    RedAdj,GreenAdj,BlueAdj;
172 	float	    OvROff, OvGOff, OvBOff;
173 	float		OvRY, OvGY, OvBY;
174 	float	    OvRCb, OvRCr;
175 	float	    OvGCb, OvGCr;
176 	float	    OvBCb, OvBCr;
177 	float	    Loff;
178 	float	    Coff;
179 
180 	uint32	    dwOvROff, dwOvGOff, dwOvBOff;
181 	uint32		dwOvRY, dwOvGY, dwOvBY;
182 	uint32	    dwOvRCb, dwOvRCr;
183 	uint32	    dwOvGCb, dwOvGCr;
184 	uint32	    dwOvBCb, dwOvBCr;
185 
186 	space_transform	*trans;
187 
188 	SHOW_FLOW0( 0, "" );
189 
190 	// get proper conversion formula
191 	switch( si->pending_overlay.ob.space ) {
192 	case B_YCbCr422:
193 	case B_YUV12:
194 		Loff = 16 * 4;		// internal representation is 10 Bits
195 		Coff = 128 * 4;
196 
197 		if (ref >= 2)
198 			ref = 0;
199 
200 		trans = &trans_yuv[ref];
201 		break;
202 
203 	case B_RGB15:
204 	case B_RGB16:
205 	case B_RGB32:
206 	default:
207 		Loff = 0;
208 		Coff = 0;
209 		trans = &trans_rgb;
210 	}
211 
212 	OvHueSin = sin(hue);
213 	OvHueCos = cos(hue);
214 
215 	// get matrix values to convert overlay colour space to RGB
216 	// applying colour adjustment, saturation and luma scaling
217 	// (saturation doesn't work with RGB input, perhaps it did with some
218 	//  maths; this is left to the reader :)
219 	CAdjRY = cont * trans->RefLuma * trans->RefRY;
220 	CAdjGY = cont * trans->RefLuma * trans->RefGY;
221 	CAdjBY = cont * trans->RefLuma * trans->RefBY;
222 
223 	CAdjRCb = sat * -OvHueSin * trans->RefRCr;
224 	CAdjRCr = sat * OvHueCos * trans->RefRCr;
225 	CAdjGCb = sat * (OvHueCos * trans->RefGCb - OvHueSin * trans->RefGCr);
226 	CAdjGCr = sat * (OvHueSin * trans->RefGCb + OvHueCos * trans->RefGCr);
227 	CAdjBCb = sat * OvHueCos * trans->RefBCb;
228 	CAdjBCr = sat * OvHueSin * trans->RefBCb;
229 
230 	// adjust black level
231 	CAdjOff = cont * trans[ref].RefLuma * bright * 1023.0;
232 	RedAdj = cont * trans[ref].RefLuma * red_intensity * 1023.0;
233 	GreenAdj = cont * trans[ref].RefLuma * green_intensity * 1023.0;
234 	BlueAdj = cont * trans[ref].RefLuma * blue_intensity * 1023.0;
235 
236 	OvRY = CAdjRY;
237 	OvGY = CAdjGY;
238 	OvBY = CAdjBY;
239 	OvRCb = CAdjRCb;
240 	OvRCr = CAdjRCr;
241 	OvGCb = CAdjGCb;
242 	OvGCr = CAdjGCr;
243 	OvBCb = CAdjBCb;
244 	OvBCr = CAdjBCr;
245 	// apply offsets
246 	OvROff = RedAdj + CAdjOff -	CAdjRY * Loff - (OvRCb + OvRCr) * Coff;
247 	OvGOff = GreenAdj + CAdjOff - CAdjGY * Loff - (OvGCb + OvGCr) * Coff;
248 	OvBOff = BlueAdj + CAdjOff - CAdjBY * Loff - (OvBCb + OvBCr) * Coff;
249 
250 	dwOvROff = ((int32)(OvROff * 2.0)) & 0x1fff;
251 	dwOvGOff = ((int32)(OvGOff * 2.0)) & 0x1fff;
252 	dwOvBOff = ((int32)(OvBOff * 2.0)) & 0x1fff;
253 
254 	dwOvRY = (((int32)(OvRY * 2048.0))&0x7fff)<<17;
255 	dwOvGY = (((int32)(OvGY * 2048.0))&0x7fff)<<17;
256 	dwOvBY = (((int32)(OvBY * 2048.0))&0x7fff)<<17;
257 	dwOvRCb = (((int32)(OvRCb * 2048.0))&0x7fff)<<1;
258 	dwOvRCr = (((int32)(OvRCr * 2048.0))&0x7fff)<<17;
259 	dwOvGCb = (((int32)(OvGCb * 2048.0))&0x7fff)<<1;
260 	dwOvGCr = (((int32)(OvGCr * 2048.0))&0x7fff)<<17;
261 	dwOvBCb = (((int32)(OvBCb * 2048.0))&0x7fff)<<1;
262 	dwOvBCr = (((int32)(OvBCr * 2048.0))&0x7fff)<<17;
263 
264 	OUTREG( regs, RADEON_OV0_LIN_TRANS_A, dwOvRCb | dwOvRY );
265 	OUTREG( regs, RADEON_OV0_LIN_TRANS_B, dwOvROff | dwOvRCr );
266 	OUTREG( regs, RADEON_OV0_LIN_TRANS_C, dwOvGCb | dwOvGY );
267 	OUTREG( regs, RADEON_OV0_LIN_TRANS_D, dwOvGOff | dwOvGCr );
268 	OUTREG( regs, RADEON_OV0_LIN_TRANS_E, dwOvBCb | dwOvBY );
269 	OUTREG( regs, RADEON_OV0_LIN_TRANS_F, dwOvBOff | dwOvBCr );
270 
271 	si->active_overlay.ob.space = si->pending_overlay.ob.space;
272 }
273 
274 
275 // convert Be colour key to rgb value
276 static uint32 colourKey2RGB32(
277 	uint32 space, uint8 red, uint8 green, uint8 blue )
278 {
279 	uint32 res;
280 
281 	SHOW_FLOW0( 3, "" );
282 
283 	// the way Be defines colour keys may be convinient to some driver developers,
284 	// but it's not well defined - took me some time to find out the format used
285 	// and still I have no idea how alpha is defined; Rudolf told me that alpha is
286 	// never used
287 	switch( space ) {
288 	case B_RGB15:
289 		res =
290 			((uint32)(red >> 0) << (16+3)) |
291 			((uint32)(green >> 0) << (8+3)) |
292 			((blue >> 0) << 3);
293 		break;
294 	case B_RGB16:
295 		res =
296 			((uint32)(red >> 0) << (16+3)) |
297 			((uint32)(green >> 0) << (8+2)) |
298 			((blue >> 0) << 3);
299 		break;
300 	case B_RGB32:
301 	case B_CMAP8:
302 		res = ((uint32)(red) << 16) | ((uint32)(green) << 8) | blue;
303 		break;
304 	default:
305 		res = 0;
306 	}
307 
308 	SHOW_FLOW( 3, "key=%lx", res );
309 	return res;
310 }
311 
312 
313 // set colour key of overlay
314 static void Radeon_SetColourKey(
315 	accelerator_info *ai, const overlay_window *ow )
316 {
317 	virtual_card *vc = ai->vc;
318 	vuint8 *regs = ai->regs;
319 	uint32 rgb32, mask32, min32, max32;
320 
321 	/*SHOW_FLOW( 0, "value=%02x %02x %02x, mask=%02x %02x %02x",
322 		ow->red.value, ow->green.value, ow->blue.value,
323 		ow->red.mask, ow->green.mask, ow->blue.mask );*/
324 
325 	// Radeons don't support value and mask as colour key but colour range
326 	rgb32 = colourKey2RGB32( vc->mode.space,
327 		ow->red.value, ow->green.value, ow->blue.value );
328 	mask32 = colourKey2RGB32( vc->mode.space,
329 		ow->red.mask, ow->green.mask, ow->blue.mask );
330 
331 	// ~mask32 are all unimportant (usually low order) bits
332 	// oring this to the colour should give us the highest valid colour value
333 	// (add would be more precise but may lead to overflows)
334 	min32 = rgb32;
335 	max32 = rgb32 | ~mask32;
336 
337 	OUTREG( regs, RADEON_OV0_GRAPHICS_KEY_CLR_LOW, min32 );
338 	OUTREG( regs, RADEON_OV0_GRAPHICS_KEY_CLR_HIGH, max32 );
339 	OUTREG( regs, RADEON_OV0_KEY_CNTL,
340 		RADEON_GRAPHIC_KEY_FN_EQ |
341 		RADEON_VIDEO_KEY_FN_FALSE |
342 		RADEON_CMP_MIX_OR );
343 }
344 
345 typedef struct {
346 	uint max_scale;					// maximum src_width/dest_width,
347 									// i.e. source increment per screen pixel
348 	uint8 group_size; 				// size of one filter group in pixels
349 	uint8 p1_step_by, p23_step_by;	// > 0: log(source pixel increment)+1, 2-tap filter
350 									// = 0: source pixel increment = 1, 4-tap filter
351 } hscale_factor;
352 
// number of elements of a true array (must not be applied to a pointer);
// the argument is parenthesised so that any array expression is safe
#define count_of( a ) (sizeof( a ) / sizeof( (a)[0] ))
354 
355 // scaling/filter tables depending on overlay colour space:
356 // magnifying pixels is no problem, but minifying can lead to overload,
357 // so we have to skip pixels and/or use 2-tap filters
358 static hscale_factor scale_RGB16[] = {
359 	{ (2 << 12), 		2, 1, 1 },
360 	{ (4 << 12), 		2, 2, 2 },
361 	{ (8 << 12), 		2, 3, 3 },
362 	{ (16 << 12), 		2, 4, 4 },
363 	{ (32 << 12), 		2, 5, 5 }
364 };
365 
366 static hscale_factor scale_RGB32[] = {
367 	{ (2 << 12) / 3,	2, 0, 0 },
368 	{ (4 << 12) / 3,	4, 1, 1 },
369 	{ (8 << 12) / 3,	4, 2, 2 },
370 	{ (4 << 12), 		4, 2, 3 },
371 	{ (16 << 12) / 3,	4, 3, 3 },
372 	{ (8 << 12), 		4, 3, 4 },
373 	{ (32 << 12) / 3,	4, 4, 4 },
374 	{ (16 << 12),		4, 5, 5 }
375 };
376 
377 static hscale_factor scale_YUV[] = {
378 	{ (16 << 12) / 16,	2, 0, 0 },
379 	{ (16 << 12) / 12,	2, 0, 1 },	// mode 4, 1, 0 (as used by YUV12) is impossible
380 	{ (16 << 12) / 8,	4, 1, 1 },
381 	{ (16 << 12) / 6,	4, 1, 2 },
382 	{ (16 << 12) / 4,	4, 2, 2 },
383 	{ (16 << 12) / 3,	4, 2, 3 },
384 	{ (16 << 12) / 2,	4, 3, 3 },
385 	{ (16 << 12) / 1,	4, 4, 4 }
386 };
387 
388 static hscale_factor scale_YUV12[] = {
389 	{ (16 << 12) / 16,			2, 0, 0 },
390 	{ (16 << 12) / 12,			4, 1, 0 },
391 	{ (16 << 12) / 12,			2, 0, 1 },
392 	{ (16 << 12) / 8,			4, 1, 1 },
393 	{ (16 << 12) / 6,			4, 1, 2 },
394 	{ (16 << 12) / 4,			4, 2, 2 },
395 	{ (16 << 12) / 3,			4, 2, 3 },
396 	{ (16 << 12) / 2,			4, 3, 3 },
397 	{ (int)((16 << 12) / 1.5),	4, 3, 4 },
398 	{ (int)((16 << 12) / 1.0),	4, 4, 4 },
399 	{ (int)((16 << 12) / 0.75),	4, 4, 5 },
400 	{ (int)((16 << 12) / 0.5),	4, 5, 5 }
401 };
402 
// smallest of three values (built on the two-argument min())
#define min3( a, b, c ) (min( min( (a), (b) ), (c) ))
404 
405 static hscale_factor scale_YUV9[] = {
406 	{ min3( (16 << 12) / 12,	(3 << 12) * 1,	(2 << 12) * 4 * 1 ),	2, 0, 0 },
407 	{ min3( (16 << 12) / 8, 	(3 << 12) * 1,	(2 << 12) * 4 * 1 ),	4, 1, 0 },
408 	{ min3( (16 << 12) / 10,	(3 << 12) * 1,	(2 << 12) * 4 * 1 ),	2, 0, 1 },
409 	{ min3( (16 << 12) / 6, 	(3 << 12) * 1,	(2 << 12) * 4 * 1 ),	4, 1, 1 },
410 	{ min3( (16 << 12) / 5, 	(3 << 12) * 1,	(2 << 12) * 4 * 2 ),	4, 1, 2 },
411 	{ min3( (16 << 12) / 3, 	(3 << 12) * 2,	(2 << 12) * 4 * 2 ),	4, 2, 2 },
412 	{ min3( (int)((16 << 12) / 2.5), 	(3 << 12) * 1,	(2 << 12) * 4 * 4 ),	4, 2, 3 },	// probably, it should be (3 << 12) * 2
413 	{ min3( (int)((16 << 12) / 1.5), 	(3 << 12) * 4,	(2 << 12) * 4 * 4 ),	4, 3, 3 },
414 	{ min3( (int)((16 << 12) / 0.75), 	(3 << 12) * 8,	(2 << 12) * 4 * 8 ),	4, 4, 4 },
415 	{ min3( (int)((16 << 12) / 0.625), 	(3 << 12) * 8,	(2 << 12) * 4 * 16 ),	4, 4, 5 },
416 	{ min3( (int)((16 << 12) / 0.375), 	(3 << 12) * 16,	(2 << 12) * 4 * 16 ),	4, 5, 5 }
417 };
418 
419 
420 // parameters of an overlay colour space
421 typedef struct {
422 	uint8 bpp_shift;				// log2( bytes per pixel (main plain) )
423 	uint8 bpuv_shift;				// log2( bytes per pixel (uv-plane) );
424 									// if there is one plane only: bpp=bpuv
425 	uint8 num_planes;				// number of planes
426 	uint8 h_uv_sub_sample_shift;	// log2( horizontal pixels per uv pair )
427 	uint8 v_uv_sub_sample_shift;	// log2( vertical pixels per uv pair )
428 	hscale_factor *factors;			// scaling/filter table
429 	uint8 num_factors;
430 } space_params;
431 
432 static space_params space_params_table[16] = {
433 	{ 0, 0, 0, 0, 0, NULL, 0 },	// reserved
434 	{ 0, 0, 0, 0, 0, NULL, 0 },	// reserved
435 	{ 0, 0, 0, 0, 0, NULL, 0 },	// reserved
436 	{ 1, 1, 1, 0, 0, scale_RGB16, count_of( scale_RGB16 ) },	// RGB15
437 	{ 1, 1, 1, 0, 0, scale_RGB16, count_of( scale_RGB16 ) },	// RGB16
438 	{ 0, 0, 0, 0, 0, NULL, 0 },	// reserved
439 	{ 2, 2, 1, 0, 0, scale_RGB32, count_of( scale_RGB32 ) },	// RGB32
440 	{ 0, 0, 0, 0, 0, NULL, 0 },	// reserved
441 	{ 0, 0, 0, 0, 0, NULL, 0 },	// reserved
442 	{ 0, 0, 3, 2, 2, scale_YUV9, count_of( scale_YUV9 ) },		// YUV9
443 	{ 0, 0, 3, 1, 1, scale_YUV12, count_of( scale_YUV12 ) },	// YUV12, three-plane
444 	{ 1, 1, 1, 1, 0, scale_YUV, count_of( scale_YUV ) },		// VYUY422
445 	{ 1, 1, 1, 1, 0, scale_YUV, count_of( scale_YUV ) },		// YVYU422
446 	{ 0, 1, 2, 1, 1, scale_YUV12, count_of( scale_YUV12 ) },	// YUV12, two-plane
447 	{ 0, 1, 2, 1, 1, NULL, 0 },	// ???
448 	{ 0, 0, 0, 0, 0, NULL, 0 }	// reserved
449 };
450 
451 // get appropriate scaling/filter parameters
452 static hscale_factor *getHScaleFactor(
453 	accelerator_info *ai,
454 	space_params *params,
455 	uint32 src_left, uint32 src_right, uint32 *h_inc )
456 {
457 	uint words_per_p1_line, words_per_p23_line, max_words_per_line;
458 	bool p1_4tap_allowed, p23_4tap_allowed;
459 	uint i;
460 	uint num_factors;
461 	hscale_factor *factors;
462 
463 	SHOW_FLOW0( 3, "" );
464 
465 	// check whether fifo is large enough to feed vertical 4-tap-filter
466 
467 	words_per_p1_line =
468 		ceilShiftDiv( (src_right - 1) << params->bpp_shift, 4 ) -
469 		((src_left << params->bpp_shift) >> 4) + 1;
470 	words_per_p23_line =
471 		ceilShiftDiv( (src_right - 1) << params->bpuv_shift, 4 ) -
472 		((src_left << params->bpuv_shift) >> 4) + 1;
473 
474 	// overlay scaler line length differs for different revisions
475 	// this needs to be maintained by hand
476 	if (ai->si->asic == rt_r200 || ai->si->asic >= rt_r300)
477 		max_words_per_line = 1920 / 16;
478 	else
479 		max_words_per_line = 1536 / 16;
480 
481 	switch (params->num_planes) {
482 		case 3:
483 			p1_4tap_allowed = words_per_p1_line < max_words_per_line / 2;
484 			p23_4tap_allowed = words_per_p23_line < max_words_per_line / 4;
485 			break;
486 		case 2:
487 			p1_4tap_allowed = words_per_p1_line < max_words_per_line / 2;
488 			p23_4tap_allowed = words_per_p23_line < max_words_per_line / 2;
489 			break;
490 		case 1:
491 		default:
492 			p1_4tap_allowed = p23_4tap_allowed = words_per_p1_line < max_words_per_line;
493 			break;
494 	}
495 
496 	SHOW_FLOW( 3, "p1_4tap_allowed=%d, p23_4t_allowed=%d",
497 		(int)p1_4tap_allowed, (int)p23_4tap_allowed );
498 
499 	// search for proper scaling/filter entry
500 	factors = params->factors;
501 	num_factors = params->num_factors;
502 
503 	if (factors == NULL || num_factors == 0)
504 		return NULL;
505 
506 	for (i = 0; i < num_factors; ++i, ++factors) {
507 		if (*h_inc <= factors->max_scale
508 			&& (factors->p1_step_by > 0 || p1_4tap_allowed)
509 			&& (factors->p23_step_by > 0 || p23_4tap_allowed))
510 			break;
511 	}
512 
513 	if (i == num_factors) {
514 		// overlay is asked to be scaled down more than allowed,
515 		// so use least scaling factor supported
516 		--factors;
517 		*h_inc = factors->max_scale;
518 	}
519 
520 	SHOW_FLOW( 3, "group_size=%d, p1_step_by=%d, p23_step_by=%d",
521 		factors->group_size, factors->p1_step_by, factors->p23_step_by );
522 
523 	return factors;
524 }
525 
526 
// convert a number to fixed point with "shift" fractional bits
// (the fraction beyond those bits is truncated)
#define I2FF( a, shift ) \
	((uint32)( (a) * (1 << (shift)) ))
528 
529 
530 // show overlay on screen
531 static status_t Radeon_ShowOverlay(
532 	accelerator_info *ai, int crtc_idx )
533 {
534 	virtual_card *vc = ai->vc;
535 	shared_info *si = ai->si;
536 	vuint8 *regs = ai->regs;
537 	overlay_info *overlay = &si->pending_overlay;
538 	overlay_buffer_node *node = overlay->on;
539 	crtc_info *crtc = &si->crtc[crtc_idx];
540 
541 	uint32 ecp_div;
542 	uint32 v_inc, h_inc;
543 	uint32 src_v_inc, src_h_inc;
544 	uint32 src_left, src_top, src_right, src_bottom;
545 	int32 dest_left, dest_top, dest_right, dest_bottom;
546 	uint32 offset;
547 	uint32 tmp;
548 	uint32 p1_h_accum_init, p23_h_accum_init, p1_v_accum_init, p23_v_accum_init;
549 	uint32 p1_active_lines, p23_active_lines;
550 	hscale_factor *factors;
551 	space_params *params;
552 
553 	uint32 p1_h_inc, p23_h_inc;
554 	uint32 p1_x_start, p1_x_end;
555 	uint32 p23_x_start, p23_x_end;
556 
557 	uint scale_ctrl;
558 
559 	/*uint32 buffer[20*2];
560 	uint idx = 0;*/
561 
562 	SHOW_FLOW0( 0, "" );
563 
564 	Radeon_SetColourKey( ai, &overlay->ow );
565 
566 	// overlay unit can only handle up to 175 MHz; if pixel clock is higher,
567 	// only every second pixel is handled
568 	// (this devider is gets written into PLL by InitOverlay,
569 	//  so we don't need to do it ourself)
570 	if( crtc->mode.timing.pixel_clock < 175000 )
571 		ecp_div = 0;
572 	else
573 		ecp_div = 1;
574 
575 
576 	// scaling is independant of clipping, get this first
577 	{
578 		uint32 src_width, src_height;
579 
580 		src_width = overlay->ov.width;
581 		src_height = overlay->ov.height;
582 
583 		// this is for graphics card
584 		v_inc = (src_height << 20) / overlay->ow.height;
585 		h_inc = (src_width << (12 + ecp_div)) / overlay->ow.width;
586 
587 
588 		// this is for us
589 		src_v_inc = (src_height << 16) / overlay->ow.height;
590 		src_h_inc = (src_width << 16) / overlay->ow.width;
591 	}
592 
593 	// calculate unclipped position/size
594 	// TBD: I assume that overlay_window.offset_xyz is only a hint where
595 	//      no overlay is visible; another interpretation were to zoom
596 	//      the overlay so it fits into remaining space
597 	src_left = (overlay->ov.h_start << 16) + overlay->ow.offset_left * src_h_inc;
598 	src_top = (overlay->ov.v_start << 16) + overlay->ow.offset_top * src_v_inc;
599 	src_right = ((overlay->ov.h_start + overlay->ov.width) << 16) -
600 		overlay->ow.offset_right * src_h_inc;
601 	src_bottom = ((overlay->ov.v_start + overlay->ov.height) << 16) -
602 		overlay->ow.offset_top * src_v_inc;
603 	dest_left = overlay->ow.h_start + overlay->ow.offset_left;
604 	dest_top = overlay->ow.v_start + overlay->ow.offset_top;
605 	dest_right = overlay->ow.h_start + overlay->ow.width - overlay->ow.offset_right;
606 	dest_bottom = overlay->ow.v_start + overlay->ow.height - overlay->ow.offset_bottom;
607 
608 	SHOW_FLOW( 3, "ow: h=%d, v=%d, width=%d, height=%d",
609 		overlay->ow.h_start, overlay->ow.v_start,
610 		overlay->ow.width, overlay->ow.height );
611 
612 	SHOW_FLOW( 3, "offset_left=%d, offset_right=%d, offset_top=%d, offset_bottom=%d",
613 		overlay->ow.offset_left, overlay->ow.offset_right,
614 		overlay->ow.offset_top, overlay->ow.offset_bottom );
615 
616 
617 	// apply virtual screen
618 	dest_left -= vc->mode.h_display_start + crtc->rel_x;
619 	dest_top -= vc->mode.v_display_start + crtc->rel_y;
620 	dest_right -= vc->mode.h_display_start + crtc->rel_x;
621 	dest_bottom -= vc->mode.v_display_start + crtc->rel_y;
622 
623 
624 	// clip to visible area
625 	if( dest_left < 0 ) {
626 		src_left += -dest_left * src_h_inc;
627 		dest_left = 0;
628 	}
629 	if( dest_top < 0 ) {
630 		src_top += -dest_top * src_v_inc;
631 		dest_top = 0;
632 	}
633 
634 	SHOW_FLOW( 3, "mode: w=%d, h=%d",
635 		crtc->mode.timing.h_display, crtc->mode.timing.v_display );
636 
637 	if( dest_right > crtc->mode.timing.h_display )
638 		dest_right = crtc->mode.timing.h_display;
639 	if( dest_bottom > crtc->mode.timing.v_display )
640 		dest_bottom = crtc->mode.timing.v_display;
641 
642 	SHOW_FLOW( 3, "src=(%d, %d, %d, %d)",
643 		src_left, src_top, src_right, src_bottom );
644 	SHOW_FLOW( 3, "dest=(%d, %d, %d, %d)",
645 		dest_left, dest_top, dest_right, dest_bottom );
646 
647 
648 	// especially with multi-screen modes the overlay may not be on screen at all
649 	if( dest_left >= dest_right || dest_top >= dest_bottom ||
650 		src_left >= src_right || src_top >= src_bottom )
651 	{
652 		Radeon_TempHideOverlay( ai );
653 		goto done;
654 	}
655 
656 
657 	// let's calculate all those nice register values
658 	SHOW_FLOW( 3, "ati_space=%d", node->ati_space );
659 	params = &space_params_table[node->ati_space];
660 
661 	// choose proper scaler
662 	{
663 		factors = getHScaleFactor( ai, params, src_left >> 16, src_right >> 16, &h_inc );
664 		if( factors == NULL )
665 			return B_ERROR;
666 
667 		p1_h_inc = factors->p1_step_by > 0 ?
668 			h_inc >> (factors->p1_step_by - 1) : h_inc;
669 		p23_h_inc =
670 			(factors->p23_step_by > 0 ? h_inc >> (factors->p23_step_by - 1) : h_inc)
671 			>> params->h_uv_sub_sample_shift;
672 
673 		SHOW_FLOW( 3, "p1_h_inc=%x, p23_h_inc=%x", p1_h_inc, p23_h_inc );
674 	}
675 
676 	// get register value for start/end position of overlay image (pixel-precise only)
677 	{
678 		uint32 p1_step_size, p23_step_size;
679 		uint32 p1_left, p1_right, p1_width;
680 		uint32 p23_left, p23_right, p23_width;
681 
682 		p1_left = src_left >> 16;
683 		p1_right = src_right >> 16;
684 		p1_width = p1_right - p1_left;
685 
686 		p1_step_size = factors->p1_step_by > 0 ? (1 << (factors->p1_step_by - 1)) : 1;
687 		p1_x_start = p1_left % (16 >> params->bpp_shift);
688 		p1_x_end = ((p1_x_start + p1_width - 1) / p1_step_size) * p1_step_size;
689 
690 		SHOW_FLOW( 3, "p1_x_start=%d, p1_x_end=%d", p1_x_start, p1_x_end );
691 
692 		p23_left = (src_left >> 16) >> params->h_uv_sub_sample_shift;
693 		p23_right = (src_right >> 16) >> params->h_uv_sub_sample_shift;
694 		p23_width = p23_right - p23_left;
695 
696 		p23_step_size = factors->p23_step_by > 0 ? (1 << (factors->p23_step_by - 1)) : 1;
697 		// if resolution of Y and U/V differs but YUV are stored in one
698 		// plane then UV alignment depends on Y data, therefore the hack
699 		// (you are welcome to replace this with some cleaner code ;)
700 		p23_x_start = p23_left %
701 			((16 >> params->bpuv_shift) /
702 			 (node->ati_space == 11 || node->ati_space == 12 ? 2 : 1));
703 		p23_x_end = (int)((p23_x_start + p23_width - 1) / p23_step_size) * p23_step_size;
704 
705 		SHOW_FLOW( 3, "p23_x_start=%d, p23_x_end=%d", p23_x_start, p23_x_end );
706 
707 		// get memory location of first word to be read by scaler
708 		// (save relative offset for fast update)
709 		si->active_overlay.rel_offset = (src_top >> 16) * node->buffer.bytes_per_row +
710 			((p1_left << params->bpp_shift) & ~0xf);
711 		offset = node->mem_offset + si->active_overlay.rel_offset;
712 
713 		SHOW_FLOW( 3, "rel_offset=%x", si->active_overlay.rel_offset );
714 	}
715 
716 	// get active lines for scaler
717 	// (we could add additional blank lines for DVD letter box mode,
718 	//  but this is not supported by API; additionally, this only makes
719 	//  sense if want to put subtitles onto the black border, which is
720 	//  supported neither)
721 	{
722 		uint16 int_top, int_bottom;
723 
724 		int_top = src_top >> 16;
725 		int_bottom = (src_bottom >> 16);
726 
727 		p1_active_lines = int_bottom - int_top - 1;
728 		p23_active_lines =
729 			ceilShiftDiv( int_bottom - 1, params->v_uv_sub_sample_shift ) -
730 			(int_top >> params->v_uv_sub_sample_shift);
731 
732 		SHOW_FLOW( 3, "p1_active_lines=%d, p23_active_lines=%d",
733 			p1_active_lines, p23_active_lines );
734 	}
735 
736 	// if picture is stretched for flat panel, we need to scale all
737 	// vertical values accordingly
738 	// TBD: there is no description at all concerning this, so v_accum_init may
739 	//      need to be initialized based on original value
740 	{
741 		if( (crtc->active_displays & (dd_lvds | dd_dvi)) != 0 ) {
742 			uint64 v_ratio;
743 
744 			// convert 32.32 format to 16.16 format; else we
745 			// cannot multiply two fixed point values without
746 			// overflow
747 			v_ratio = si->flatpanels[crtc->flatpanel_port].v_ratio >> (FIX_SHIFT - 16);
748 
749 			v_inc = (v_inc * v_ratio) >> 16;
750 		}
751 
752 		SHOW_FLOW( 3, "v_inc=%x", v_inc );
753 	}
754 
755 	// get initial horizontal scaler values, taking care of precharge
756 	// don't ask questions about formulas - take them as is
757 	// (TBD: home-brewed sub-pixel source clipping may be wrong,
758 	//       especially for uv-planes)
759 	{
760 		uint32 p23_group_size;
761 
762 	    tmp = ((src_left & 0xffff) >> 11) + (
763 	    	(
764 		    	I2FF( p1_x_start % factors->group_size, 12 ) +
765 		    	I2FF( 2.5, 12 ) +
766 		    	p1_h_inc / 2 +
767 		    	I2FF( 0.5, 12-5 )	// rounding
768 	        ) >> (12 - 5));	// scaled by 1 << 5
769 
770 	    SHOW_FLOW( 3, "p1_h_accum_init=%x", tmp );
771 
772 		p1_h_accum_init =
773 			((tmp << 15) & RADEON_OV0_P1_H_ACCUM_INIT_MASK) |
774 			((tmp << 23) & RADEON_OV0_P1_PRESHIFT_MASK);
775 
776 
777 		p23_group_size = 2;
778 
779 		tmp = ((src_left & 0xffff) >> 11) + (
780 			(
781 				I2FF( p23_x_start % p23_group_size, 12 ) +
782 				I2FF( 2.5, 12 ) +
783 				p23_h_inc / 2 +
784 				I2FF( 0.5, 12-5 )	// rounding
785 			) >> (12 - 5)); // scaled by 1 << 5
786 
787 		SHOW_FLOW( 3, "p23_h_accum_init=%x", tmp );
788 
789 		p23_h_accum_init =
790 			((tmp << 15) & RADEON_OV0_P23_H_ACCUM_INIT_MASK) |
791 			((tmp << 23) & RADEON_OV0_P23_PRESHIFT_MASK);
792 	}
793 
794 	// get initial vertical scaler values, taking care of precharge
795 	{
796 		uint extra_full_line;
797 
798 		extra_full_line = factors->p1_step_by == 0 ? 1 : 0;
799 
800 	    tmp = ((src_top & 0x0000ffff) >> 11) + (
801 	    	(min(
802 		    	I2FF( 1.5, 20 ) + I2FF( extra_full_line, 20 ) + v_inc / 2,
803 	    		I2FF( 2.5, 20 ) + 2 * I2FF( extra_full_line, 20 )
804 	    	 ) + I2FF( 0.5, 20-5 )) // rounding
805 	    	>> (20 - 5)); // scaled by 1 << 5
806 
807 	    SHOW_FLOW( 3, "p1_v_accum_init=%x", tmp );
808 
809 		p1_v_accum_init =
810 			((tmp << 15) & RADEON_OV0_P1_V_ACCUM_INIT_MASK) | 0x00000001;
811 
812 
813 		extra_full_line = factors->p23_step_by == 0 ? 1 : 0;
814 
815 		if( params->v_uv_sub_sample_shift > 0 ) {
816 			tmp = ((src_top & 0x0000ffff) >> 11) + (
817 				(min(
818 					I2FF( 1.5, 20 ) +
819 						I2FF( extra_full_line, 20 ) +
820 						((v_inc / 2) >> params->v_uv_sub_sample_shift),
821 					I2FF( 2.5, 20 ) +
822 						2 * I2FF( extra_full_line, 20 )
823 				 ) + I2FF( 0.5, 20-5 )) // rounding
824 				>> (20 - 5)); // scaled by 1 << 5
825 		} else {
826 			tmp = ((src_top & 0x0000ffff) >> 11) + (
827 				(
828 					I2FF( 2.5, 20 ) +
829 					2 * I2FF( extra_full_line, 20 ) +
830 					I2FF( 0.5, 20-5 )	// rounding
831 				) >> (20 - 5)); // scaled by 1 << 5
832 		}
833 
834 		SHOW_FLOW( 3, "p23_v_accum_init=%x", tmp );
835 
836 		p23_v_accum_init =
837 			((tmp << 15) & RADEON_OV0_P23_V_ACCUM_INIT_MASK) | 0x00000001;
838 	}
839 
840 	// show me what you've got!
841 	// we could lock double buffering of overlay unit during update
842 	// (new values are copied during vertical blank, so if we've updated
843 	// only some of them, you get a whole frame of mismatched values)
844 	// but during tests I couldn't get the artifacts go away, so
845 	// we use the dangerous way which has the pro to not require any
846 	// waiting
847 
848 	// let's try to lock overlay unit
849 	// we had to wait now until the lock takes effect, but this is
850 	// impossible with CCE; perhaps we have to convert this code to
851 	// direct register access; did that - let's see what happens...
852 	OUTREG( regs, RADEON_OV0_REG_LOAD_CNTL, RADEON_REG_LD_CTL_LOCK );
853 
854 	// wait until register access is locked
855 	while( (INREG( regs, RADEON_OV0_REG_LOAD_CNTL)
856 		& RADEON_REG_LD_CTL_LOCK_READBACK) == 0 )
857 		;
858 
859 	OUTREG( regs, RADEON_OV0_VID_BUF0_BASE_ADRS, offset );
860 	OUTREG( regs, RADEON_OV0_VID_BUF_PITCH0_VALUE, node->buffer.bytes_per_row );
861 	OUTREG( regs, RADEON_OV0_H_INC, p1_h_inc | (p23_h_inc << 16) );
862 	OUTREG( regs, RADEON_OV0_STEP_BY, factors->p1_step_by | (factors->p23_step_by << 8) );
863 	OUTREG( regs, RADEON_OV0_V_INC, v_inc );
864 
865 	OUTREG( regs,
866 		crtc->crtc_idx == 0 ? RADEON_OV0_Y_X_START : RADEON_OV1_Y_X_START,
867 		(dest_left) | (dest_top << 16) );
868 	OUTREG( regs,
869 		crtc->crtc_idx == 0 ? RADEON_OV0_Y_X_END : RADEON_OV1_Y_X_END,
870 		(dest_right - 1) | ((dest_bottom - 1) << 16) );
871 
872 	OUTREG( regs, RADEON_OV0_P1_BLANK_LINES_AT_TOP,
873 		RADEON_P1_BLNK_LN_AT_TOP_M1_MASK | (p1_active_lines << 16) );
874 	OUTREG( regs, RADEON_OV0_P1_X_START_END, p1_x_end | (p1_x_start << 16) );
875 	OUTREG( regs, RADEON_OV0_P1_H_ACCUM_INIT, p1_h_accum_init );
876 	OUTREG( regs, RADEON_OV0_P1_V_ACCUM_INIT, p1_v_accum_init );
877 
878 	OUTREG( regs, RADEON_OV0_P23_BLANK_LINES_AT_TOP,
879 		RADEON_P23_BLNK_LN_AT_TOP_M1_MASK | (p23_active_lines << 16) );
880 	OUTREG( regs, RADEON_OV0_P2_X_START_END,
881 		p23_x_end | (p23_x_start << 16) );
882 	OUTREG( regs, RADEON_OV0_P3_X_START_END,
883 		p23_x_end | (p23_x_start << 16) );
884 	OUTREG( regs, RADEON_OV0_P23_H_ACCUM_INIT, p23_h_accum_init );
885 	OUTREG( regs, RADEON_OV0_P23_V_ACCUM_INIT, p23_v_accum_init );
886 
887 	OUTREG( regs, RADEON_OV0_TEST, node->test_reg );
888 
889 	scale_ctrl = RADEON_SCALER_ENABLE |
890 		RADEON_SCALER_DOUBLE_BUFFER |
891 		(node->ati_space << 8) |
892 		RADEON_SCALER_ADAPTIVE_DEINT |
893 		RADEON_SCALER_BURST_PER_PLANE |
894 		(crtc->crtc_idx == 0 ? 0 : RADEON_SCALER_CRTC_SEL );
895 
896 	switch (node->ati_space << 8) {
897 		case RADEON_SCALER_SOURCE_15BPP: // RGB15
898 		case RADEON_SCALER_SOURCE_16BPP:
899 		case RADEON_SCALER_SOURCE_32BPP:
900 			OUTREG( regs, RADEON_OV0_SCALE_CNTL, scale_ctrl |
901 							RADEON_SCALER_LIN_TRANS_BYPASS);
902 			break;
903 		case RADEON_SCALER_SOURCE_VYUY422: // VYUY422
904 		case RADEON_SCALER_SOURCE_YVYU422: // YVYU422
905 			OUTREG( regs, RADEON_OV0_SCALE_CNTL, scale_ctrl);
906 			break;
907 		default:
908 			SHOW_FLOW(4, "What overlay format is this??? %d", node->ati_space);
909 			OUTREG( regs, RADEON_OV0_SCALE_CNTL, scale_ctrl |
910 			 (( ai->si->asic >= rt_r200) ? R200_SCALER_TEMPORAL_DEINT : 0));
911 
912 	}
913 
914 	si->overlay_mgr.auto_flip_reg ^= RADEON_OV0_SOFT_EOF_TOGGLE;
915 
916 	OUTREG( regs, RADEON_OV0_AUTO_FLIP_CNTRL,
917 		si->overlay_mgr.auto_flip_reg );
918 
919 	OUTREG( regs, RADEON_OV0_REG_LOAD_CNTL, 0 );
920 
921 done:
922 	ai->si->active_overlay.on = ai->si->pending_overlay.on;
923 	ai->si->active_overlay.ow = ai->si->pending_overlay.ow;
924 	ai->si->active_overlay.ov = ai->si->pending_overlay.ov;
925 	ai->si->active_overlay.ob = ai->si->pending_overlay.ob;
926 	ai->si->active_overlay.h_display_start = vc->mode.h_display_start;
927 	ai->si->active_overlay.v_display_start = vc->mode.v_display_start;
928 
929 	return B_OK;
930 }
931 
932 
933 // hide overlay, but not permanently
934 void Radeon_TempHideOverlay(
935 	accelerator_info *ai )
936 {
937 	SHOW_FLOW0( 3, "" );
938 
939 	OUTREG( ai->regs, RADEON_OV0_SCALE_CNTL, 0 );
940 }
941 
942 
943 // hide overlay (can be called even if there is none visible)
944 void Radeon_HideOverlay(
945 	accelerator_info *ai )
946 {
947 	shared_info *si = ai->si;
948 
949 	Radeon_TempHideOverlay( ai );
950 
951 	// remember that there is no overlay to be shown
952 	si->active_overlay.on = NULL;
953 	si->active_overlay.prev_on = NULL;
954 	si->pending_overlay.on = NULL;
955 
956 	// invalidate active head so it will be setup again once
957 	// a new overlay is shown
958 	si->active_overlay.crtc_idx = -1;
959 }
960 
961 
// show new overlay buffer with same parameters as last one
// Only the base address of video buffer 0 is rewritten and the soft
// end-of-frame toggle in the auto-flip register is flipped; all other
// overlay registers keep the values programmed before. Afterwards the
// pending overlay buffer is recorded as the active one.
//	ai - accelerant instance owning the overlay
static void Radeon_ReplaceOverlayBuffer(
	accelerator_info *ai )
{
// NOTE(review): the "#if 0" branch is disabled code (direct register
// access with an optional second buffer); only the "#else" branch is compiled.
#if 0
	shared_info *si = ai->si;
	vuint8 *regs = ai->regs;
	uint32 offset;
	int /*old_buf, */new_buf;

	offset = si->pending_overlay.on->mem_offset + si->active_overlay.rel_offset;

	/*old_buf = si->overlay_mgr.auto_flip_reg & RADEON_OV0_SOFT_BUF_NUM_MASK;
	new_buf = old_buf == 0 ? 3 : 0;
	si->overlay_mgr.auto_flip_reg &= ~RADEON_OV0_SOFT_BUF_NUM_MASK;
	si->overlay_mgr.auto_flip_reg |= new_buf;*/
	new_buf = 0;

	// lock overlay registers
/*	OUTREG( regs, RADEON_OV0_REG_LOAD_CNTL, RADEON_REG_LD_CTL_LOCK );

	// wait until register access is locked
	while( (INREG( regs, RADEON_OV0_REG_LOAD_CNTL)
		& RADEON_REG_LD_CTL_LOCK_READBACK) == 0 )
		;*/

	// setup new buffer
	/*OUTREG( regs,
		new_buf == 0 ? RADEON_OV0_VID_BUF_PITCH0_VALUE : RADEON_OV0_VID_BUF_PITCH1_VALUE,
		si->pending_overlay.on->buffer.bytes_per_row );*/
	OUTREG( regs,
		new_buf == 0 ? RADEON_OV0_VID_BUF0_BASE_ADRS : RADEON_OV0_VID_BUF3_BASE_ADRS,
		offset | (new_buf == 0 ? 0 : RADEON_VIF_BUF0_PITCH_SEL));

	// make changes visible
	si->overlay_mgr.auto_flip_reg ^= RADEON_OV0_SOFT_EOF_TOGGLE;

	OUTREG( regs, RADEON_OV0_AUTO_FLIP_CNTRL, si->overlay_mgr.auto_flip_reg );

	// unlock overlay registers
//	OUTREG( regs, RADEON_OV0_REG_LOAD_CNTL, 0 );

	ai->si->active_overlay.on = ai->si->pending_overlay.on;
#else
	shared_info *si = ai->si;
	uint32 offset;

	// two ways to reach the registers, selected by the acc_dma flag
	if ( ai->si->acc_dma )
	{
		// NOTE(review): START_IB/WRITE_IB_REG/SUBMIT_IB are defined
		// outside this chunk (presumably in CP.h, presumably queuing the
		// register writes in an indirect buffer) - keep the
		// START/SUBMIT pair and the statement order intact
		START_IB();

		// start of the pending buffer plus the offset stored for the
		// currently active overlay
		offset = si->pending_overlay.on->mem_offset + si->active_overlay.rel_offset;

		WRITE_IB_REG( RADEON_OV0_VID_BUF0_BASE_ADRS, offset);

		// flip the soft end-of-frame bit in the cached register value
		// and hand the new value to the hardware
		si->overlay_mgr.auto_flip_reg ^= RADEON_OV0_SOFT_EOF_TOGGLE;
		WRITE_IB_REG( RADEON_OV0_AUTO_FLIP_CNTRL, si->overlay_mgr.auto_flip_reg );

		SUBMIT_IB();
	} else {
		// direct register access
		// NOTE(review): presumably waits for two free FIFO entries, one
		// per register write below - confirm against Radeon_WaitForFifo
		Radeon_WaitForFifo( ai, 2 );
		offset = si->pending_overlay.on->mem_offset + si->active_overlay.rel_offset;

		OUTREG( ai->regs, RADEON_OV0_VID_BUF0_BASE_ADRS, offset);

		si->overlay_mgr.auto_flip_reg ^= RADEON_OV0_SOFT_EOF_TOGGLE;
		OUTREG( ai->regs, RADEON_OV0_AUTO_FLIP_CNTRL, si->overlay_mgr.auto_flip_reg );
	}
	// the pending buffer is the one being shown from now on
	ai->si->active_overlay.on = ai->si->pending_overlay.on;
#endif
}
1033 
1034 
1035 // get number of pixels of overlay shown on virtual port
1036 static int getIntersectArea(
1037 	accelerator_info *ai, overlay_window *ow, crtc_info *crtc )
1038 {
1039 	virtual_card *vc = ai->vc;
1040 	int left, top, right, bottom;
1041 
1042 	left = ow->h_start - (vc->mode.h_display_start + crtc->rel_x);
1043 	top = ow->v_start - (vc->mode.v_display_start + crtc->rel_y);
1044 	right = left + ow->width;
1045 	bottom = top + ow->height;
1046 
1047 	if( left < 0 )
1048 		left = 0;
1049 	if( top < 0 )
1050 		top = 0;
1051 	if( right > crtc->mode.timing.h_display )
1052 		right = crtc->mode.timing.h_display;
1053 	if( bottom > crtc->mode.timing.v_display )
1054 		bottom = crtc->mode.timing.v_display;
1055 
1056 	if( right < left || bottom < top )
1057 		return 0;
1058 
1059 	return (right - left) * (bottom - top);
1060 }
1061 
1062 
1063 // update overlay, to be called whenever something in terms of
1064 // overlay have or can have been changed
1065 status_t Radeon_UpdateOverlay(
1066 	accelerator_info *ai )
1067 {
1068 	virtual_card *vc = ai->vc;
1069 	shared_info *si = ai->si;
1070 	int crtc_idx;
1071 
1072 	float brightness = 0.0f;
1073 	float contrast = 1.0f;
1074 	float saturation = 1.0f;
1075 	float hue = 0.0f;
1076     int32 ref = 0;
1077 
1078     SHOW_FLOW0( 3, "" );
1079 
1080 	// don't mess around with overlay of someone else
1081     if( !vc->uses_overlay )
1082     	return B_OK;
1083 
1084 	// make sure there really is an overlay
1085 	if( si->pending_overlay.on == NULL )
1086 		return B_OK;
1087 
1088 	// verify that the overlay is still valid
1089 	if( (uint32)si->pending_overlay.ot != si->overlay_mgr.token )
1090 		return B_BAD_VALUE;
1091 
1092 	if( vc->different_heads > 1 ) {
1093 		int area0, area1;
1094 
1095 		// determine on which port most of the overlay is shown
1096 		area0 = getIntersectArea( ai, &si->pending_overlay.ow, &si->crtc[0] );
1097 		area1 = getIntersectArea( ai, &si->pending_overlay.ow, &si->crtc[0] );
1098 
1099 		SHOW_FLOW( 3, "area0=%d, area1=%d", area0, area1 );
1100 
1101 		if( area0 >= area1 )
1102 			crtc_idx = 0;
1103 		else
1104 			crtc_idx = 1;
1105 
1106 	} else if( vc->independant_heads > 1 ) {
1107 		// both ports show the same, use "swap displays" to decide
1108 		// where to show the overlay (to be improved as this flag isn't
1109 		// really designed for that)
1110 		if( vc->swap_displays )
1111 			crtc_idx = 1;
1112 		else
1113 			crtc_idx = 0;
1114 
1115 	} else {
1116 
1117 		// one crtc used only - pick the one that we use
1118 		crtc_idx = vc->used_crtc[0] ? 0 : 1;
1119 	}
1120 
1121 	si->pending_overlay.crtc_idx = crtc_idx;
1122 
1123 	// only update registers that have been changed to minimize work
1124 	if( si->active_overlay.crtc_idx != si->pending_overlay.crtc_idx ) {
1125 		Radeon_InitOverlay( ai, crtc_idx );
1126 	}
1127 
1128 	if( si->active_overlay.ob.space != si->pending_overlay.ob.space ) {
1129 		Radeon_SetTransform( ai, brightness, contrast, saturation, hue, 0, 0, 0, ref );
1130 	}
1131 
1132 	if( memcmp( &si->active_overlay.ow, &si->pending_overlay.ow, sizeof( si->active_overlay.ow )) != 0 ||
1133 		memcmp( &si->active_overlay.ov, &si->pending_overlay.ov, sizeof( si->active_overlay.ov )) != 0 ||
1134 		si->active_overlay.h_display_start != vc->mode.h_display_start ||
1135 		si->active_overlay.v_display_start != vc->mode.v_display_start ||
1136 		si->active_overlay.ob.width != si->pending_overlay.ob.width ||
1137 		si->active_overlay.ob.height != si->pending_overlay.ob.height ||
1138 		si->active_overlay.ob.bytes_per_row != si->pending_overlay.ob.bytes_per_row )
1139 		Radeon_ShowOverlay( ai, crtc_idx );
1140 
1141 	else if( si->active_overlay.on != si->pending_overlay.on )
1142 		Radeon_ReplaceOverlayBuffer( ai );
1143 
1144 	SHOW_FLOW0( 3, "success" );
1145 
1146 	return B_OK;
1147 }
1148