xref: /haiku/src/add-ons/accelerants/matrox/engine/mga_acc.c (revision 56430ad8002b8fd1ac69b590e9cc130de6d9e852)
1 /* MGA Acceleration functions */
2 /* Authors:
3    Mark Watson 2/2000,
4    Rudolf Cornelissen 10/2002-1/2006.
5 */
6 
7 #define MODULE_BIT 0x00080000
8 
9 #include "mga_std.h"
10 
11 /*acceleration notes*/
12 
13 /*functions Be's app_server uses:
14 fill span (horizontal only)
15 fill rectangle (these 2 are very similar)
16 invert rectangle
17 blit
18 */
19 
/* Write the Y destination and line count for a drawing operation.
 * Needed by MIL 1/2 because of address linearisation constraints: when
 * si->engine.y_lin is set the linear Y address is calculated here in software
 * and YDST and LEN are written separately, otherwise the combined YDSTLEN
 * register is written in one go. */
#define ACCW_YDSTLEN(dst, len) do { \
	if (!si->engine.y_lin) { \
		ACCW(YDSTLEN,((dst)<<16)|(len)); \
	} else { \
		ACCW(YDST,((dst)* (si->fbc.bytes_per_row / (si->engine.depth >> 3))) >> 5); \
		ACCW(LEN,len); \
	} \
} while (0)
27 
28 status_t gx00_acc_wait_idle()
29 {
30 	/* wait until engine completely idle */
31 	while (ACCR(STATUS) & 0x00010000)
32 	{
33 		/* snooze a bit so I do not hammer the bus */
34 		snooze (100);
35 	}
36 
37 	return B_OK;
38 }
39 
40 /* AFAIK this must be done for every new screenmode.
41  * Engine required init. */
42 status_t gx00_acc_init()
43 {
44 	/* used for convenience: MACCESS is a write only register! */
45 	uint32 maccess = 0x00000000;
46 	/* if we were unable to read PINS, we have to assume something (keeping bit6 zero) */
47 	if ((si->ps.card_type >= G450) && (si->ps.pins_status == B_OK))
48 	{
49 		/* b7 v5_mem_type = done by Mark Watson. fixme: still confirm! (unknown bits) */
50 		maccess |= ((((uint32)si->ps.v5_mem_type) & 0x80) >> 1);
51 	}
52 
53 	/* preset using hardware adress linearisation */
54 	si->engine.y_lin = 0x00;
55 	/* reset depth */
56 	si->engine.depth = 0;
57 
58 	/* cleanup bitblt */
59 	ACCW(OPMODE,0);
60 
61 	/* Set the Z origin to the start of FB (otherwise lockup on blits) */
62 	ACCW(ZORG,0);
63 
64 	/* Set pixel width */
65 	switch(si->dm.space)
66 	{
67 	case B_CMAP8:
68 		ACCW(MACCESS, ((maccess & 0xfffffffc) | 0x00));
69 		si->engine.depth = 8;
70 		break;
71 	case B_RGB15_LITTLE:case B_RGB16_LITTLE:
72 		ACCW(MACCESS, ((maccess & 0xfffffffc) | 0x01));
73 		si->engine.depth = 16;
74 		break;
75 	case B_RGB32_LITTLE:case B_RGBA32_LITTLE:
76 		ACCW(MACCESS, ((maccess & 0xfffffffc) | 0x02));
77 		si->engine.depth = 32;
78 		break;
79 	default:
80 		LOG(8,("ACC: init, invalid bit depth\n"));
81 		return B_ERROR;
82 	}
83 
84 	/* setup PITCH: very cardtype specific! */
85 	switch (si->ps.card_type)
86 	{
87 	case MIL1:
88 		switch (si->fbc.bytes_per_row / (si->engine.depth >> 3))
89 		{
90 			case 640:
91 			case 768:
92 			case 800:
93 			case 960:
94 			case 1024:
95 			case 1152:
96 			case 1280:
97 			case 1600:
98 			case 1920:
99 			case 2048:
100 				/* we are using hardware adress linearisation */
101 				break;
102 			default:
103 				/* we are using software adress linearisation */
104 				si->engine.y_lin = 0x01;
105 				LOG(8,("ACC: using software adress linearisation\n"));
106 				break;
107 		}
108 		ACCW(PITCH, (si->engine.y_lin << 15) |
109 					((si->fbc.bytes_per_row / (si->engine.depth >> 3)) & 0x0FFF));
110 		break;
111 	case MIL2:
112 		switch (si->fbc.bytes_per_row / (si->engine.depth >> 3))
113 		{
114 			case 512:
115 			case 640:
116 			case 768:
117 			case 800:
118 			case 832:
119 			case 960:
120 			case 1024:
121 			case 1152:
122 			case 1280:
123 			case 1600:
124 			case 1664:
125 			case 1920:
126 			case 2048:
127 				/* we are using hardware adress linearisation */
128 				break;
129 			default:
130 				/* we are using software adress linearisation */
131 				si->engine.y_lin = 0x01;
132 				LOG(8,("ACC: using software adress linearisation\n"));
133 				break;
134 		}
135 		ACCW(PITCH, (si->engine.y_lin << 15) |
136 					((si->fbc.bytes_per_row / (si->engine.depth >> 3)) & 0x0FFF));
137 		break;
138 	case G100:
139 		/* always using hardware adress linearisation, because 2D/3D
140 		 * engine works on every pitch multiple of 32 */
141 		ACCW(PITCH, ((si->fbc.bytes_per_row / (si->engine.depth >> 3)) & 0x0FFF));
142 		break;
143 	default:
144 		/* G200 and up are equal.. */
145 		/* always using hardware adress linearisation, because 2D/3D
146 		 * engine works on every pitch multiple of 32 */
147 		ACCW(PITCH, ((si->fbc.bytes_per_row / (si->engine.depth >> 3)) & 0x1FFF));
148 		break;
149 	}
150 
151 	/* disable plane write mask (needed for SDRAM): actual change needed to get it sent to RAM */
152 	ACCW(PLNWT,0x00000000);
153 	ACCW(PLNWT,0xffffffff);
154 
155 	if (si->ps.card_type >= G200) {
156 		/*DSTORG - location of active screen in framebuffer*/
157 		ACCW(DSTORG,((uint8*)si->fbc.frame_buffer) - ((uint8*)si->framebuffer));
158 
159 		/*SRCORG - init source address - same as dest*/
160 		ACCW(SRCORG,((uint8*)si->fbc.frame_buffer) - ((uint8*)si->framebuffer));
161 	}
162 
163 	/* init YDSTORG - apsed, if not inited, BitBlts may fails on <= G200 */
164 	si->engine.src_dst = 0;
165 	ACCW(YDSTORG, si->engine.src_dst);
166 
167 	/* <= G100 uses this register as SRCORG/DSTORG replacement, but
168 	 * MIL 1/2 does not need framebuffer space for the hardcursor! */
169 	if ((si->ps.card_type == G100) && (si->settings.hardcursor))
170 	{
171 		switch (si->dm.space)
172 		{
173 			case B_CMAP8:
174 				si->engine.src_dst = 1024 / 1;
175 				break;
176 			case B_RGB15_LITTLE:
177 			case B_RGB16_LITTLE:
178 				si->engine.src_dst = 1024 / 2;
179 				break;
180 			case B_RGB32_LITTLE:
181 				si->engine.src_dst =  1024 / 4;
182 				break;
183 			default:
184 				LOG(8,("ACC: G100 hardcursor not supported for current colorspace\n"));
185 				return B_ERROR;
186 		}
187 	}
188 	ACCW(YDSTORG, si->engine.src_dst);
189 
190 	/* clipping */
191 	/* i.e. highest and lowest X pixel adresses */
192 	ACCW(CXBNDRY,(((si->fbc.bytes_per_row / (si->engine.depth >> 3)) - 1) << 16) | (0));
193 
194 	/* Y pixel addresses must be linear */
195 	/* lowest adress */
196 	ACCW(YTOP, 0 + si->engine.src_dst);
197 	/* highest adress */
198 	ACCW(YBOT,((si->dm.virtual_height - 1) *
199 		(si->fbc.bytes_per_row / (si->engine.depth >> 3))) + si->engine.src_dst);
200 
201 	return B_OK;
202 }
203 
204 
/*
	note:
	the acceleration 'top-level' routines were moved into the engine itself:
	it is costly to call the engine for every single function within a loop!
	(measured with BeRoMeter 1.2.6: up to 15% speed increase on all CPUs.)
*/
211 
212 /* screen to screen blit - i.e. move windows around.
213  * Engine function bitblit, paragraph 4.5.7.2 */
214 void SCREEN_TO_SCREEN_BLIT(engine_token *et, blit_params *list, uint32 count)
215 {
216 	uint32 t_start,t_end,offset;
217 	uint32 b_start,b_end;
218 	int i = 0;
219 
220 	/* calc offset 'per line' */
221 	offset = (si->fbc.bytes_per_row / (si->engine.depth >> 3));
222 
223 	while (count--)
224 	{
225 		/* find where the top and bottom are */
226 		t_end = t_start =
227 			list[i].src_left + (offset * list[i].src_top) + si->engine.src_dst;
228 		t_end += list[i].width;
229 
230 		b_end = b_start =
231 			list[i].src_left + (offset * (list[i].src_top + list[i].height)) + si->engine.src_dst;
232 		b_end += list[i].width;
233 
234 		/* sgnzero bit _must_ be '0' before accessing SGN! */
235 		ACCW(DWGCTL, 0x00000000);
236 
237 		/*find which quadrant */
238 		switch((list[i].dest_top > list[i].src_top) | ((list[i].dest_left > list[i].src_left) << 1))
239 		{
240 		case 0: /*L->R,down*/
241 			ACCW(SGN, 0);
242 			ACCW(AR3, t_start);
243 			ACCW(AR0, t_end);
244 			ACCW(AR5, offset);
245 			ACCW_YDSTLEN(list[i].dest_top, list[i].height + 1);
246 			break;
247 		case 1: /*L->R,up*/
248 			ACCW(SGN, 4);
249 			ACCW(AR3, b_start);
250 			ACCW(AR0, b_end);
251 			ACCW(AR5, -offset);
252 			ACCW_YDSTLEN(list[i].dest_top + list[i].height, list[i].height + 1);
253 			break;
254 		case 2: /*R->L,down*/
255 			ACCW(SGN, 1);
256 			ACCW(AR3, t_end);
257 			ACCW(AR0, t_start);
258 			ACCW(AR5, offset);
259 			ACCW_YDSTLEN(list[i].dest_top, list[i].height + 1);
260 			break;
261 		case 3: /*R->L,up*/
262 			ACCW(SGN, 5);
263 			ACCW(AR3, b_end);
264 			ACCW(AR0, b_start);
265 			ACCW(AR5, -offset);
266 			ACCW_YDSTLEN(list[i].dest_top + list[i].height, list[i].height + 1);
267 			break;
268 		}
269 		ACCW(FXBNDRY,((list[i].dest_left + list[i].width) << 16) | list[i].dest_left);
270 
271 		/* start the blit */
272 		ACCGO(DWGCTL, 0x040c4018); // atype RSTR
273 		i++;
274 	}
275 }
276 
277 /* screen to screen tranparent blit - not sure what uses this.
278  * Engine function bitblit, paragraph 4.5.7.2 */
279 //WARNING:
280 //yet untested function!!
281 void SCREEN_TO_SCREEN_TRANSPARENT_BLIT(engine_token *et, uint32 transparent_colour, blit_params *list, uint32 count)
282 {
283 	uint32 t_start,t_end,offset;
284 	uint32 b_start,b_end;
285 	int i = 0;
286 
287 	/* calc offset 'per line' */
288 	offset = (si->fbc.bytes_per_row / (si->engine.depth >> 3));
289 
290 	while (count--)
291 	{
292 		/* find where the top and bottom are */
293 		t_end = t_start =
294 			list[i].src_left + (offset * list[i].src_top) + si->engine.src_dst;
295 		t_end += list[i].width;
296 
297 		b_end = b_start =
298 			list[i].src_left + (offset * (list[i].src_top + list[i].height)) + si->engine.src_dst;
299 		b_end += list[i].width;
300 
301 		/* sgnzero bit _must_ be '0' before accessing SGN! */
302 		ACCW(DWGCTL, 0x00000000);
303 
304 		/*find which quadrant */
305 		switch((list[i].dest_top > list[i].src_top) | ((list[i].dest_left > list[i].src_left) << 1))
306 		{
307 		case 0: /*L->R,down*/
308 			ACCW(SGN, 0);
309 			ACCW(AR3, t_start);
310 			ACCW(AR0, t_end);
311 			ACCW(AR5, offset);
312 			ACCW_YDSTLEN(list[i].dest_top, list[i].height + 1);
313 			break;
314 		case 1: /*L->R,up*/
315 			ACCW(SGN, 4);
316 			ACCW(AR3, b_start);
317 			ACCW(AR0, b_end);
318 			ACCW(AR5, -offset);
319 			ACCW_YDSTLEN(list[i].dest_top + list[i].height, list[i].height + 1);
320 			break;
321 		case 2: /*R->L,down*/
322 			ACCW(SGN, 1);
323 			ACCW(AR3, t_end);
324 			ACCW(AR0, t_start);
325 			ACCW(AR5, offset);
326 			ACCW_YDSTLEN(list[i].dest_top, list[i].height + 1);
327 			break;
328 		case 3: /*R->L,up*/
329 			ACCW(SGN, 5);
330 			ACCW(AR3, b_end);
331 			ACCW(AR0, b_start);
332 			ACCW(AR5, -offset);
333 			ACCW_YDSTLEN(list[i].dest_top + list[i].height, list[i].height + 1);
334 			break;
335 		}
336 		ACCW(FXBNDRY,((list[i].dest_left + list[i].width) << 16) | list[i].dest_left);
337 
338 		/* start the blit */
339 		ACCW(FCOL, transparent_colour);
340 		ACCW(BCOL, 0xffffffff);
341 		ACCGO(DWGCTL, 0x440c4018); // atype RSTR
342 		i++;
343 	}
344 }
345 
346 /* screen to screen scaled filtered blit - i.e. scale video in memory.
347  * Engine function texture mapping for video, paragraphs 4.5.5.5 - 4.5.5.9 */
348 //fixme: implement...
349 void SCREEN_TO_SCREEN_SCALED_FILTERED_BLIT(engine_token *et, scaled_blit_params *list, uint32 count)
350 {
351 	int i = 0;
352 
353 	while (count--)
354 	{
355 /*
356 			list[i].src_left,
357 			list[i].src_top,
358 			list[i].src_width,
359 			list[i].src_height,
360 			list[i].dest_left,
361 			list[i].dest_top,
362 			list[i].dest_width,
363 			list[i].dest_height
364 */
365 		i++;
366 	}
367 }
368 
369 /* rectangle fill.
370  * Engine function rectangle_fill: paragraph 4.5.5.2 */
371 void FILL_RECTANGLE(engine_token *et, uint32 colorIndex, fill_rect_params *list, uint32 count)
372 {
373 /*
374 	FXBNDRY - left and right coordinates    a
375 	YDSTLEN - y start and no of lines       a
376 	(or YDST and LEN)
377 	DWGCTL - atype must be RSTR or BLK      a
378 	FCOL - foreground colour                a
379 */
380 	int i = 0;
381 
382 	while (count--)
383 	{
384 		ACCW(FXBNDRY, (((list[i].right + 1) << 16) | list[i].left));
385 		ACCW_YDSTLEN(list[i].top, ((list[i].bottom - list[i].top) + 1));
386 		ACCW(FCOL, colorIndex);
387 
388 		/* start the fill */
389 //acc fixme: checkout blockmode constraints for G100+ (mil: nc?): also add blockmode
390 //	         for other functions, and use fastblt on MIL1/2 if possible...
391 //or is CMAP8 contraint a non-blockmode contraint? (linearisation problem maybe?)
392 		if ((si->dm.space == B_CMAP8) || si->ps.sdram)
393 		{
394 			ACCGO(DWGCTL, 0x400c7814); // atype RSTR
395 		}
396 		else
397 		{
398 			ACCGO(DWGCTL, 0x400c7844); // atype BLK
399 		}
400 		i++;
401 	}
402 }
403 
404 /* horizontal span fill.
405  * Engine function rectangle_fill: paragraph 4.5.5.2 */
406 //(uint32 xs,uint32 xe,uint32 ys,uint32 yl,uint32 col)
407 void FILL_SPAN(engine_token *et, uint32 colorIndex, uint16 *list, uint32 count)
408 {
409 /*
410 	FXBNDRY - left and right coordinates    a
411 	YDSTLEN - y start and no of lines       a
412 	(or YDST and LEN)
413 	DWGCTL - atype must be RSTR or BLK      a
414 	FCOL - foreground colour                a
415 */
416 	int i = 0;
417 
418 	while (count--)
419 	{
420 		ACCW(FXBNDRY, ((list[i + 2] + 1) << 16)| list[i + 1]);
421 		ACCW_YDSTLEN(list[i], 1);
422 		ACCW(FCOL, colorIndex);
423 
424 		/* start the fill */
425 //acc fixme: checkout blockmode constraints for G100+ (mil: nc?): also add blockmode
426 //	         for other functions, and use fastblt on MIL1/2 if possible...
427 //or is CMAP8 contraint a non-blockmode contraint? (linearisation problem maybe?)
428 		if ((si->dm.space == B_CMAP8) || si->ps.sdram)
429 		{
430 			ACCGO(DWGCTL, 0x400c7814); // atype RSTR
431 		}
432 		else
433 		{
434 			ACCGO(DWGCTL, 0x400c7844); // atype BLK
435 		}
436 		i += 3;
437 	}
438 }
439 
440 /* rectangle invert.
441  * Engine function rectangle_fill: paragraph 4.5.5.2 */
442 void INVERT_RECTANGLE(engine_token *et, fill_rect_params *list, uint32 count)
443 {
444 /*
445 	FXBNDRY - left and right coordinates    a
446 	YDSTLEN - y start and no of lines       a
447 	(or YDST and LEN)
448 	DWGCTL - atype must be RSTR or BLK      a
449 	FCOL - foreground colour                a
450 */
451 	int i = 0;
452 //	uint32 * dma;
453 //	uint32 pci;
454 
455 	while (count--)
456 	{
457 		ACCW(FXBNDRY, (((list[i].right) + 1) << 16) | list[i].left);
458 		ACCW_YDSTLEN(list[i].top, ((list[i].bottom - list[i].top) + 1));
459 		ACCW(FCOL, 0); /* color */
460 
461 		/* start the invert (top nibble is c is clipping enabled) */
462 		ACCGO(DWGCTL, 0x40057814); // atype RSTR
463 
464 		/* pseudo_dma version! */
465 //		MGAACC_DWGCTL      =0x1c00,
466 //		MGAACC_FCOL        =0x1c24,
467 //		MGAACC_FXBNDRY     =0x1c84,
468 //		MGAACC_YDSTLEN     =0x1c88,
469 //
470 //		40,09,21,22 (ordered as registers)
471 
472 //		dma = (uint32 *)si->pseudo_dma;
473 //		*dma++= 0x40092221;
474 //		*dma++= (((list[i].right) + 1) << 16) | list[i].left;
475 //		*dma++= (list[i].top << 16) | ((list[i].bottom - list[i].top) + 1);
476 //		*dma++= 0; /* color */
477 //		*dma++= 0x40057814;
478 
479 		/* real dma version! */
480 //		dma = (vuint32 *)si->dma_buffer;
481 //		*dma++= 0x40092221; /* indices */
482 //		*dma++= (((list[i].right) + 1) << 16) | list[i].left;
483 //		*dma++= (list[i].top << 16) | ((list[i].bottom - list[i].top) + 1);
484 //		*dma++= 0; /* color */
485 //		*dma++= 0x40057814;
486 
487 //		pci = si->dma_buffer_pci;
488 //		ACCW(PRIMADDRESS, (pci));
489 //		ACCW(PRIMEND, (20 + pci));
490 
491 //		delay(100);
492 
493 		i++;
494 	}
495 }
496