xref: /haiku/src/add-ons/accelerants/nvidia/engine/nv_acc_dma.c (revision f2b4344867e97c3f4e742a1b4a15e6879644601a)
1 /* NV Acceleration functions */
2 
3 /* Author:
4    Rudolf Cornelissen 8/2003-6/2010.
5 
6    This code was possible thanks to:
7     - the Linux XFree86 NV driver,
8     - the Linux UtahGLX 3D driver.
9 */
10 
11 #define MODULE_BIT 0x00080000
12 
13 #include "nv_std.h"
14 
15 /*acceleration notes*/
16 
17 /*functions Be's app_server uses:
18 fill span (horizontal only)
19 fill rectangle (these 2 are very similar)
20 invert rectangle
21 blit
22 */
23 
/* Forward declarations of file-local (static) helpers used by the code below.
 * Their definitions are not part of this excerpt (presumably they follow
 * further down in this file - confirm). */
24 static void nv_init_for_3D_dma(void);
25 static void nv_start_dma(void);
26 static status_t nv_acc_fifofree_dma(uint16 cmd_size);
27 static void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size);
28 static void nv_acc_set_ch_dma(uint16 ch, uint32 handle);
29 
30 /* used to track engine DMA stalls:
 * nv_acc_wait_idle_dma() increments it each time its DMA wait times out
 * (the counter saturates at 3, after which that function no longer waits for
 * the DMA get pointer); nv_acc_init_dma() resets it to zero. */
31 static uint8 err;
32 
33 /* Wait until the acceleration engine is completely idle.
 *
 * Step 1: poll the FIFO's DMAGET register until it equals the driver's DMA put
 * pointer (si->engine.dma.put << 2), calling snooze(100) between polls. This
 * wait gives up after 10000 polls, and is no longer performed once the
 * file-level 'err' counter has reached 3 (the register is then read just once).
 * Step 2: poll ACCR(STATUS), again with snooze(100) between polls, until it
 * reads zero.
 *
 * Returns B_OK in all cases, also when step 1 timed out.
 *
 * NOTE(review): step 2 has no timeout of its own; whether STATUS is guaranteed
 * to clear on a stalled engine cannot be told from this file - confirm. */
34 status_t nv_acc_wait_idle_dma()
35 {
36 	/* we'd better check for timeouts on the DMA engine as it's theoretically
37 	 * breakable by malfunctioning software */
38 	uint16 cnt = 0;
39 
40 	/* wait until all upcoming commands are in execution at least. Do this until
41 	 * we hit a timeout; don't wait at all if we already timed out three times:
42 	 * if DMA stalls, we have to forget about it altogether at some point, or
43 	 * the system will almost come to a complete halt.. */
44 	/* note:
45 	 * it doesn't matter which FIFO channel's DMA registers we access, they are in
46 	 * fact all the same set. It also doesn't matter if the channel was assigned a
47 	 * command or not. */
48 	while ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET) != (si->engine.dma.put << 2)) &&
49 			(cnt < 10000) && (err < 3))
50 	{
51 		/* snooze a bit so I do not hammer the bus */
52 		snooze (100);
53 		cnt++;
54 	}
55 
56 	/* log timeout if we had one, and count it (err saturates at 3) */
57 	if (cnt == 10000)
58 	{
59 		if (err < 3) err++;
60 		LOG(4,("ACC_DMA: wait_idle; DMA timeout #%d, engine trouble!\n", err));
61 	}
62 
63 	/* wait until execution completed: STATUS must read zero (no timeout here) */
64 	while (ACCR(STATUS))
65 	{
66 		/* snooze a bit so I do not hammer the bus */
67 		snooze (100);
68 	}
69 
70 	/* a DMA timeout above is only logged, it is not reported to the caller */
71 	return B_OK;
72 }
72 
73 /* AFAIK this must be done for every new screenmode.
74  * Engine required init. */
75 status_t nv_acc_init_dma()
76 {
77 	uint32 cnt, tmp;
78 	uint32 surf_depth, cmd_depth;
79 	/* reset the engine DMA stalls counter */
80 	err = 0;
81 
82 	/* a hanging engine only recovers from a complete power-down/power-up cycle */
83 	NV_REG32(NV32_PWRUPCTRL) = 0xffff00ff;
84 	snooze(1000);
85 	NV_REG32(NV32_PWRUPCTRL) = 0xffffffff;
86 
87 	/* don't try this on NV20 and later.. */
88 	/* note:
89 	 * the specific register that's responsible for the speedfix on NV18 is
90 	 * $00400ed8: bit 6 needs to be zero for fastest rendering (confirmed). */
91 	/* note also:
92 	 * on NV28 the following ranges could be reset (confirmed):
93 	 * $00400000 upto/incl. $004002fc;
94 	 * $00400400 upto/incl. $004017fc;
95 	 * $0040180c upto/incl. $00401948;
96 	 * $00401994 upto/incl. $00401a80;
97 	 * $00401a94 upto/incl. $00401ffc.
98 	 * The intermediate ranges hang the engine upon resetting. */
99 	if (si->ps.card_arch < NV20A)
100 	{
101 		/* actively reset the PGRAPH registerset (acceleration engine) */
102 		for (cnt = 0x00400000; cnt < 0x00402000; cnt +=4)
103 		{
104 			NV_REG32(cnt) = 0x00000000;
105 		}
106 	}
107 
108 	/* setup PTIMER: */
109 	LOG(4,("ACC_DMA: timer numerator $%08x, denominator $%08x\n", ACCR(PT_NUMERATOR), ACCR(PT_DENOMINATR)));
110 
111 	/* The NV28 BIOS programs PTIMER like this (see coldstarting in nv_info.c) */
112 	//ACCW(PT_NUMERATOR, (si->ps.std_engine_clock * 20));
113 	//ACCW(PT_DENOMINATR, 0x00000271);
114 	/* Nouveau (march 2009) mentions something like: writing 8 and 3 to these regs breaks the timings
115 	 * on the LVDS hardware sequencing microcode. A correct solution involves calculations with the GPU PLL. */
116 
117 	/* For now use BIOS pre-programmed values if there */
118 	if (!ACCR(PT_NUMERATOR) || !ACCR(PT_DENOMINATR)) {
119 		/* set timer numerator to 8 (in b0-15) */
120 		ACCW(PT_NUMERATOR, 0x00000008);
121 		/* set timer denominator to 3 (in b0-15) */
122 		ACCW(PT_DENOMINATR, 0x00000003);
123 	}
124 
125 	/* disable timer-alarm INT requests (b0) */
126 	ACCW(PT_INTEN, 0x00000000);
127 	/* reset timer-alarm INT status bit (b0) */
128 	ACCW(PT_INTSTAT, 0xffffffff);
129 
130 	/* enable PRAMIN write access on pre NV10 before programming it! */
131 	if (si->ps.card_arch == NV04A)
132 	{
133 		/* set framebuffer config: type = notiling, PRAMIN write access enabled */
134 		NV_REG32(NV32_PFB_CONFIG_0) = 0x00001114;
135 	}
136 	else
137 	{
138 		/* setup acc engine 'source' tile adressranges */
139 		if ((si->ps.card_type <= NV40) || (si->ps.card_type == NV45))
140 		{
141 			ACCW(NV10_FBTIL0AD, 0);
142 			ACCW(NV10_FBTIL1AD, 0);
143 			ACCW(NV10_FBTIL2AD, 0);
144 			ACCW(NV10_FBTIL3AD, 0);
145 			ACCW(NV10_FBTIL4AD, 0);
146 			ACCW(NV10_FBTIL5AD, 0);
147 			ACCW(NV10_FBTIL6AD, 0);
148 			ACCW(NV10_FBTIL7AD, 0);
149 			ACCW(NV10_FBTIL0ED, (si->ps.memory_size - 1));
150 			ACCW(NV10_FBTIL1ED, (si->ps.memory_size - 1));
151 			ACCW(NV10_FBTIL2ED, (si->ps.memory_size - 1));
152 			ACCW(NV10_FBTIL3ED, (si->ps.memory_size - 1));
153 			ACCW(NV10_FBTIL4ED, (si->ps.memory_size - 1));
154 			ACCW(NV10_FBTIL5ED, (si->ps.memory_size - 1));
155 			ACCW(NV10_FBTIL6ED, (si->ps.memory_size - 1));
156 			ACCW(NV10_FBTIL7ED, (si->ps.memory_size - 1));
157 		}
158 		else
159 		{
160 			/* NV41, 43, 44, G70 and up */
161 			ACCW(NV41_FBTIL0AD, 0);
162 			ACCW(NV41_FBTIL1AD, 0);
163 			ACCW(NV41_FBTIL2AD, 0);
164 			ACCW(NV41_FBTIL3AD, 0);
165 			ACCW(NV41_FBTIL4AD, 0);
166 			ACCW(NV41_FBTIL5AD, 0);
167 			ACCW(NV41_FBTIL6AD, 0);
168 			ACCW(NV41_FBTIL7AD, 0);
169 			ACCW(NV41_FBTIL8AD, 0);
170 			ACCW(NV41_FBTIL9AD, 0);
171 			ACCW(NV41_FBTILAAD, 0);
172 			ACCW(NV41_FBTILBAD, 0);
173 			ACCW(NV41_FBTIL0ED, (si->ps.memory_size - 1));
174 			ACCW(NV41_FBTIL1ED, (si->ps.memory_size - 1));
175 			ACCW(NV41_FBTIL2ED, (si->ps.memory_size - 1));
176 			ACCW(NV41_FBTIL3ED, (si->ps.memory_size - 1));
177 			ACCW(NV41_FBTIL4ED, (si->ps.memory_size - 1));
178 			ACCW(NV41_FBTIL5ED, (si->ps.memory_size - 1));
179 			ACCW(NV41_FBTIL6ED, (si->ps.memory_size - 1));
180 			ACCW(NV41_FBTIL7ED, (si->ps.memory_size - 1));
181 			ACCW(NV41_FBTIL8ED, (si->ps.memory_size - 1));
182 			ACCW(NV41_FBTIL9ED, (si->ps.memory_size - 1));
183 			ACCW(NV41_FBTILAED, (si->ps.memory_size - 1));
184 			ACCW(NV41_FBTILBED, (si->ps.memory_size - 1));
185 
186 			if (si->ps.card_type >= G70)
187 			{
188 				ACCW(G70_FBTILCAD, 0);
189 				ACCW(G70_FBTILDAD, 0);
190 				ACCW(G70_FBTILEAD, 0);
191 				ACCW(G70_FBTILCED, (si->ps.memory_size - 1));
192 				ACCW(G70_FBTILDED, (si->ps.memory_size - 1));
193 				ACCW(G70_FBTILEED, (si->ps.memory_size - 1));
194 			}
195 		}
196 	}
197 
198 	/*** PRAMIN ***/
199 	/* first clear the entire RAMHT (hash-table) space to a defined state. It turns
200 	 * out at least NV11 will keep the previously programmed handles over resets and
201 	 * power-outages upto about 15 seconds!! Faulty entries might well hang the
202 	 * engine (confirmed on NV11).
203 	 * Note:
204 	 * this behaviour is not very strange: even very old DRAM chips are known to be
205 	 * able to do this, even though you should refresh them every few milliseconds or
206 	 * so. (Large memory cell capacitors, though different cells vary a lot in their
207 	 * capacity.)
208 	 * Of course data validity is not certain by a long shot over this large
209 	 * amount of time.. */
210 	for(cnt = 0; cnt < 0x0400; cnt++)
211 		NV_REG32(NVACC_HT_HANDL_00 + (cnt << 2)) = 0;
212 	/* RAMHT (hash-table) space SETUP FIFO HANDLES */
213 	/* note:
214 	 * 'instance' tells you where the engine command is stored in 'PR_CTXx_x' sets
215 	 * below: instance being b4-19 with baseadress NV_PRAMIN_CTX_0 (0x00700000).
216 	 * That command is linked to the handle noted here. This handle is then used to
217 	 * tell the FIFO to which engine command it is connected!
218 	 * (CTX registers are actually a sort of RAM space.) */
219 	if (si->ps.card_arch >= NV40A)
220 	{
221 		/* (first set) */
222 		ACCW(HT_HANDL_00, (0x80000000 | NV10_CONTEXT_SURFACES_2D)); /* 32bit handle (not used) */
223 		ACCW(HT_VALUE_00, 0x0010114c); /* instance $114c, engine = acc engine, CHID = $00 */
224 
225 		ACCW(HT_HANDL_01, (0x80000000 | NV_IMAGE_BLIT)); /* 32bit handle */
226 		ACCW(HT_VALUE_01, 0x00101148); /* instance $1148, engine = acc engine, CHID = $00 */
227 
228 		ACCW(HT_HANDL_02, (0x80000000 | NV4_GDI_RECTANGLE_TEXT)); /* 32bit handle */
229 		ACCW(HT_VALUE_02, 0x0010114a); /* instance $114a, engine = acc engine, CHID = $00 */
230 
231 		/* (second set) */
232 		ACCW(HT_HANDL_10, (0x80000000 | NV_ROP5_SOLID)); /* 32bit handle */
233 		ACCW(HT_VALUE_10, 0x00101142); /* instance $1142, engine = acc engine, CHID = $00 */
234 
235 		ACCW(HT_HANDL_11, (0x80000000 | NV_IMAGE_BLACK_RECTANGLE)); /* 32bit handle */
236 		ACCW(HT_VALUE_11, 0x00101144); /* instance $1144, engine = acc engine, CHID = $00 */
237 
238 		ACCW(HT_HANDL_12, (0x80000000 | NV_IMAGE_PATTERN)); /* 32bit handle */
239 		ACCW(HT_VALUE_12, 0x00101146); /* instance $1146, engine = acc engine, CHID = $00 */
240 
241 		ACCW(HT_HANDL_13, (0x80000000 | NV_SCALED_IMAGE_FROM_MEMORY)); /* 32bit handle */
242 		ACCW(HT_VALUE_13, 0x0010114e); /* instance $114e, engine = acc engine, CHID = $00 */
243 	}
244 	else
245 	{
246 		/* (first set) */
247 		ACCW(HT_HANDL_00, (0x80000000 | NV4_SURFACE)); /* 32bit handle */
248 		ACCW(HT_VALUE_00, 0x80011145); /* instance $1145, engine = acc engine, CHID = $00 */
249 
250 		ACCW(HT_HANDL_01, (0x80000000 | NV_IMAGE_BLIT)); /* 32bit handle */
251 		ACCW(HT_VALUE_01, 0x80011146); /* instance $1146, engine = acc engine, CHID = $00 */
252 
253 		ACCW(HT_HANDL_02, (0x80000000 | NV4_GDI_RECTANGLE_TEXT)); /* 32bit handle */
254 		ACCW(HT_VALUE_02, 0x80011147); /* instance $1147, engine = acc engine, CHID = $00 */
255 
256 		ACCW(HT_HANDL_03, (0x80000000 | NV4_CONTEXT_SURFACES_ARGB_ZS)); /* 32bit handle (3D) */
257 		ACCW(HT_VALUE_03, 0x80011148); /* instance $1148, engine = acc engine, CHID = $00 */
258 
259 		/* NV4_ and NV10_DX5_TEXTURE_TRIANGLE should be identical */
260 		ACCW(HT_HANDL_04, (0x80000000 | NV4_DX5_TEXTURE_TRIANGLE)); /* 32bit handle (3D) */
261 		ACCW(HT_VALUE_04, 0x80011149); /* instance $1149, engine = acc engine, CHID = $00 */
262 
263 		/* NV4_ and NV10_DX6_MULTI_TEXTURE_TRIANGLE should be identical */
264 		ACCW(HT_HANDL_05, (0x80000000 | NV4_DX6_MULTI_TEXTURE_TRIANGLE)); /* 32bit handle (not used) */
265 		ACCW(HT_VALUE_05, 0x8001114a); /* instance $114a, engine = acc engine, CHID = $00 */
266 
267 		ACCW(HT_HANDL_06, (0x80000000 | NV1_RENDER_SOLID_LIN)); /* 32bit handle (not used) */
268 		ACCW(HT_VALUE_06, 0x8001114c); /* instance $114c, engine = acc engine, CHID = $00 */
269 
270 		/* (second set) */
271 		ACCW(HT_HANDL_10, (0x80000000 | NV_ROP5_SOLID)); /* 32bit handle */
272 		ACCW(HT_VALUE_10, 0x80011142); /* instance $1142, engine = acc engine, CHID = $00 */
273 
274 		ACCW(HT_HANDL_11, (0x80000000 | NV_IMAGE_BLACK_RECTANGLE)); /* 32bit handle */
275 		ACCW(HT_VALUE_11, 0x80011143); /* instance $1143, engine = acc engine, CHID = $00 */
276 
277 		ACCW(HT_HANDL_12, (0x80000000 | NV_IMAGE_PATTERN)); /* 32bit handle */
278 		ACCW(HT_VALUE_12, 0x80011144); /* instance $1144, engine = acc engine, CHID = $00 */
279 
280 		ACCW(HT_HANDL_13, (0x80000000 | NV_SCALED_IMAGE_FROM_MEMORY)); /* 32bit handle */
281 		ACCW(HT_VALUE_13, 0x8001114b); /* instance $114b, engine = acc engine, CHID = $00 */
282 
283 		//2007 3D tests..
284 		if (si->ps.card_type == NV15)
285 		{
286 			ACCW(HT_HANDL_14, (0x80000000 | NV_TCL_PRIMITIVE_3D)); /* 32bit handle */
287 			ACCW(HT_VALUE_14, 0x8001114d); /* instance $114d, engine = acc engine, CHID = $00 */
288 		}
289 
290 	}
291 
292 	/* program CTX registers: CTX1 is mostly done later (colorspace dependant) */
293 	/* note:
294 	 * CTX determines which HT handles point to what engine commands. */
295 	/* note also:
296 	 * CTX registers are in fact in the same GPU internal RAM space as the engine's
297 	 * hashtable. This means that stuff programmed in here also survives resets and
298 	 * power-outages! (confirmed NV11) */
299 	if (si->ps.card_arch >= NV40A)
300 	{
301 		/* setup a DMA define for use by command defines below. */
302 		ACCW(PR_CTX0_R, 0x00003000); /* DMA page table present and of linear type;
303 									  * DMA target node is NVM (non-volatile memory?)
304 									  * (instead of doing PCI or AGP transfers) */
305 		ACCW(PR_CTX1_R, (si->ps.memory_size - 1)); /* DMA limit: size is all cardRAM */
306 		ACCW(PR_CTX2_R, ((0x00000000 & 0xfffff000) | 0x00000002));
307 									 /* DMA access type is READ_AND_WRITE;
308 									  * memory starts at start of cardRAM (b12-31):
309 									  * It's adress needs to be at a 4kb boundary! */
310 		ACCW(PR_CTX3_R, 0x00000002); /* unknown (looks like this is rubbish/not needed?) */
311 		/* setup set '0' for cmd NV_ROP5_SOLID */
312 		ACCW(PR_CTX0_0, 0x02080043); /* NVclass $043, patchcfg ROP_AND, nv10+: little endian */
313 		ACCW(PR_CTX1_0, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
314 		ACCW(PR_CTX2_0, 0x00000000); /* DMA0 and DMA1 instance invalid */
315 		ACCW(PR_CTX3_0, 0x00000000); /* method traps disabled */
316 		ACCW(PR_CTX0_1, 0x00000000); /* extra */
317 		ACCW(PR_CTX1_1, 0x00000000); /* extra */
318 		/* setup set '1' for cmd NV_IMAGE_BLACK_RECTANGLE */
319 		ACCW(PR_CTX0_2, 0x02080019); /* NVclass $019, patchcfg ROP_AND, nv10+: little endian */
320 		ACCW(PR_CTX1_2, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
321 		ACCW(PR_CTX2_2, 0x00000000); /* DMA0 and DMA1 instance invalid */
322 		ACCW(PR_CTX3_2, 0x00000000); /* method traps disabled */
323 		ACCW(PR_CTX0_3, 0x00000000); /* extra */
324 		ACCW(PR_CTX1_3, 0x00000000); /* extra */
325 		/* setup set '2' for cmd NV_IMAGE_PATTERN */
326 		ACCW(PR_CTX0_4, 0x02080018); /* NVclass $018, patchcfg ROP_AND, nv10+: little endian */
327 		ACCW(PR_CTX1_4, 0x02000000); /* colorspace not set, notify instance is $0200 (b16-31) */
328 		ACCW(PR_CTX2_4, 0x00000000); /* DMA0 and DMA1 instance invalid */
329 		ACCW(PR_CTX3_4, 0x00000000); /* method traps disabled */
330 		ACCW(PR_CTX0_5, 0x00000000); /* extra */
331 		ACCW(PR_CTX1_5, 0x00000000); /* extra */
332 		/* setup set '4' for cmd NV12_IMAGE_BLIT */
333 		ACCW(PR_CTX0_6, 0x0208009f); /* NVclass $09f, patchcfg ROP_AND, nv10+: little endian */
334 		ACCW(PR_CTX1_6, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
335 		ACCW(PR_CTX2_6, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
336 		ACCW(PR_CTX3_6, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
337 		ACCW(PR_CTX0_7, 0x00000000); /* extra */
338 		ACCW(PR_CTX1_7, 0x00000000); /* extra */
339 		/* setup set '5' for cmd NV4_GDI_RECTANGLE_TEXT */
340 		ACCW(PR_CTX0_8, 0x0208004a); /* NVclass $04a, patchcfg ROP_AND, nv10+: little endian */
341 		ACCW(PR_CTX1_8, 0x02000000); /* colorspace not set, notify instance is $0200 (b16-31) */
342 		ACCW(PR_CTX2_8, 0x00000000); /* DMA0 and DMA1 instance invalid */
343 		ACCW(PR_CTX3_8, 0x00000000); /* method traps disabled */
344 		ACCW(PR_CTX0_9, 0x00000000); /* extra */
345 		ACCW(PR_CTX1_9, 0x00000000); /* extra */
346 		/* setup set '6' for cmd NV10_CONTEXT_SURFACES_2D */
347 		ACCW(PR_CTX0_A, 0x02080062); /* NVclass $062, nv10+: little endian */
348 		ACCW(PR_CTX1_A, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
349 		ACCW(PR_CTX2_A, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
350 		ACCW(PR_CTX3_A, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
351 		ACCW(PR_CTX0_B, 0x00000000); /* extra */
352 		ACCW(PR_CTX1_B, 0x00000000); /* extra */
353 		/* setup set '7' for cmd NV_SCALED_IMAGE_FROM_MEMORY */
354 		ACCW(PR_CTX0_C, 0x02080077); /* NVclass $077, nv10+: little endian */
355 		ACCW(PR_CTX1_C, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
356 		ACCW(PR_CTX2_C, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
357 		ACCW(PR_CTX3_C, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
358 		ACCW(PR_CTX0_D, 0x00000000); /* extra */
359 		ACCW(PR_CTX1_D, 0x00000000); /* extra */
360 		/* setup DMA set pointed at by PF_CACH1_DMAI */
361 		ACCW(PR_CTX0_E, 0x00003002); /* DMA page table present and of linear type;
362 									  * DMA class is $002 (b0-11);
363 									  * DMA target node is NVM (non-volatile memory?)
364 									  * (instead of doing PCI or AGP transfers) */
365 		ACCW(PR_CTX1_E, 0x00007fff); /* DMA limit: tablesize is 32k bytes */
366 		ACCW(PR_CTX2_E, (((si->ps.memory_size - 1) & 0xffff8000) | 0x00000002));
367 									 /* DMA access type is READ_AND_WRITE;
368 									  * table is located at end of cardRAM (b12-31):
369 									  * It's adress needs to be at a 4kb boundary! */
370 	}
371 	else
372 	{
373 		/* setup a DMA define for use by command defines below. */
374 		ACCW(PR_CTX0_R, 0x00003000); /* DMA page table present and of linear type;
375 									  * DMA target node is NVM (non-volatile memory?)
376 									  * (instead of doing PCI or AGP transfers) */
377 		ACCW(PR_CTX1_R, (si->ps.memory_size - 1)); /* DMA limit: size is all cardRAM */
378 		ACCW(PR_CTX2_R, ((0x00000000 & 0xfffff000) | 0x00000002));
379 									 /* DMA access type is READ_AND_WRITE;
380 									  * memory starts at start of cardRAM (b12-31):
381 									  * It's adress needs to be at a 4kb boundary! */
382 		ACCW(PR_CTX3_R, 0x00000002); /* unknown (looks like this is rubbish/not needed?) */
383 		/* setup set '0' for cmd NV_ROP5_SOLID */
384 		ACCW(PR_CTX0_0, 0x01008043); /* NVclass $043, patchcfg ROP_AND, nv10+: little endian */
385 		ACCW(PR_CTX1_0, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
386 		ACCW(PR_CTX2_0, 0x00000000); /* DMA0 and DMA1 instance invalid */
387 		ACCW(PR_CTX3_0, 0x00000000); /* method traps disabled */
388 		/* setup set '1' for cmd NV_IMAGE_BLACK_RECTANGLE */
389 		ACCW(PR_CTX0_1, 0x01008019); /* NVclass $019, patchcfg ROP_AND, nv10+: little endian */
390 		ACCW(PR_CTX1_1, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
391 		ACCW(PR_CTX2_1, 0x00000000); /* DMA0 and DMA1 instance invalid */
392 		ACCW(PR_CTX3_1, 0x00000000); /* method traps disabled */
393 		/* setup set '2' for cmd NV_IMAGE_PATTERN */
394 		ACCW(PR_CTX0_2, 0x01008018); /* NVclass $018, patchcfg ROP_AND, nv10+: little endian */
395 		ACCW(PR_CTX1_2, 0x00000002); /* colorspace not set, notify instance is $0200 (b16-31) */
396 		ACCW(PR_CTX2_2, 0x00000000); /* DMA0 and DMA1 instance invalid */
397 		ACCW(PR_CTX3_2, 0x00000000); /* method traps disabled */
398 		/* setup set '3' for ... */
399 		if(si->ps.card_arch >= NV10A)
400 		{
401 			/* ... cmd NV10_CONTEXT_SURFACES_2D */
402 			ACCW(PR_CTX0_3, 0x01008062); /* NVclass $062, nv10+: little endian */
403 		}
404 		else
405 		{
406 			/* ... cmd NV4_SURFACE */
407 			ACCW(PR_CTX0_3, 0x01008042); /* NVclass $042, nv10+: little endian */
408 		}
409 		ACCW(PR_CTX1_3, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
410 		ACCW(PR_CTX2_3, 0x11401140); /* DMA0 instance is $1140, DMA1 instance invalid */
411 		ACCW(PR_CTX3_3, 0x00000000); /* method trap 0 is $1140, trap 1 disabled */
412 		/* setup set '4' for ... */
413 		if (si->ps.card_type >= NV11)
414 		{
415 			/* ... cmd NV12_IMAGE_BLIT */
416 			ACCW(PR_CTX0_4, 0x0100809f); /* NVclass $09f, patchcfg ROP_AND, nv10+: little endian */
417 		}
418 		else
419 		{
420 			/* ... cmd NV_IMAGE_BLIT */
421 			ACCW(PR_CTX0_4, 0x0100805f); /* NVclass $05f, patchcfg ROP_AND, nv10+: little endian */
422 		}
423 		ACCW(PR_CTX1_4, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
424 		ACCW(PR_CTX2_4, 0x11401140); /* DMA0 instance is $1140, DMA1 instance invalid */
425 		ACCW(PR_CTX3_4, 0x00000000); /* method trap 0 is $1140, trap 1 disabled */
426 		/* setup set '5' for cmd NV4_GDI_RECTANGLE_TEXT */
427 		ACCW(PR_CTX0_5, 0x0100804a); /* NVclass $04a, patchcfg ROP_AND, nv10+: little endian */
428 		ACCW(PR_CTX1_5, 0x00000002); /* colorspace not set, notify instance is $0200 (b16-31) */
429 		ACCW(PR_CTX2_5, 0x00000000); /* DMA0 and DMA1 instance invalid */
430 		ACCW(PR_CTX3_5, 0x00000000); /* method traps disabled */
431 		/* setup set '6' ... */
432 		if (si->ps.card_arch >= NV10A)
433 		{
434 			/* ... for cmd NV10_CONTEXT_SURFACES_ARGB_ZS */
435 			ACCW(PR_CTX0_6, 0x00000093); /* NVclass $093, nv10+: little endian */
436 		}
437 		else
438 		{
439 			/* ... for cmd NV4_CONTEXT_SURFACES_ARGB_ZS */
440 			ACCW(PR_CTX0_6, 0x00000053); /* NVclass $053, nv10+: little endian */
441 		}
442 		ACCW(PR_CTX1_6, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
443 		ACCW(PR_CTX2_6, 0x11401140); /* DMA0, DMA1 instance = $1140 */
444 		ACCW(PR_CTX3_6, 0x00000000); /* method traps disabled */
445 		/* setup set '7' ... */
446 		if (si->ps.card_arch >= NV10A)
447 		{
448 			/* ... for cmd NV10_DX5_TEXTURE_TRIANGLE */
449 			ACCW(PR_CTX0_7, 0x0300a094); /* NVclass $094, patchcfg ROP_AND, userclip enable,
450 										  * context surface0 valid, nv10+: little endian */
451 		}
452 		else
453 		{
454 			/* ... for cmd NV4_DX5_TEXTURE_TRIANGLE */
455 			ACCW(PR_CTX0_7, 0x0300a054); /* NVclass $054, patchcfg ROP_AND, userclip enable,
456 										  * context surface0 valid */
457 		}
458 		ACCW(PR_CTX1_7, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
459 		ACCW(PR_CTX2_7, 0x11401140); /* DMA0, DMA1 instance = $1140 */
460 		ACCW(PR_CTX3_7, 0x00000000); /* method traps disabled */
461 		/* setup set '8' ... */
462 		if (si->ps.card_arch >= NV10A)
463 		{
464 			/* ... for cmd NV10_DX6_MULTI_TEXTURE_TRIANGLE (not used) */
465 			ACCW(PR_CTX0_8, 0x0300a095); /* NVclass $095, patchcfg ROP_AND, userclip enable,
466 										  * context surface0 valid, nv10+: little endian */
467 		}
468 		else
469 		{
470 			/* ... for cmd NV4_DX6_MULTI_TEXTURE_TRIANGLE (not used) */
471 			ACCW(PR_CTX0_8, 0x0300a055); /* NVclass $055, patchcfg ROP_AND, userclip enable,
472 										  * context surface0 valid */
473 		}
474 		ACCW(PR_CTX1_8, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
475 		ACCW(PR_CTX2_8, 0x11401140); /* DMA0, DMA1 instance = $1140 */
476 		ACCW(PR_CTX3_8, 0x00000000); /* method traps disabled */
477 		/* setup set '9' for cmd NV_SCALED_IMAGE_FROM_MEMORY */
478 		ACCW(PR_CTX0_9, 0x01018077); /* NVclass $077, patchcfg SRC_COPY,
479 									  * context surface0 valid, nv10+: little endian */
480 		ACCW(PR_CTX1_9, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
481 		ACCW(PR_CTX2_9, 0x11401140); /* DMA0, DMA1 instance = $1140 */
482 		ACCW(PR_CTX3_9, 0x00000000); /* method traps disabled */
483 		/* setup set 'A' for cmd NV1_RENDER_SOLID_LIN (not used) */
484 		ACCW(PR_CTX0_A, 0x0300a01c); /* NVclass $01c, patchcfg ROP_AND, userclip enable,
485 									  * context surface0 valid, nv10+: little endian */
486 		ACCW(PR_CTX1_A, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
487 		ACCW(PR_CTX2_A, 0x11401140); /* DMA0, DMA1 instance = $1140 */
488 		ACCW(PR_CTX3_A, 0x00000000); /* method traps disabled */
489 		//2007 3D tests..
490 		/* setup set 'B' ... */
491 		if (si->ps.card_type == NV15)
492 		{
493 			/* ... for cmd NV11_TCL_PRIMITIVE_3D */
494 			ACCW(PR_CTX0_B, 0x0300a096); /* NVclass $096, patchcfg ROP_AND, userclip enable,
495 										  * context surface0 valid, nv10+: little endian */
496 			ACCW(PR_CTX1_B, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
497 			ACCW(PR_CTX2_B, 0x11401140); /* DMA0, DMA1 instance = $1140 */
498 			ACCW(PR_CTX3_B, 0x00000000); /* method traps disabled */
499 		}
500 		/* setup DMA set pointed at by PF_CACH1_DMAI */
501 		if (si->engine.agp_mode)
502 		{
503 			/* DMA page table present and of linear type;
504 			 * DMA class is $002 (b0-11);
505 			 * DMA target node is AGP */
506 			ACCW(PR_CTX0_C, 0x00033002);
507 		}
508 		else
509 		{
510 			/* DMA page table present and of linear type;
511 			 * DMA class is $002 (b0-11);
512 			 * DMA target node is PCI */
513 			ACCW(PR_CTX0_C, 0x00023002);
514 		}
515 		ACCW(PR_CTX1_C, 0x000fffff); /* DMA limit: tablesize is 1M bytes */
516 		ACCW(PR_CTX2_C, (((uint32)((uint8 *)(si->dma_buffer_pci))) | 0x00000002));
517 									 /* DMA access type is READ_AND_WRITE;
518 									  * table is located in main system RAM (b12-31):
519 									  * It's adress needs to be at a 4kb boundary! */
520 
521 		/* set the 3D rendering functions colordepth via BPIXEL's 'depth 2' */
522 		/* note:
523 		 * setting a depth to 'invalid' (zero) makes the engine report
524 		 * ready with drawing 'immediately'. */
525 		//fixme: NV30A and above (probably) needs to be corrected...
526 		switch(si->dm.space)
527 		{
528 		case B_CMAP8:
529 			if (si->ps.card_arch < NV30A)
530 				/* set depth 2: $1 = Y8 */
531 				ACCW(BPIXEL, 0x00000100);
532 			else
533 				/* set depth 0-1: $1 = Y8, $2 = X1R5G5B5_Z1R5G5B5 */
534 				ACCW(BPIXEL, 0x00000021);
535 			break;
536 		case B_RGB15_LITTLE:
537 			if (si->ps.card_arch < NV30A)
538 				/* set depth 2: $4 = A1R5G5B5 */
539 				ACCW(BPIXEL, 0x00000400);
540 			else
541 				/* set depth 0-1: $2 = X1R5G5B5_Z1R5G5B5, $4 = A1R5G5B5 */
542 				ACCW(BPIXEL, 0x00000042);
543 			break;
544 		case B_RGB16_LITTLE:
545 			if (si->ps.card_arch < NV30A)
546 				/* set depth 2: $5 = R5G6B5 */
547 				ACCW(BPIXEL, 0x00000500);
548 			else
549 				/* set depth 0-1: $5 = R5G6B5, $a = X1A7R8G8B8_O1A7R8G8B8 */
550 				ACCW(BPIXEL, 0x000000a5);
551 			break;
552 		case B_RGB32_LITTLE:
553 		case B_RGBA32_LITTLE:
554 			if (si->ps.card_arch < NV30A)
555 				/* set depth 2: $c = A8R8G8B8 */
556 				ACCW(BPIXEL, 0x00000c00);
557 			else
558 				/* set depth 0-1: $7 = X8R8G8B8_Z8R8G8B8, $e = V8YB8U8YA8 */
559 				ACCW(BPIXEL, 0x000000e7);
560 			break;
561 		default:
562 			LOG(8,("ACC: init, invalid bit depth\n"));
563 			return B_ERROR;
564 		}
565 	}
566 
567 	if (si->ps.card_arch == NV04A)
568 	{
569 		/* do a explicit engine reset */
570 		ACCW(DEBUG0, 0x000001ff);
571 
572 		/* init some function blocks */
573 		/* DEBUG0, b20 and b21 should be high, this has a big influence on
574 		 * 3D rendering speed! (on all cards, confirmed) */
575 		ACCW(DEBUG0, 0x1230c000);
576 		/* DEBUG1, b19 = 1 increases 3D rendering speed on TNT2 (M64) a bit,
577 		 * TNT1 rendering speed stays the same (all cards confirmed) */
578 		ACCW(DEBUG1, 0x72191101);
579 		ACCW(DEBUG2, 0x11d5f071);
580 		ACCW(DEBUG3, 0x0004ff31);
581 		/* init OP methods */
582 		ACCW(DEBUG3, 0x4004ff31);
583 
584 		/* disable all acceleration engine INT reguests */
585 		ACCW(ACC_INTE, 0x00000000);
586 		/* reset all acceration engine INT status bits */
587 		ACCW(ACC_INTS, 0xffffffff);
588 		/* context control enabled */
589 		ACCW(NV04_CTX_CTRL, 0x10010100);
590 		/* all acceleration buffers, pitches and colors are valid */
591 		ACCW(NV04_ACC_STAT, 0xffffffff);
592 		/* enable acceleration engine command FIFO */
593 		ACCW(FIFO_EN, 0x00000001);
594 
595 		/* setup location of active screen in framebuffer */
596 		ACCW(OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
597 		ACCW(OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
598 		/* setup accesible card memory range */
599 		ACCW(BLIMIT0, (si->ps.memory_size - 1));
600 		ACCW(BLIMIT1, (si->ps.memory_size - 1));
601 
602 		/* pattern shape value = 8x8, 2 color */
603 		//fixme: not needed, unless the engine has a hardware fault (setting via cmd)!
604 		//ACCW(PAT_SHP, 0x00000000);
605 		/* Pgraph Beta AND value (fraction) b23-30 */
606 		ACCW(BETA_AND_VAL, 0xffffffff);
607 	}
608 	else
609 	{
610 		/* do a explicit engine reset */
611 		ACCW(DEBUG0, 0xffffffff);
612 		ACCW(DEBUG0, 0x00000000);
613 		/* disable all acceleration engine INT reguests */
614 		ACCW(ACC_INTE, 0x00000000);
615 		/* reset all acceration engine INT status bits */
616 		ACCW(ACC_INTS, 0xffffffff);
617 		/* context control enabled */
618 		ACCW(NV10_CTX_CTRL, 0x10010100);
619 		/* all acceleration buffers, pitches and colors are valid */
620 		ACCW(NV10_ACC_STAT, 0xffffffff);
621 		/* enable acceleration engine command FIFO */
622 		ACCW(FIFO_EN, 0x00000001);
623 		/* setup surface type:
624 		 * b1-0 = %01 = surface type is non-swizzle;
625 		 * this is needed to enable 3D on NV1x (confirmed) and maybe others? */
626 		ACCW(NV10_SURF_TYP, ((ACCR(NV10_SURF_TYP)) & 0x0007ff00));
627 		ACCW(NV10_SURF_TYP, ((ACCR(NV10_SURF_TYP)) | 0x00020101));
628 	}
629 
630 	if (si->ps.card_arch == NV10A)
631 	{
632 		/* init some function blocks */
633 		ACCW(DEBUG1, 0x00118700);
634 		/* DEBUG2 has a big influence on 3D speed for NV11 and NV15
635 		 * (confirmed b3 and b18 should both be '1' on both cards!)
636 		 * (b16 should also be '1', increases 3D speed on NV11 a bit more) */
637 		ACCW(DEBUG2, 0x24fd2ad9);
638 		ACCW(DEBUG3, 0x55de0030);
639 		/* NV10_DEBUG4 has a big influence on 3D speed for NV11, NV15 and NV18
640 		 * (confirmed b14 and b15 should both be '1' on these cards!)
641 		 * (confirmed b8 should be '0' on NV18 to prevent complete engine crash!) */
642 		ACCW(NV10_DEBUG4, 0x0000c000);
643 
644 		/* copy tile setup stuff from 'source' to acc engine */
645 		for (cnt = 0; cnt < 32; cnt++)
646 		{
647 			NV_REG32(NVACC_NV10_TIL0AD + (cnt << 2)) =
648 				NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
649 		}
650 
651 		/* setup location of active screen in framebuffer */
652 		ACCW(OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
653 		ACCW(OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
654 		/* setup accesible card memory range */
655 		ACCW(BLIMIT0, (si->ps.memory_size - 1));
656 		ACCW(BLIMIT1, (si->ps.memory_size - 1));
657 
658 		/* pattern shape value = 8x8, 2 color */
659 		//fixme: not needed, unless the engine has a hardware fault (setting via cmd)!
660 		//ACCW(PAT_SHP, 0x00000000);
661 		/* Pgraph Beta AND value (fraction) b23-30 */
662 		ACCW(BETA_AND_VAL, 0xffffffff);
663 	}
664 
665 	if (si->ps.card_arch >= NV20A)
666 	{
667 		switch (si->ps.card_arch)
668 		{
669 		case NV40A:
670 			/* init some function blocks */
671 			ACCW(DEBUG1, 0x401287c0);
672 			ACCW(DEBUG3, 0x60de8051);
673 			/* disable specific functions, but enable SETUP_SPARE2 register */
674 			ACCW(NV10_DEBUG4, 0x00008000);
675 			/* set limit_viol_pix_adress(?): more likely something unknown.. */
676 			ACCW(NV25_WHAT0, 0x00be3c5f);
677 
678 			/* setup some unknown serially accessed registers (?) */
679 			tmp = (NV_REG32(NV32_NV4X_WHAT0) & 0x000000ff);
680 			for (cnt = 0; (tmp && !(tmp & 0x00000001)); tmp >>= 1, cnt++);
681 			{
682 				ACCW(NV4X_WHAT2, cnt);
683 			}
684 
685 			/* unknown.. */
686 			switch (si->ps.card_type)
687 			{
688 			case NV40:
689 			case NV45:
690 			/* and NV48: but these are pgm'd as NV45 currently */
691 				ACCW(NV40_WHAT0, 0x83280fff);
692 				ACCW(NV40_WHAT1, 0x000000a0);
693 				ACCW(NV40_WHAT2, 0x0078e366);
694 				ACCW(NV40_WHAT3, 0x0000014c);
695 				break;
696 			case NV41:
697 			/* and ID == 0x012x: but no cards defined yet */
698 				ACCW(NV40P_WHAT0, 0x83280eff);
699 				ACCW(NV40P_WHAT1, 0x000000a0);
700 				ACCW(NV40P_WHAT2, 0x007596ff);
701 				ACCW(NV40P_WHAT3, 0x00000108);
702 				break;
703 			case NV43:
704 				ACCW(NV40P_WHAT0, 0x83280eff);
705 				ACCW(NV40P_WHAT1, 0x000000a0);
706 				ACCW(NV40P_WHAT2, 0x0072cb77);
707 				ACCW(NV40P_WHAT3, 0x00000108);
708 				break;
709 			case NV44:
710 			case G72:
711 				ACCW(NV40P_WHAT0, 0x83280eff);
712 				ACCW(NV40P_WHAT1, 0x000000a0);
713 
714 				NV_REG32(NV32_NV44_WHAT10) = NV_REG32(NV32_NV10STRAPINFO);
715 				NV_REG32(NV32_NV44_WHAT11) = 0x00000000;
716 				NV_REG32(NV32_NV44_WHAT12) = 0x00000000;
717 				NV_REG32(NV32_NV44_WHAT13) = NV_REG32(NV32_NV10STRAPINFO);
718 
719 				ACCW(NV44_WHAT2, 0x00000000);
720 				ACCW(NV44_WHAT3, 0x00000000);
721 				break;
722 /*			case NV44 type 2: (cardID 0x022x)
723 				//fixme if needed: doesn't seem to need the strapinfo thing..
724 				ACCW(NV40P_WHAT0, 0x83280eff);
725 				ACCW(NV40P_WHAT1, 0x000000a0);
726 
727 				ACCW(NV44_WHAT2, 0x00000000);
728 				ACCW(NV44_WHAT3, 0x00000000);
729 				break;
730 */			case G70:
731 			case G71:
732 			case G73:
733 				ACCW(NV40P_WHAT0, 0x83280eff);
734 				ACCW(NV40P_WHAT1, 0x000000a0);
735 				ACCW(NV40P_WHAT2, 0x07830610);
736 				ACCW(NV40P_WHAT3, 0x0000016a);
737 				break;
738 			default:
739 				ACCW(NV40P_WHAT0, 0x83280eff);
740 				ACCW(NV40P_WHAT1, 0x000000a0);
741 				break;
742 			}
743 
744 			ACCW(NV10_TIL3PT, 0x2ffff800);
745 			ACCW(NV10_TIL3ST, 0x00006000);
746 			ACCW(NV4X_WHAT1, 0x01000000);
747 			/* engine data source DMA instance = $1140 */
748 			ACCW(NV4X_DMA_SRC, 0x00001140);
749 			break;
750 		case NV30A:
751 			/* init some function blocks, but most is unknown.. */
752 			ACCW(DEBUG1, 0x40108700);
753 			ACCW(NV25_WHAT1, 0x00140000);
754 			ACCW(DEBUG3, 0xf00e0431);
755 			ACCW(NV10_DEBUG4, 0x00008000);
756 			ACCW(NV25_WHAT0, 0xf04b1f36);
757 			ACCW(NV20_WHAT3, 0x1002d888);
758 			ACCW(NV25_WHAT2, 0x62ff007f);
759 			break;
760 		case NV20A:
761 			/* init some function blocks, but most is unknown.. */
762 			ACCW(DEBUG1, 0x00118700);
763 			ACCW(DEBUG3, 0xf20e0431);
764 			ACCW(NV10_DEBUG4, 0x00000000);
765 			ACCW(NV20_WHAT1, 0x00000040);
766 			if (si->ps.card_type < NV25)
767 			{
768 				ACCW(NV20_WHAT2, 0x00080000);
769 				ACCW(NV10_DEBUG5, 0x00000005);
770 				ACCW(NV20_WHAT3, 0x45caa208);
771 				ACCW(NV20_WHAT4, 0x24000000);
772 				ACCW(NV20_WHAT5, 0x00000040);
773 
774 				/* copy some fixed RAM(?) configuration info(?) to some indexed registers: */
775 				/* b16-24 is select; b2-13 is adress in 32-bit words */
776 				ACCW(RDI_INDEX, 0x00e00038);
777 				/* data is 32-bit */
778 				ACCW(RDI_DATA, 0x00000030);
779 				/* copy some fixed RAM(?) configuration info(?) to some indexed registers: */
780 				/* b16-24 is select; b2-13 is adress in 32-bit words */
781 				ACCW(RDI_INDEX, 0x00e10038);
782 				/* data is 32-bit */
783 				ACCW(RDI_DATA, 0x00000030);
784 			}
785 			else
786 			{
787 				ACCW(NV25_WHAT1, 0x00080000);
788 				ACCW(NV25_WHAT0, 0x304b1fb6);
789 				ACCW(NV20_WHAT3, 0x18b82880);
790 				ACCW(NV20_WHAT4, 0x44000000);
791 				ACCW(NV20_WHAT5, 0x40000080);
792 				ACCW(NV25_WHAT2, 0x000000ff);
793 			}
794 			break;
795 		}
796 
797 		/* NV20A, NV30A and NV40A: */
798 		/* copy tile setup stuff from previous setup 'source' to acc engine
799 		 * (pattern colorRAM?) */
800 		if ((si->ps.card_type <= NV40) || (si->ps.card_type == NV45))
801 		{
802 			for (cnt = 0; cnt < 32; cnt++)
803 			{
804 				/* copy NV10_FBTIL0AD upto/including NV10_FBTIL7ST */
805 				NV_REG32(NVACC_NV20_WHAT0 + (cnt << 2)) =
806 					NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
807 
808 				/* copy NV10_FBTIL0AD upto/including NV10_FBTIL7ST */
809 				NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
810 					NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
811 			}
812 		}
813 		else
814 		{
815 			/* NV41, 43, 44, G70 and later */
816 			if (si->ps.card_type >= G70)
817 			{
818 				for (cnt = 0; cnt < 60; cnt++)
819 				{
820 					/* copy NV41_FBTIL0AD upto/including G70_FBTILEST */
821 					NV_REG32(NVACC_NV41_WHAT0 + (cnt << 2)) =
822 						NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
823 
824 					/* copy NV41_FBTIL0AD upto/including G70_FBTILEST */
825 					NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
826 						NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
827 				}
828 			}
829 			else
830 			{
831 				/* NV41, 43, 44 */
832 				for (cnt = 0; cnt < 48; cnt++)
833 				{
834 					/* copy NV41_FBTIL0AD upto/including NV41_FBTILBST */
835 					NV_REG32(NVACC_NV20_WHAT0 + (cnt << 2)) =
836 						NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
837 
838 					if (si->ps.card_type != NV44)
839 					{
840 						/* copy NV41_FBTIL0AD upto/including NV41_FBTILBST */
841 						NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
842 							NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
843 					}
844 				}
845 			}
846 		}
847 
848 		if (si->ps.card_arch >= NV40A)
849 		{
850 			if ((si->ps.card_type == NV40) || (si->ps.card_type == NV45))
851 			{
852 				/* copy some RAM configuration info(?) */
853  				ACCW(NV20_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
854 				ACCW(NV20_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
855 				ACCW(NV40_WHAT_T2, NV_REG32(NV32_PFB_CONFIG_0));
856 				ACCW(NV40_WHAT_T3, NV_REG32(NV32_PFB_CONFIG_1));
857 
858 				/* setup location of active screen in framebuffer */
859 				ACCW(NV20_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
860 				ACCW(NV20_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
861 				/* setup accesible card memory range */
862 				ACCW(NV20_BLIMIT6, (si->ps.memory_size - 1));
863 				ACCW(NV20_BLIMIT7, (si->ps.memory_size - 1));
864 			}
865 			else
866 			{
867 				/* NV41, 43, 44, G70 and later */
868 
869 				/* copy some RAM configuration info(?) */
870 				if (si->ps.card_type >= G70)
871 				{
872 					ACCW(G70_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
873 					ACCW(G70_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
874 				}
875 				else
876 				{
877 					/* NV41, 43, 44 */
878 					ACCW(NV40P_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
879 					ACCW(NV40P_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
880 				}
881 				ACCW(NV40P_WHAT_T2, NV_REG32(NV32_PFB_CONFIG_0));
882 				ACCW(NV40P_WHAT_T3, NV_REG32(NV32_PFB_CONFIG_1));
883 
884 				/* setup location of active screen in framebuffer */
885 				ACCW(NV40P_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
886 				ACCW(NV40P_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
887 				/* setup accesible card memory range */
888 				ACCW(NV40P_BLIMIT6, (si->ps.memory_size - 1));
889 				ACCW(NV40P_BLIMIT7, (si->ps.memory_size - 1));
890 			}
891 		}
892 		else /* NV20A and NV30A: */
893 		{
894 			/* copy some RAM configuration info(?) */
895 			ACCW(NV20_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
896 			ACCW(NV20_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
897 			/* copy some RAM configuration info(?) to some indexed registers: */
898 			/* b16-24 is select; b2-13 is adress in 32-bit words */
899 			ACCW(RDI_INDEX, 0x00ea0000);
900 			/* data is 32-bit */
901 			ACCW(RDI_DATA, NV_REG32(NV32_PFB_CONFIG_0));
902 			/* b16-24 is select; b2-13 is adress in 32-bit words */
903 			ACCW(RDI_INDEX, 0x00ea0004);
904 			/* data is 32-bit */
905 			ACCW(RDI_DATA, NV_REG32(NV32_PFB_CONFIG_1));
906 
907 			/* setup location of active screen in framebuffer */
908 			ACCW(NV20_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
909 			ACCW(NV20_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
910 			/* setup accesible card memory range */
911 			ACCW(NV20_BLIMIT6, (si->ps.memory_size - 1));
912 			ACCW(NV20_BLIMIT7, (si->ps.memory_size - 1));
913 		}
914 
915 		/* NV20A, NV30A and NV40A: */
916 		/* setup some acc engine tile stuff */
917 		ACCW(NV10_TIL2AD, 0x00000000);
918 		ACCW(NV10_TIL0ED, 0xffffffff);
919 	}
920 
921 	/* all cards: */
922 	/* setup clipping: rect size is 32768 x 32768, probably max. setting */
923 	/* note:
924 	 * can also be done via the NV_IMAGE_BLACK_RECTANGLE engine command. */
925 	ACCW(ABS_UCLP_XMIN, 0x00000000);
926 	ACCW(ABS_UCLP_YMIN, 0x00000000);
927 	ACCW(ABS_UCLP_XMAX, 0x00007fff);
928 	ACCW(ABS_UCLP_YMAX, 0x00007fff);
929 
930 	/* setup sync parameters for NV12_IMAGE_BLIT command for the current mode:
931 	 * values given are CRTC vertical counter limit values. The NV12 command will wait
932 	 * for the specified's CRTC's vertical counter to be in between the given values */
933 	if (si->ps.card_type >= NV11)
934 	{
935 		ACCW(NV11_CRTC_LO, si->dm.timing.v_display - 1);
936 		ACCW(NV11_CRTC_HI, si->dm.timing.v_display + 1);
937 	}
938 
939 	/*** PFIFO ***/
940 	/* (setup caches) */
941 	/* disable caches reassign */
942 	ACCW(PF_CACHES, 0x00000000);
943 	/* PFIFO mode: channel 0 is in DMA mode, channels 1 - 32 are in PIO mode */
944 	ACCW(PF_MODE, 0x00000001);
945 	/* cache1 push0 access disabled */
946 	ACCW(PF_CACH1_PSH0, 0x00000000);
947 	/* cache1 pull0 access disabled */
948 	ACCW(PF_CACH1_PUL0, 0x00000000);
949 	/* cache1 push1 mode = DMA */
950 	if (si->ps.card_arch >= NV40A)
951 		ACCW(PF_CACH1_PSH1, 0x00010000);
952 	else
953 		ACCW(PF_CACH1_PSH1, 0x00000100);
954 	/* cache1 DMA Put offset = 0 (b2-28) */
955 	ACCW(PF_CACH1_DMAP, 0x00000000);
956 	/* cache1 DMA Get offset = 0 (b2-28) */
957 	ACCW(PF_CACH1_DMAG, 0x00000000);
958 	/* cache1 DMA instance adress = $114e (b0-15);
959 	 * instance being b4-19 with baseadress NV_PRAMIN_CTX_0 (0x00700000). */
960 	/* note:
961 	 * should point to a DMA definition in CTX register space (which is sort of RAM).
962 	 * This define tells the engine where the DMA cmd buffer is and what it's size is.
963 	 * Inside that cmd buffer you'll find the actual issued engine commands. */
964 	if (si->ps.card_arch >= NV40A)
965 		ACCW(PF_CACH1_DMAI, 0x00001150);
966 	else
967 		//2007 3d test..
968 		ACCW(PF_CACH1_DMAI, 0x0000114e);
969 	/* cache0 push0 access disabled */
970 	ACCW(PF_CACH0_PSH0, 0x00000000);
971 	/* cache0 pull0 access disabled */
972 	ACCW(PF_CACH0_PUL0, 0x00000000);
973 	/* RAM HT (hash table) baseadress = $10000 (b4-8), size = 4k,
974 	 * search = 128 (is byte offset between hash 'sets') */
975 	/* note:
976 	 * so HT base is $00710000, last is $00710fff.
977 	 * In this space you define the engine command handles (HT_HANDL_XX), which
978 	 * in turn points to the defines in CTX register space (which is sort of RAM) */
979 	ACCW(PF_RAMHT, 0x03000100);
980 	/* RAM FC baseadress = $11000 (b3-8) (size is fixed to 0.5k(?)) */
981 	/* note:
982 	 * so FC base is $00711000, last is $007111ff. (not used?) */
983 	ACCW(PF_RAMFC, 0x00000110);
984 	/* RAM RO baseadress = $11200 (b1-8), size = 0.5k */
985 	/* note:
986 	 * so RO base is $00711200, last is $007113ff. (not used?) */
987 	/* note also:
988 	 * This means(?) the PRAMIN CTX registers are accessible from base $00711400. */
989 	ACCW(PF_RAMRO, 0x00000112);
990 	/* PFIFO size: ch0-15 = 512 bytes, ch16-31 = 124 bytes */
991 	ACCW(PF_SIZE, 0x0000ffff);
992 	/* cache1 hash instance = $ffff (b0-15) */
993 	ACCW(PF_CACH1_HASH, 0x0000ffff);
994 	/* disable all PFIFO INTs */
995 	ACCW(PF_INTEN, 0x00000000);
996 	/* reset all PFIFO INT status bits */
997 	ACCW(PF_INTSTAT, 0xffffffff);
998 	/* cache0 pull0 engine = acceleration engine (graphics) */
999 	ACCW(PF_CACH0_PUL1, 0x00000001);
1000 	/* cache1 DMA control: disable some stuff */
1001 	ACCW(PF_CACH1_DMAC, 0x00000000);
1002 	/* cache1 engine 0 upto/including 7 is software (could also be graphics or DVD) */
1003 	ACCW(PF_CACH1_ENG, 0x00000000);
1004 	/* cache1 DMA fetch: trigger at 128 bytes, size is 32 bytes, max requests is 15,
1005 	 * use little endian */
1006 	ACCW(PF_CACH1_DMAF, 0x000f0078);
1007 	/* cache1 DMA push: b0 = 1: access is enabled */
1008 	ACCW(PF_CACH1_DMAS, 0x00000001);
1009 	/* cache1 push0 access enabled */
1010 	ACCW(PF_CACH1_PSH0, 0x00000001);
1011 	/* cache1 pull0 access enabled */
1012 	ACCW(PF_CACH1_PUL0, 0x00000001);
1013 	/* cache1 pull1 engine = acceleration engine (graphics) */
1014 	ACCW(PF_CACH1_PUL1, 0x00000001);
1015 	/* enable PFIFO caches reassign */
1016 	ACCW(PF_CACHES, 0x00000001);
1017 
1018 	/* setup 3D specifics */
1019 	nv_init_for_3D_dma();
1020 
1021 	/*** init acceleration engine command info ***/
1022 	/* set object handles */
1023 	/* note:
1024 	 * probably depending on some other setup, there are 8 or 32 FIFO channels
1025 	 * available. Assuming the current setup only has 8 channels because the 'rest'
1026 	 * isn't setup here... */
1027 	si->engine.fifo.handle[0] = NV_ROP5_SOLID;
1028 	si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
1029 	si->engine.fifo.handle[2] = NV_IMAGE_PATTERN;
1030 	si->engine.fifo.handle[3] = NV4_SURFACE; /* NV10_CONTEXT_SURFACES_2D is identical */
1031 	si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
1032 	si->engine.fifo.handle[5] = NV4_GDI_RECTANGLE_TEXT;
1033 	si->engine.fifo.handle[6] = NV4_CONTEXT_SURFACES_ARGB_ZS;//NV1_RENDER_SOLID_LIN;
1034 	si->engine.fifo.handle[7] = NV4_DX5_TEXTURE_TRIANGLE;
1035 	/* preset no FIFO channels assigned to cmd's */
1036 	for (cnt = 0; cnt < 0x20; cnt++)
1037 	{
1038 		si->engine.fifo.ch_ptr[cnt] = 0;
1039 	}
1040 	/* set handle's pointers to their assigned FIFO channels */
1041 	/* note:
1042 	 * b0-1 aren't used as adressbits. Using b0 to indicate a valid pointer. */
1043 	for (cnt = 0; cnt < 0x08; cnt++)
1044 	{
1045 		si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
1046 												(0x00000001 + (cnt * 0x00002000));
1047 	}
1048 
1049 	/*** init DMA command buffer info ***/
1050 	if (si->ps.card_arch >= NV40A) //main mem DMA buf on pre-NV40
1051 	{
1052 		si->dma_buffer = (void *)((char *)si->framebuffer +
1053 			((si->ps.memory_size - 1) & 0xffff8000));
1054 	}
1055 	LOG(4,("ACC_DMA: command buffer is at adress $%08x\n",
1056 		((uint32)(si->dma_buffer))));
1057 	/* we have issued no DMA cmd's to the engine yet */
1058 	si->engine.dma.put = 0;
1059 	/* the current first free adress in the DMA buffer is at offset 0 */
1060 	si->engine.dma.current = 0;
1061 	/* the DMA buffer can hold 8k 32-bit words (it's 32kb in size),
1062 	 * or 256k 32-bit words (1Mb in size) dependant on architecture (for now) */
1063 	/* note:
1064 	 * one word is reserved at the end of the DMA buffer to be able to instruct the
1065 	 * engine to do a buffer wrap-around!
1066 	 * (DMA opcode 'noninc method': issue word $20000000.) */
1067 	if (si->ps.card_arch < NV40A)
1068 		si->engine.dma.max = ((1 * 1024 * 1024) >> 2) - 1;
1069 	else
1070 		si->engine.dma.max = 8192 - 1;
1071 	/* note the current free space we have left in the DMA buffer */
1072 	si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;
1073 
1074 	/*** init FIFO via DMA command buffer. ***/
1075 	/* wait for room in fifo for new FIFO assigment cmds if needed: */
1076 	if (si->ps.card_arch >= NV40A)
1077 	{
1078 		if (nv_acc_fifofree_dma(12) != B_OK) return B_ERROR;
1079 	}
1080 	else
1081 	{
1082 		if (nv_acc_fifofree_dma(16) != B_OK) return B_ERROR;
1083 	}
1084 
1085 	/* program new FIFO assignments */
1086 	/* Raster OPeration: */
1087 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
1088 	/* Clip: */
1089 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
1090 	/* Pattern: */
1091 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH2, si->engine.fifo.handle[2]);
1092 	/* 2D Surfaces: */
1093 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
1094 	/* Blit: */
1095 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
1096 	/* Bitmap: */
1097 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH5, si->engine.fifo.handle[5]);
1098 	if (si->ps.card_arch < NV40A)
1099 	{
1100 		/* 3D surfaces: (3D related only) */
1101 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH6, si->engine.fifo.handle[6]);
1102 		/* Textured Triangle: (3D only) */
1103 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH7, si->engine.fifo.handle[7]);
1104 	}
1105 
1106 	/*** Set pixel width ***/
1107 	switch(si->dm.space)
1108 	{
1109 	case B_CMAP8:
1110 		surf_depth = 0x00000001;
1111 		cmd_depth = 0x00000003;
1112 		break;
1113 	case B_RGB15_LITTLE:
1114 	case B_RGB16_LITTLE:
1115 		surf_depth = 0x00000004;
1116 		cmd_depth = 0x00000001;
1117 		break;
1118 	case B_RGB32_LITTLE:
1119 	case B_RGBA32_LITTLE:
1120 		surf_depth = 0x00000006;
1121 		cmd_depth = 0x00000003;
1122 		break;
1123 	default:
1124 		LOG(8,("ACC_DMA: init, invalid bit depth\n"));
1125 		return B_ERROR;
1126 	}
1127 
1128 	/* wait for room in fifo for surface setup cmd if needed */
1129 	if (nv_acc_fifofree_dma(5) != B_OK) return B_ERROR;
1130 	/* now setup 2D surface (writing 5 32bit words) */
1131 	nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 4);
1132 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = surf_depth; /* Format */
1133 	/* setup screen pitch */
1134 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1135 		((si->fbc.bytes_per_row & 0x0000ffff) | (si->fbc.bytes_per_row << 16)); /* Pitch */
1136 	/* setup screen location */
1137 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1138 		((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer); /* OffsetSource */
1139 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1140 		((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer); /* OffsetDest */
1141 
1142 	/* wait for room in fifo for pattern colordepth setup cmd if needed */
1143 	if (nv_acc_fifofree_dma(2) != B_OK) return B_ERROR;
1144 	/* set pattern colordepth (writing 2 32bit words) */
1145 	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETCOLORFORMAT, 1);
1146 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1147 
1148 	/* wait for room in fifo for bitmap colordepth setup cmd if needed */
1149 	if (nv_acc_fifofree_dma(2) != B_OK) return B_ERROR;
1150 	/* set bitmap colordepth (writing 2 32bit words) */
1151 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_SETCOLORFORMAT, 1);
1152 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1153 
1154 	/* Load our pattern into the engine: */
1155 	/* wait for room in fifo for pattern cmd if needed. */
1156 	if (nv_acc_fifofree_dma(7) != B_OK) return B_ERROR;
1157 	/* now setup pattern (writing 7 32bit words) */
1158 	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETSHAPE, 1);
1159 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* SetShape: 0 = 8x8, 1 = 64x1, 2 = 1x64 */
1160 	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETCOLOR0, 4);
1161 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetColor0 */
1162 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetColor1 */
1163 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetPattern[0] */
1164 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetPattern[1] */
1165 
1166 	/* tell the engine to fetch and execute all (new) commands in the DMA buffer */
1167 	nv_start_dma();
1168 
1169 	return B_OK;
1170 }
1171 
1172 static void nv_init_for_3D_dma(void)
1173 {
1174 	/* setup PGRAPH unknown registers and modify (pre-cleared) pipe stuff for 3D use */
1175 	if (si->ps.card_arch >= NV10A)
1176 	{
1177 		/* setup unknown PGRAPH stuff */
1178 		ACCW(PGWHAT_00, 0x00000000);
1179 		ACCW(PGWHAT_01, 0x00000000);
1180 		ACCW(PGWHAT_02, 0x00000000);
1181 		ACCW(PGWHAT_03, 0x00000000);
1182 
1183 		ACCW(PGWHAT_04, 0x00001000);
1184 		ACCW(PGWHAT_05, 0x00001000);
1185 		ACCW(PGWHAT_06, 0x4003ff80);
1186 
1187 		ACCW(PGWHAT_07, 0x00000000);
1188 		ACCW(PGWHAT_08, 0x00000000);
1189 		ACCW(PGWHAT_09, 0x00000000);
1190 		ACCW(PGWHAT_0A, 0x00000000);
1191 		ACCW(PGWHAT_0B, 0x00000000);
1192 
1193 		ACCW(PGWHAT_0C, 0x00080008);
1194 		ACCW(PGWHAT_0D, 0x00080008);
1195 
1196 		ACCW(PGWHAT_0E, 0x00000000);
1197 		ACCW(PGWHAT_0F, 0x00000000);
1198 		ACCW(PGWHAT_10, 0x00000000);
1199 		ACCW(PGWHAT_11, 0x00000000);
1200 		ACCW(PGWHAT_12, 0x00000000);
1201 		ACCW(PGWHAT_13, 0x00000000);
1202 		ACCW(PGWHAT_14, 0x00000000);
1203 		ACCW(PGWHAT_15, 0x00000000);
1204 		ACCW(PGWHAT_16, 0x00000000);
1205 		ACCW(PGWHAT_17, 0x00000000);
1206 		ACCW(PGWHAT_18, 0x00000000);
1207 
1208 		ACCW(PGWHAT_19, 0x10000000);
1209 
1210 		ACCW(PGWHAT_1A, 0x00000000);
1211 		ACCW(PGWHAT_1B, 0x00000000);
1212 		ACCW(PGWHAT_1C, 0x00000000);
1213 		ACCW(PGWHAT_1D, 0x00000000);
1214 		ACCW(PGWHAT_1E, 0x00000000);
1215 		ACCW(PGWHAT_1F, 0x00000000);
1216 		ACCW(PGWHAT_20, 0x00000000);
1217 		ACCW(PGWHAT_21, 0x00000000);
1218 
1219 		ACCW(PGWHAT_22, 0x08000000);
1220 
1221 		ACCW(PGWHAT_23, 0x00000000);
1222 		ACCW(PGWHAT_24, 0x00000000);
1223 		ACCW(PGWHAT_25, 0x00000000);
1224 		ACCW(PGWHAT_26, 0x00000000);
1225 
1226 		ACCW(PGWHAT_27, 0x4b7fffff);
1227 
1228 		ACCW(PGWHAT_28, 0x00000000);
1229 		ACCW(PGWHAT_29, 0x00000000);
1230 		ACCW(PGWHAT_2A, 0x00000000);
1231 
1232 		/* setup window clipping */
1233 		/* b0-11 = min; b16-27 = max.
1234 		 * note:
1235 		 * probably two's complement values, so setting to max range here:
1236 		 * which would be -2048 upto/including +2047. */
1237 		/* horizontal */
1238 		ACCW(WINCLIP_H_0, 0x07ff0800);
1239 		ACCW(WINCLIP_H_1, 0x07ff0800);
1240 		ACCW(WINCLIP_H_2, 0x07ff0800);
1241 		ACCW(WINCLIP_H_3, 0x07ff0800);
1242 		ACCW(WINCLIP_H_4, 0x07ff0800);
1243 		ACCW(WINCLIP_H_5, 0x07ff0800);
1244 		ACCW(WINCLIP_H_6, 0x07ff0800);
1245 		ACCW(WINCLIP_H_7, 0x07ff0800);
1246 		/* vertical */
1247 		ACCW(WINCLIP_V_0, 0x07ff0800);
1248 		ACCW(WINCLIP_V_1, 0x07ff0800);
1249 		ACCW(WINCLIP_V_2, 0x07ff0800);
1250 		ACCW(WINCLIP_V_3, 0x07ff0800);
1251 		ACCW(WINCLIP_V_4, 0x07ff0800);
1252 		ACCW(WINCLIP_V_5, 0x07ff0800);
1253 		ACCW(WINCLIP_V_6, 0x07ff0800);
1254 		ACCW(WINCLIP_V_7, 0x07ff0800);
1255 
1256 		/* setup (initialize) pipe:
1257 		 * needed to get valid 3D rendering on (at least) NV1x cards. Without this
1258 		 * those cards produce rubbish instead of 3D, although the engine itself keeps
1259 		 * running and 2D stays OK. */
1260 
1261 		/* set eyetype to local, lightning etc. is off */
1262 		ACCW(NV10_XFMOD0, 0x10000000);
1263 		/* disable all lights */
1264 		ACCW(NV10_XFMOD1, 0x00000000);
1265 
1266 		/* note: upon writing data into the PIPEDAT register, the PIPEADR is
1267 		 * probably auto-incremented! */
1268 		/* (pipe adress = b2-16, pipe data = b0-31) */
1269 		/* note: pipe adresses IGRAPH registers! */
1270 		ACCW(NV10_PIPEADR, 0x00006740);
1271 		ACCW(NV10_PIPEDAT, 0x00000000);
1272 		ACCW(NV10_PIPEDAT, 0x00000000);
1273 		ACCW(NV10_PIPEDAT, 0x00000000);
1274 		ACCW(NV10_PIPEDAT, 0x3f800000);
1275 
1276 		ACCW(NV10_PIPEADR, 0x00006750);
1277 		ACCW(NV10_PIPEDAT, 0x40000000);
1278 		ACCW(NV10_PIPEDAT, 0x40000000);
1279 		ACCW(NV10_PIPEDAT, 0x40000000);
1280 		ACCW(NV10_PIPEDAT, 0x40000000);
1281 
1282 		ACCW(NV10_PIPEADR, 0x00006760);
1283 		ACCW(NV10_PIPEDAT, 0x00000000);
1284 		ACCW(NV10_PIPEDAT, 0x00000000);
1285 		ACCW(NV10_PIPEDAT, 0x3f800000);
1286 		ACCW(NV10_PIPEDAT, 0x00000000);
1287 
1288 		ACCW(NV10_PIPEADR, 0x00006770);
1289 		ACCW(NV10_PIPEDAT, 0xc5000000);
1290 		ACCW(NV10_PIPEDAT, 0xc5000000);
1291 		ACCW(NV10_PIPEDAT, 0x00000000);
1292 		ACCW(NV10_PIPEDAT, 0x00000000);
1293 
1294 		ACCW(NV10_PIPEADR, 0x00006780);
1295 		ACCW(NV10_PIPEDAT, 0x00000000);
1296 		ACCW(NV10_PIPEDAT, 0x00000000);
1297 		ACCW(NV10_PIPEDAT, 0x3f800000);
1298 		ACCW(NV10_PIPEDAT, 0x00000000);
1299 
1300 		ACCW(NV10_PIPEADR, 0x000067a0);
1301 		ACCW(NV10_PIPEDAT, 0x3f800000);
1302 		ACCW(NV10_PIPEDAT, 0x3f800000);
1303 		ACCW(NV10_PIPEDAT, 0x3f800000);
1304 		ACCW(NV10_PIPEDAT, 0x3f800000);
1305 
1306 		ACCW(NV10_PIPEADR, 0x00006ab0);
1307 		ACCW(NV10_PIPEDAT, 0x3f800000);
1308 		ACCW(NV10_PIPEDAT, 0x3f800000);
1309 		ACCW(NV10_PIPEDAT, 0x3f800000);
1310 
1311 		ACCW(NV10_PIPEADR, 0x00006ac0);
1312 		ACCW(NV10_PIPEDAT, 0x00000000);
1313 		ACCW(NV10_PIPEDAT, 0x00000000);
1314 		ACCW(NV10_PIPEDAT, 0x00000000);
1315 
1316 		ACCW(NV10_PIPEADR, 0x00006c10);
1317 		ACCW(NV10_PIPEDAT, 0xbf800000);
1318 
1319 		ACCW(NV10_PIPEADR, 0x00007030);
1320 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1321 
1322 		ACCW(NV10_PIPEADR, 0x00007040);
1323 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1324 
1325 		ACCW(NV10_PIPEADR, 0x00007050);
1326 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1327 
1328 		ACCW(NV10_PIPEADR, 0x00007060);
1329 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1330 
1331 		ACCW(NV10_PIPEADR, 0x00007070);
1332 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1333 
1334 		ACCW(NV10_PIPEADR, 0x00007080);
1335 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1336 
1337 		ACCW(NV10_PIPEADR, 0x00007090);
1338 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1339 
1340 		ACCW(NV10_PIPEADR, 0x000070a0);
1341 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1342 
1343 		ACCW(NV10_PIPEADR, 0x00006a80);
1344 		ACCW(NV10_PIPEDAT, 0x00000000);
1345 		ACCW(NV10_PIPEDAT, 0x00000000);
1346 		ACCW(NV10_PIPEDAT, 0x3f800000);
1347 
1348 		ACCW(NV10_PIPEADR, 0x00006aa0);
1349 		ACCW(NV10_PIPEDAT, 0x00000000);
1350 		ACCW(NV10_PIPEDAT, 0x00000000);
1351 		ACCW(NV10_PIPEDAT, 0x00000000);
1352 
1353 		/* select primitive type that will be drawn (tri's) */
1354 		ACCW(NV10_PIPEADR, 0x00000040);
1355 		ACCW(NV10_PIPEDAT, 0x00000005);
1356 
1357 		ACCW(NV10_PIPEADR, 0x00006400);
1358 		ACCW(NV10_PIPEDAT, 0x3f800000);
1359 		ACCW(NV10_PIPEDAT, 0x3f800000);
1360 		ACCW(NV10_PIPEDAT, 0x4b7fffff);
1361 		ACCW(NV10_PIPEDAT, 0x00000000);
1362 
1363 		ACCW(NV10_PIPEADR, 0x00006410);
1364 		ACCW(NV10_PIPEDAT, 0xc5000000);
1365 		ACCW(NV10_PIPEDAT, 0xc5000000);
1366 		ACCW(NV10_PIPEDAT, 0x00000000);
1367 		ACCW(NV10_PIPEDAT, 0x00000000);
1368 
1369 		ACCW(NV10_PIPEADR, 0x00006420);
1370 		ACCW(NV10_PIPEDAT, 0x00000000);
1371 		ACCW(NV10_PIPEDAT, 0x00000000);
1372 		ACCW(NV10_PIPEDAT, 0x00000000);
1373 		ACCW(NV10_PIPEDAT, 0x00000000);
1374 
1375 		ACCW(NV10_PIPEADR, 0x00006430);
1376 		ACCW(NV10_PIPEDAT, 0x00000000);
1377 		ACCW(NV10_PIPEDAT, 0x00000000);
1378 		ACCW(NV10_PIPEDAT, 0x00000000);
1379 		ACCW(NV10_PIPEDAT, 0x00000000);
1380 
1381 		ACCW(NV10_PIPEADR, 0x000064c0);
1382 		ACCW(NV10_PIPEDAT, 0x3f800000);
1383 		ACCW(NV10_PIPEDAT, 0x3f800000);
1384 		ACCW(NV10_PIPEDAT, 0x477fffff);
1385 		ACCW(NV10_PIPEDAT, 0x3f800000);
1386 
1387 		ACCW(NV10_PIPEADR, 0x000064d0);
1388 		ACCW(NV10_PIPEDAT, 0xc5000000);
1389 		ACCW(NV10_PIPEDAT, 0xc5000000);
1390 		ACCW(NV10_PIPEDAT, 0x00000000);
1391 		ACCW(NV10_PIPEDAT, 0x00000000);
1392 
1393 		ACCW(NV10_PIPEADR, 0x000064e0);
1394 		ACCW(NV10_PIPEDAT, 0xc4fff000);
1395 		ACCW(NV10_PIPEDAT, 0xc4fff000);
1396 		ACCW(NV10_PIPEDAT, 0x00000000);
1397 		ACCW(NV10_PIPEDAT, 0x00000000);
1398 
1399 		ACCW(NV10_PIPEADR, 0x000064f0);
1400 		ACCW(NV10_PIPEDAT, 0x00000000);
1401 		ACCW(NV10_PIPEDAT, 0x00000000);
1402 		ACCW(NV10_PIPEDAT, 0x00000000);
1403 		ACCW(NV10_PIPEDAT, 0x00000000);
1404 
1405 		/* turn lightning on */
1406 		ACCW(NV10_XFMOD0, 0x30000000);
1407 		/* set light 1 to infinite type, other lights remain off */
1408 		ACCW(NV10_XFMOD1, 0x00000004);
1409 
1410 		/* Z-buffer state is:
1411 		 * initialized, set to: 'fixed point' (integer?); Z-buffer; 16bits depth */
1412 		/* note:
1413 		 * other options possible are: floating point; 24bits depth; W-buffer */
1414 		ACCW(GLOB_STAT_0, 0x10000000);
1415 		/* set DMA instance 2 and 3 to be invalid */
1416 		ACCW(GLOB_STAT_1, 0x00000000);
1417 	}
1418 }
1419 
1420 static void nv_start_dma(void)
1421 {
1422 	uint32 dummy;
1423 
1424 	if (si->engine.dma.current != si->engine.dma.put)
1425 	{
1426 		si->engine.dma.put = si->engine.dma.current;
1427 		/* flush used caches so we know for sure the DMA cmd buffer received all data. */
1428 		if (si->ps.card_arch < NV40A)
1429 		{
1430 			/* some CPU's support out-of-order processing (WinChip/Cyrix). Flush them. */
1431 			__asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
1432 			/* read a non-cached adress to flush the cash */
1433 			dummy = ACCR(STATUS);
1434 		}
1435 		else
1436 		{
1437 			/* dummy read the first adress of the framebuffer to flush MTRR-WC buffers */
1438 			dummy = *((volatile uint32 *)(si->framebuffer));
1439 		}
1440 
1441 		/* actually start DMA to execute all commands now in buffer */
1442 		/* note:
1443 		 * it doesn't matter which FIFO channel's DMA registers we access, they are in
1444 		 * fact all the same set. It also doesn't matter if the channel was assigned a
1445 		 * command or not. */
1446 		/* note also:
1447 		 * NV_GENERAL_DMAPUT is a write-only register on some cards (confirmed NV11). */
1448 		NV_REG32(NVACC_FIFO + NV_GENERAL_DMAPUT) = (si->engine.dma.put << 2);
1449 	}
1450 }
1451 
1452 /* this routine does not check the engine's internal hardware FIFO, but the DMA
1453  * command buffer. You can see this as a FIFO as well, that feeds the hardware FIFO.
1454  * The hardware FIFO state is checked by the DMA hardware automatically. */
1455 static status_t nv_acc_fifofree_dma(uint16 cmd_size)
1456 {
1457 	uint32 dmaget;
1458 
1459 	/* we'd better check for timeouts on the DMA engine as it's theoretically
1460 	 * breakable by malfunctioning software */
1461 	uint16 cnt = 0;
1462 
1463 	/* check if the DMA buffer has enough room for the command.
1464 	 * note:
1465 	 * engine.dma.free is 'cached' */
1466 	while ((si->engine.dma.free < cmd_size) && (cnt < 10000) && (err < 3))
1467 	{
1468 		/* see where the engine is currently fetching from the buffer */
1469 		/* note:
1470 		 * read this only once in the code as accessing registers is relatively slow */
1471 		/* note also:
1472 		 * it doesn't matter which FIFO channel's DMA registers we access, they are in
1473 		 * fact all the same set. It also doesn't matter if the channel was assigned a
1474 		 * command or not. */
1475 		dmaget = ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET)) >> 2);
1476 
1477 		/* update timeout counter: on NV11 on a Pentium4 2.8Ghz max reached count
1478 		 * using BeRoMeter 1.2.6 was about 600; so counting 10000 before generating
1479 		 * a timeout should definately do it. Snooze()-ing cannot be done without a
1480 		 * serious speed penalty, even if done for only 1 microSecond. */
1481 		cnt++;
1482 
1483 		/* where's the engine fetching viewed from us issuing? */
1484 		if (si->engine.dma.put >= dmaget)
1485 		{
1486 			/* engine is fetching 'behind us', the last piece of the buffer is free */
1487 
1488 			/* note the 'updated' free space we have in the DMA buffer */
1489 			si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;
1490 			/* if it's enough after all we exit this routine immediately. Else: */
1491 			if (si->engine.dma.free < cmd_size)
1492 			{
1493 				/* not enough room left, so instruct DMA engine to reset the buffer
1494 				 * when it's reaching the end of it */
1495 				((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x20000000;
1496 				/* reset our buffer pointer, so new commands will be placed at the
1497 				 * beginning of the buffer. */
1498 				si->engine.dma.current = 0;
1499 				/* tell the engine to fetch the remaining command(s) in the DMA buffer
1500 				 * that where not executed before. */
1501 				nv_start_dma();
1502 
1503 				/* NOW the engine is fetching 'in front of us', so the first piece
1504 				 * of the buffer is free */
1505 
1506 				/* note the updated current free space we have in the DMA buffer */
1507 				si->engine.dma.free = dmaget - si->engine.dma.current;
1508 				/* mind this pittfall:
1509 				 * Leave some room between where the engine is fetching and where we
1510 				 * put new commands. Otherwise the engine will crash on heavy loads.
1511 				 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
1512 				 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
1513 				 * Note:
1514 				 * The engine is DMA triggered for fetching chunks every 128 bytes,
1515 				 * maybe this is the reason for this behaviour.
1516 				 * Note also:
1517 				 * it looks like the space that needs to be kept free is coupled
1518 				 * with the size of the DMA buffer. */
1519 				if (si->engine.dma.free < 256)
1520 					si->engine.dma.free = 0;
1521 				else
1522 					si->engine.dma.free -= 256;
1523 			}
1524 		}
1525 		else
1526 		{
1527 			/* engine is fetching 'in front of us', so the first piece of the buffer
1528 			 * is free */
1529 
1530 			/* note the updated current free space we have in the DMA buffer */
1531 			si->engine.dma.free = dmaget - si->engine.dma.current;
1532 			/* mind this pittfall:
1533 			 * Leave some room between where the engine is fetching and where we
1534 			 * put new commands. Otherwise the engine will crash on heavy loads.
1535 			 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
1536 			 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
1537 			 * Note:
1538 			 * The engine is DMA triggered for fetching chunks every 128 bytes,
1539 			 * maybe this is the reason for this behaviour.
1540 			 * Note also:
1541 			 * it looks like the space that needs to be kept free is coupled
1542 			 * with the size of the DMA buffer. */
1543 			if (si->engine.dma.free < 256)
1544 				si->engine.dma.free = 0;
1545 			else
1546 				si->engine.dma.free -= 256;
1547 		}
1548 	}
1549 
1550 	/* log timeout if we had one */
1551 	if (cnt == 10000)
1552 	{
1553 		if (err < 3) err++;
1554 		LOG(4,("ACC_DMA: fifofree; DMA timeout #%d, engine trouble!\n", err));
1555 	}
1556 
1557 	/* we must make the acceleration routines abort or the driver will hang! */
1558 	if (err >= 3) return B_ERROR;
1559 
1560 	return B_OK;
1561 }
1562 
1563 static void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size)
1564 {
1565 	/* NV_FIFO_DMA_OPCODE: set number of cmd words (b18 - 28); set FIFO offset for
1566 	 * first cmd word (b2 - 15); set DMA opcode = method (b29 - 31).
1567 	 * a 'NOP' is the opcode word $00000000. */
1568 	/* note:
1569 	 * possible DMA opcodes:
1570 	 * b'000' is 'method' (execute cmd);
1571 	 * b'001' is 'jump';
1572 	 * b'002' is 'noninc method' (execute buffer wrap-around);
1573 	 * b'003' is 'call': return is executed by opcode word $00020000 (b17 = 1). */
1574 	/* note also:
1575 	 * this system uses auto-increments for the FIFO offset adresses. Make sure
1576 	 * to set a new adress if a gap exists between the previous one and the new one. */
1577 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = ((size << 18) |
1578 		((si->engine.fifo.ch_ptr[cmd] + offset) & 0x0000fffc));
1579 
1580 	/* space left after issuing the current command is the cmd AND it's arguments less */
1581 	si->engine.dma.free -= (size + 1);
1582 }
1583 
1584 static void nv_acc_set_ch_dma(uint16 ch, uint32 handle)
1585 {
1586 	/* issue FIFO channel assign cmd */
1587 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = ((1 << 18) | ch);
1588 	/* set new assignment */
1589 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = (0x80000000 | handle);
1590 
1591 	/* space left after issuing the current command is the cmd AND it's arguments less */
1592 	si->engine.dma.free -= 2;
1593 }
1594 
1595 /* note:
1596  * switching fifo channel assignments this way has no noticable slowdown:
1597  * measured 0.2% with Quake2. */
1598 void nv_acc_assert_fifo_dma(void)
1599 {
1600 	/* does every engine cmd this accelerant needs have a FIFO channel? */
1601 	//fixme: can probably be optimized for both speed and channel selection...
1602 	if (!si->engine.fifo.ch_ptr[NV_ROP5_SOLID] ||
1603 		!si->engine.fifo.ch_ptr[NV_IMAGE_BLACK_RECTANGLE] ||
1604 		!si->engine.fifo.ch_ptr[NV_IMAGE_PATTERN] ||
1605 		!si->engine.fifo.ch_ptr[NV4_SURFACE] ||
1606 		!si->engine.fifo.ch_ptr[NV_IMAGE_BLIT] ||
1607 		!si->engine.fifo.ch_ptr[NV4_GDI_RECTANGLE_TEXT] ||
1608 		!si->engine.fifo.ch_ptr[NV_SCALED_IMAGE_FROM_MEMORY])
1609 	{
1610 		uint16 cnt;
1611 
1612 		/* free the FIFO channels we want from the currently assigned cmd's */
1613 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[0]] = 0;
1614 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[1]] = 0;
1615 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[2]] = 0;
1616 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[3]] = 0;
1617 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[4]] = 0;
1618 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[5]] = 0;
1619 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[6]] = 0;
1620 
1621 		/* set new object handles */
1622 		si->engine.fifo.handle[0] = NV_ROP5_SOLID;
1623 		si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
1624 		si->engine.fifo.handle[2] = NV_IMAGE_PATTERN;
1625 		si->engine.fifo.handle[3] = NV4_SURFACE;
1626 		si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
1627 		si->engine.fifo.handle[5] = NV4_GDI_RECTANGLE_TEXT;
1628 		si->engine.fifo.handle[6] = NV_SCALED_IMAGE_FROM_MEMORY;
1629 
1630 		/* set handle's pointers to their assigned FIFO channels */
1631 		/* note:
1632 		 * b0-1 aren't used as adressbits. Using b0 to indicate a valid pointer. */
1633 		for (cnt = 0; cnt < 0x08; cnt++)
1634 		{
1635 			si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
1636 				(0x00000001 + (cnt * 0x00002000));
1637 		}
1638 
1639 		/* wait for room in fifo for new FIFO assigment cmds if needed. */
1640 		if (nv_acc_fifofree_dma(14) != B_OK) return;
1641 
1642 		/* program new FIFO assignments */
1643 		/* Raster OPeration: */
1644 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
1645 		/* Clip: */
1646 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
1647 		/* Pattern: */
1648 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH2, si->engine.fifo.handle[2]);
1649 		/* 2D Surface: */
1650 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
1651 		/* Blit: */
1652 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
1653 		/* Bitmap: */
1654 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH5, si->engine.fifo.handle[5]);
1655 		/* Scaled and fitered Blit: */
1656 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH6, si->engine.fifo.handle[6]);
1657 
1658 		/* tell the engine to fetch and execute all (new) commands in the DMA buffer */
1659 		nv_start_dma();
1660 	}
1661 }
1662 
/*
	note:
	moved acceleration 'top-level' routines to be integrated in the engine:
	it is costly to call the engine for every single function within a loop!
	(measured with BeRoMeter 1.2.6: up to 15% speed increase on all CPUs.)

	note also:
	splitting up each command list into sublists (see routines below) prevents
	a lot more nested calls, further increasing the speed by up to 70%.

	finally:
	sending a sublist to just one single engine command increases the speed by
	up to another 10%. This can't be done for blits though, as the hardware
	behind that engine command does not support multiple objects.
*/
1678 
1679 /* screen to screen blit - i.e. move windows around and scroll within them. */
1680 void SCREEN_TO_SCREEN_BLIT_DMA(engine_token *et, blit_params *list, uint32 count)
1681 {
1682 	uint32 i = 0;
1683 	uint16 subcnt;
1684 
1685 	/*** init acc engine for blit function ***/
1686 	/* ROP registers (Raster OPeration):
1687 	 * wait for room in fifo for ROP cmd if needed. */
1688 	if (nv_acc_fifofree_dma(2) != B_OK) return;
1689 	/* now setup ROP (writing 2 32bit words) for GXcopy */
1690 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
1691 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
1692 
1693 	/*** do each blit ***/
1694 	/* Note:
1695 	 * blit-copy direction is determined inside nvidia hardware: no setup needed */
1696 	while (count)
1697 	{
1698 		/* break up the list in sublists to minimize calls, while making sure long
1699 		 * lists still get executed without trouble */
1700 		subcnt = 32;
1701 		if (count < 32) subcnt = count;
1702 		count -= subcnt;
1703 
1704 		/* wait for room in fifo for blit cmd if needed. */
1705 		if (nv_acc_fifofree_dma(4 * subcnt) != B_OK) return;
1706 
1707 		while (subcnt--)
1708 		{
1709 			/* now setup blit (writing 4 32bit words) */
1710 			nv_acc_cmd_dma(NV_IMAGE_BLIT, NV_IMAGE_BLIT_SOURCEORG, 3);
1711 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1712 				(((list[i].src_top) << 16) | (list[i].src_left)); /* SourceOrg */
1713 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1714 				(((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */
1715 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1716 				((((list[i].height) + 1) << 16) | ((list[i].width) + 1)); /* HeightWidth */
1717 
1718 			i++;
1719 		}
1720 
1721 		/* tell the engine to fetch the commands in the DMA buffer that where not
1722 		 * executed before. */
1723 		nv_start_dma();
1724 	}
1725 
1726 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1727 	si->engine.threeD.reload = 0xffffffff;
1728 }
1729 
1730 /* scaled and filtered screen to screen blit - i.e. video playback without overlay */
1731 /* note: source and destination may not overlap. */
1732 //fixme? checkout NV5 and NV10 version of cmd: faster?? (or is 0x77 a 'autoselect' version?)
1733 void SCREEN_TO_SCREEN_SCALED_FILTERED_BLIT_DMA(engine_token *et, scaled_blit_params *list, uint32 count)
1734 {
1735 	uint32 i = 0;
1736 	uint16 subcnt;
1737 	uint32 cmd_depth;
1738 	uint8 bpp;
1739 
1740 	/*** init acc engine for scaled filtered blit function ***/
1741 	/* Set pixel width */
1742 	switch(si->dm.space)
1743 	{
1744 	case B_RGB15_LITTLE:
1745 		cmd_depth = 0x00000002;
1746 		bpp = 2;
1747 		break;
1748 	case B_RGB16_LITTLE:
1749 		cmd_depth = 0x00000007;
1750 		bpp = 2;
1751 		break;
1752 	case B_RGB32_LITTLE:
1753 	case B_RGBA32_LITTLE:
1754 		cmd_depth = 0x00000004;
1755 		bpp = 4;
1756 		break;
1757 	/* fixme sometime:
1758 	 * we could do the spaces below if this function would be modified to be able
1759 	 * to use a source outside of the desktop, i.e. using offscreen bitmaps... */
1760 	case B_YCbCr422:
1761 		cmd_depth = 0x00000005;
1762 		bpp = 2;
1763 		break;
1764 	case B_YUV422:
1765 		cmd_depth = 0x00000006;
1766 		bpp = 2;
1767 		break;
1768 	default:
1769 		/* note: this function does not support src or dest in the B_CMAP8 space! */
1770 		//fixme: the NV10 version of this cmd supports B_CMAP8 src though... (checkout)
1771 		LOG(8,("ACC_DMA: scaled_filtered_blit, invalid bit depth\n"));
1772 		return;
1773 	}
1774 
1775 	/* modify surface depth settings for 15-bit colorspace so command works as intended */
1776 	if (si->dm.space == B_RGB15_LITTLE)
1777 	{
1778 		/* wait for room in fifo for surface setup cmd if needed */
1779 		if (nv_acc_fifofree_dma(2) != B_OK) return;
1780 		/* now setup 2D surface (writing 1 32bit word) */
1781 		nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
1782 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000002; /* Format */
1783 	}
1784 
1785 	/* TNT1 has fixed operation mode 'SRCcopy' while the rest can be programmed: */
1786 	if (si->ps.card_type != NV04)
1787 	{
1788 		/* wait for room in fifo for cmds if needed. */
1789 		if (nv_acc_fifofree_dma(5) != B_OK) return;
1790 		/* now setup source bitmap colorspace */
1791 		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 2);
1792 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1793 		/* now setup operation mode to SRCcopy */
1794 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000003; /* SetOperation */
1795 	}
1796 	else
1797 	{
1798 		/* wait for room in fifo for cmd if needed. */
1799 		if (nv_acc_fifofree_dma(4) != B_OK) return;
1800 		/* now setup source bitmap colorspace */
1801 		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 1);
1802 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1803 		/* TNT1 has fixed operation mode SRCcopy */
1804 	}
1805 	/* now setup fill color (writing 2 32bit words) */
1806 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1807 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */
1808 
1809 	/*** do each blit ***/
1810 	while (count)
1811 	{
1812 		/* break up the list in sublists to minimize calls, while making sure long
1813 		 * lists still get executed without trouble */
1814 		subcnt = 16;
1815 		if (count < 16) subcnt = count;
1816 		count -= subcnt;
1817 
1818 		/* wait for room in fifo for blit cmd if needed. */
1819 		if (nv_acc_fifofree_dma(12 * subcnt) != B_OK) return;
1820 
1821 		while (subcnt--)
1822 		{
1823 			/* now setup blit (writing 12 32bit words) */
1824 			nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCEORG, 6);
1825 			/* setup dest clipping ref for blit (not used) (b0-15 = left, b16-31 = top) */
1826 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0; /* SourceOrg */
1827 			/* setup dest clipping size for blit */
1828 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1829 				(((list[i].dest_height + 1) << 16) | (list[i].dest_width + 1)); /* SourceHeightWidth */
1830 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1831 			/* setup destination location and size for blit */
1832 				(((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */
1833 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1834 				(((list[i].dest_height + 1) << 16) | (list[i].dest_width + 1)); /* DestHeightWidth */
1835 			//fixme: findout scaling limits... (although the current cmd interface doesn't support them.)
1836 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1837 				(((list[i].src_width + 1) << 20) / (list[i].dest_width + 1)); /* HorInvScale (in 12.20 format) */
1838 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1839 				(((list[i].src_height + 1) << 20) / (list[i].dest_height + 1)); /* VerInvScale (in 12.20 format) */
1840 
1841 			nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCESIZE, 4);
1842 			/* setup horizontal and vertical source (fetching) ends.
1843 			 * note:
1844 			 * horizontal granularity is 2 pixels, vertical granularity is 1 pixel.
1845 			 * look at Matrox or Neomagic bes engines code for usage example. */
1846 			//fixme: tested 15, 16 and 32-bit RGB depth, verify other depths...
1847 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1848 				(((list[i].src_height + 1) << 16) |
1849 				 (((list[i].src_width + 1) + 0x0001) & ~0x0001)); /* SourceHeightWidth */
1850 			/* setup source pitch (b0-15). Set 'format origin center' (b16-17) and
1851 			 * select 'format interpolator foh (bilinear filtering)' (b24). */
1852 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1853 				(si->fbc.bytes_per_row | (1 << 16) | (1 << 24)); /* SourcePitch */
1854 			/* setup source surface location */
1855 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1856 				((uint32)((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer)) +
1857 				(list[i].src_top * si->fbc.bytes_per_row) +	(list[i].src_left * bpp); /* Offset */
1858 			/* setup source start: first (sub)pixel contributing to output picture */
1859 			/* note:
1860 			 * clipping is not asked for.
1861 			 * look at nVidia NV10+ bes engine code for useage example. */
1862 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1863 				0; /* SourceRef (b0-15 = hor, b16-31 = ver: both in 12.4 format) */
1864 
1865 			i++;
1866 		}
1867 
1868 		/* tell the engine to fetch the commands in the DMA buffer that where not
1869 		 * executed before. */
1870 		nv_start_dma();
1871 	}
1872 
1873 	/* reset surface depth settings so the other engine commands works as intended */
1874 	if (si->dm.space == B_RGB15_LITTLE)
1875 	{
1876 		/* wait for room in fifo for surface setup cmd if needed */
1877 		if (nv_acc_fifofree_dma(2) != B_OK) return;
1878 		/* now setup 2D surface (writing 1 32bit word) */
1879 		nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
1880 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000004; /* Format */
1881 
1882 		/* tell the engine to fetch the commands in the DMA buffer that where not
1883 		 * executed before. */
1884 		nv_start_dma();
1885 	}
1886 
1887 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1888 	si->engine.threeD.reload = 0xffffffff;
1889 }
1890 
1891 /* scaled and filtered screen to screen blit - i.e. video playback without overlay */
1892 /* note: source and destination may not overlap. */
1893 //fixme? checkout NV5 and NV10 version of cmd: faster?? (or is 0x77 a 'autoselect' version?)
1894 void OFFSCREEN_TO_SCREEN_SCALED_FILTERED_BLIT_DMA(
1895 	engine_token *et, offscreen_buffer_config *config, clipped_scaled_blit_params *list, uint32 count)
1896 {
1897 	uint32 i = 0;
1898 	uint32 cmd_depth;
1899 	uint8 bpp;
1900 
1901 	LOG(4,("ACC_DMA: offscreen src buffer location $%08x\n", (uint32)((uint8*)(config->buffer))));
1902 
1903 	/*** init acc engine for scaled filtered blit function ***/
1904 	/* Set pixel width */
1905 	switch(config->space)
1906 	{
1907 	case B_RGB15_LITTLE:
1908 		cmd_depth = 0x00000002;
1909 		bpp = 2;
1910 		break;
1911 	case B_RGB16_LITTLE:
1912 		cmd_depth = 0x00000007;
1913 		bpp = 2;
1914 		break;
1915 	case B_RGB32_LITTLE:
1916 	case B_RGBA32_LITTLE:
1917 		cmd_depth = 0x00000004;
1918 		bpp = 4;
1919 		break;
1920 	/* fixme sometime:
1921 	 * we could do the spaces below if this function would be modified to be able
1922 	 * to use a source outside of the desktop, i.e. using offscreen bitmaps... */
1923 	case B_YCbCr422:
1924 		cmd_depth = 0x00000005;
1925 		bpp = 2;
1926 		break;
1927 	case B_YUV422:
1928 		cmd_depth = 0x00000006;
1929 		bpp = 2;
1930 		break;
1931 	default:
1932 		/* note: this function does not support src or dest in the B_CMAP8 space! */
1933 		//fixme: the NV10 version of this cmd supports B_CMAP8 src though... (checkout)
1934 		LOG(8,("ACC_DMA: scaled_filtered_blit, invalid bit depth\n"));
1935 		return;
1936 	}
1937 
1938 	/* modify surface depth settings for 15-bit colorspace so command works as intended */
1939 	if (si->dm.space == B_RGB15_LITTLE)
1940 	{
1941 		/* wait for room in fifo for surface setup cmd if needed */
1942 		if (nv_acc_fifofree_dma(2) != B_OK) return;
1943 		/* now setup 2D surface (writing 1 32bit word) */
1944 		nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
1945 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000002; /* Format */
1946 	}
1947 
1948 	/* TNT1 has fixed operation mode 'SRCcopy' while the rest can be programmed: */
1949 	if (si->ps.card_type != NV04)
1950 	{
1951 		/* wait for room in fifo for cmds if needed. */
1952 		if (nv_acc_fifofree_dma(5) != B_OK) return;
1953 		/* now setup source bitmap colorspace */
1954 		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 2);
1955 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1956 		/* now setup operation mode to SRCcopy */
1957 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000003; /* SetOperation */
1958 	}
1959 	else
1960 	{
1961 		/* wait for room in fifo for cmd if needed. */
1962 		if (nv_acc_fifofree_dma(4) != B_OK) return;
1963 		/* now setup source bitmap colorspace */
1964 		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 1);
1965 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1966 		/* TNT1 has fixed operation mode SRCcopy */
1967 	}
1968 	/* now setup fill color (writing 2 32bit words) */
1969 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1970 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */
1971 
1972 	/*** do each blit ***/
1973 	while (count--)
1974 	{
1975 		uint32 j = 0;
1976 		uint16 clipcnt = list[i].dest_clipcount;
1977 
1978 		LOG(4,("ACC_DMA: offscreen src left %d, top %d\n", list[i].src_left, list[i].src_top));
1979 		LOG(4,("ACC_DMA: offscreen src width %d, height %d\n", list[i].src_width + 1, list[i].src_height + 1));
1980 		LOG(4,("ACC_DMA: offscreen dest left %d, top %d\n", list[i].dest_left, list[i].dest_top));
1981 		LOG(4,("ACC_DMA: offscreen dest width %d, height %d\n", list[i].dest_width + 1, list[i].dest_height + 1));
1982 
1983 		/* wait for room in fifo for blit cmd if needed. */
1984 		if (nv_acc_fifofree_dma(9 + (5 * clipcnt)) != B_OK) return;
1985 
1986 		/* now setup blit (writing 12 32bit words) */
1987 		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCEORG + 8, 4);
1988 		/* setup destination location and size for blit */
1989 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1990 			((list[i].dest_top << 16) | list[i].dest_left); /* DestTopLeftOutputRect */
1991 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1992 			(((list[i].dest_height + 1) << 16) | (list[i].dest_width + 1)); /* DestHeightWidthOutputRect */
1993 		/* setup scaling */
1994 		//fixme: findout scaling limits... (although the current cmd interface doesn't support them.)
1995 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1996 			(((list[i].src_width + 1) << 20) / (list[i].dest_width + 1)); /* HorInvScale (in 12.20 format) */
1997 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1998 			(((list[i].src_height + 1) << 20) / (list[i].dest_height + 1)); /* VerInvScale (in 12.20 format) */
1999 
2000 		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCESIZE, 3);
2001 		/* setup horizontal and vertical source (fetching) ends.
2002 		 * note:
2003 		 * horizontal granularity is 2 pixels, vertical granularity is 1 pixel.
2004 		 * look at Matrox or Neomagic bes engines code for usage example. */
2005 		//fixme: tested 15, 16 and 32-bit RGB depth, verify other depths...
2006 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2007 			(((list[i].src_height + 1) << 16) |
2008 			 (((list[i].src_width + 1) + 0x0001) & ~0x0001)); /* SourceHeightWidth */
2009 		/* setup source pitch (b0-15). Set 'format origin center' (b16-17) and
2010 		 * select 'format interpolator foh (bilinear filtering)' (b24). */
2011 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2012 			(config->bytes_per_row | (1 << 16) | (1 << 24)); /* SourcePitch */
2013 
2014 		/* setup source surface location */
2015 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2016 			(uint32)((uint8*)config->buffer - (uint8*)si->framebuffer +
2017 			(list[i].src_top * config->bytes_per_row) +	(list[i].src_left * bpp)); /* Offset */
2018 
2019 		while (clipcnt--)
2020 		{
2021 			LOG(4,("ACC_DMA: offscreen clip left %d, top %d\n",
2022 				list[i].dest_cliplist[j].left, list[i].dest_cliplist[j].top));
2023 			LOG(4,("ACC_DMA: offscreen clip width %d, height %d\n",
2024 				list[i].dest_cliplist[j].width + 1, list[i].dest_cliplist[j].height + 1));
2025 
2026 			/* now setup blit (writing 12 32bit words) */
2027 			nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCEORG, 2);
2028 			/* setup dest clipping rect for blit (b0-15 = left, b16-31 = top) */
2029 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2030 					(list[i].dest_cliplist[j].top << 16) | list[i].dest_cliplist[j].left; /* DestTopLeftClipRect */
2031 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2032 					((list[i].dest_cliplist[j].height + 1) << 16) | (list[i].dest_cliplist[j].width + 1); /* DestHeightWidthClipRect */
2033 
2034 			nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCESIZE + 12, 1);
2035 			/* setup source start: first (sub)pixel contributing to output picture */
2036 			/* note:
2037 			 * clipping is not asked for.
2038 			 * look at nVidia NV10+ bes engine code for useage example. */
2039 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2040 				0; /* SourceRef (b0-15 = hor, b16-31 = ver: both in 12.4 format) */
2041 
2042 			j++;
2043 		}
2044 
2045 		i++;
2046 	}
2047 
2048 	/* tell the engine to fetch the commands in the DMA buffer that where not
2049 	 * executed before. */
2050 	nv_start_dma();
2051 
2052 	/* reset surface depth settings so the other engine commands works as intended */
2053 	if (si->dm.space == B_RGB15_LITTLE)
2054 	{
2055 		/* wait for room in fifo for surface setup cmd if needed */
2056 		if (nv_acc_fifofree_dma(2) != B_OK) return;
2057 		/* now setup 2D surface (writing 1 32bit word) */
2058 		nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
2059 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000004; /* Format */
2060 
2061 		/* tell the engine to fetch the commands in the DMA buffer that where not
2062 		 * executed before. */
2063 		nv_start_dma();
2064 	}
2065 
2066 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
2067 	si->engine.threeD.reload = 0xffffffff;
2068 }
2069 
2070 /* rectangle fill - i.e. workspace and window background color */
2071 void FILL_RECTANGLE_DMA(engine_token *et, uint32 colorIndex, fill_rect_params *list, uint32 count)
2072 {
2073 	uint32 i = 0;
2074 	uint16 subcnt;
2075 
2076 	/*** init acc engine for fill function ***/
2077 	/* ROP registers (Raster OPeration):
2078 	 * wait for room in fifo for ROP and bitmap cmd if needed. */
2079 	if (nv_acc_fifofree_dma(4) != B_OK) return;
2080 	/* now setup ROP (writing 2 32bit words) for GXcopy */
2081 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
2082 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
2083 	/* now setup fill color (writing 2 32bit words) */
2084 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
2085 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = colorIndex; /* Color1A */
2086 
2087 	/*** draw each rectangle ***/
2088 	while (count)
2089 	{
2090 		/* break up the list in sublists to minimize calls, while making sure long
2091 		 * lists still get executed without trouble */
2092 		subcnt = 32;
2093 		if (count < 32) subcnt = count;
2094 		count -= subcnt;
2095 
2096 		/* wait for room in fifo for bitmap cmd if needed. */
2097 		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
2098 
2099 		/* issue fill command once... */
2100 		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
2101 		/* ... and send multiple rects (engine cmd supports 32 max) */
2102 		while (subcnt--)
2103 		{
2104 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2105 				(((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
2106 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2107 				(((((list[i].right)+1) - (list[i].left)) << 16) |
2108 				(((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */
2109 
2110 			i++;
2111 		}
2112 
2113 		/* tell the engine to fetch the commands in the DMA buffer that where not
2114 		 * executed before. */
2115 		nv_start_dma();
2116 	}
2117 
2118 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
2119 	si->engine.threeD.reload = 0xffffffff;
2120 }
2121 
2122 /* span fill - i.e. (selected) menuitem background color (Dano) */
2123 void FILL_SPAN_DMA(engine_token *et, uint32 colorIndex, uint16 *list, uint32 count)
2124 {
2125 	uint32 i = 0;
2126 	uint16 subcnt;
2127 
2128 	/*** init acc engine for fill function ***/
2129 	/* ROP registers (Raster OPeration):
2130 	 * wait for room in fifo for ROP and bitmap cmd if needed. */
2131 	if (nv_acc_fifofree_dma(4) != B_OK) return;
2132 	/* now setup ROP (writing 2 32bit words) for GXcopy */
2133 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
2134 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
2135 	/* now setup fill color (writing 2 32bit words) */
2136 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
2137 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = colorIndex; /* Color1A */
2138 
2139 	/*** draw each span ***/
2140 	while (count)
2141 	{
2142 		/* break up the list in sublists to minimize calls, while making sure long
2143 		 * lists still get executed without trouble */
2144 		subcnt = 32;
2145 		if (count < 32) subcnt = count;
2146 		count -= subcnt;
2147 
2148 		/* wait for room in fifo for bitmap cmd if needed. */
2149 		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
2150 
2151 		/* issue fill command once... */
2152 		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
2153 		/* ... and send multiple rects (spans) (engine cmd supports 32 max) */
2154 		while (subcnt--)
2155 		{
2156 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2157 				(((list[i+1]) << 16) | ((list[i]) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
2158 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2159 				((((list[i+2]+1) - (list[i+1])) << 16) | 0x00000001); /* Unclipped Rect 0 WidthHeight */
2160 
2161 			i+=3;
2162 		}
2163 
2164 		/* tell the engine to fetch the commands in the DMA buffer that where not
2165 		 * executed before. */
2166 		nv_start_dma();
2167 	}
2168 
2169 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
2170 	si->engine.threeD.reload = 0xffffffff;
2171 }
2172 
2173 /* rectangle invert - i.e. text cursor and text selection */
2174 void INVERT_RECTANGLE_DMA(engine_token *et, fill_rect_params *list, uint32 count)
2175 {
2176 	uint32 i = 0;
2177 	uint16 subcnt;
2178 
2179 	/*** init acc engine for invert function ***/
2180 	/* ROP registers (Raster OPeration):
2181 	 * wait for room in fifo for ROP and bitmap cmd if needed. */
2182 	if (nv_acc_fifofree_dma(4) != B_OK) return;
2183 	/* now setup ROP (writing 2 32bit words) for GXinvert */
2184 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
2185 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x55; /* SetRop5 */
2186 	/* now reset fill color (writing 2 32bit words) */
2187 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
2188 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */
2189 
2190 	/*** invert each rectangle ***/
2191 	while (count)
2192 	{
2193 		/* break up the list in sublists to minimize calls, while making sure long
2194 		 * lists still get executed without trouble */
2195 		subcnt = 32;
2196 		if (count < 32) subcnt = count;
2197 		count -= subcnt;
2198 
2199 		/* wait for room in fifo for bitmap cmd if needed. */
2200 		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
2201 
2202 		/* issue fill command once... */
2203 		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
2204 		/* ... and send multiple rects (engine cmd supports 32 max) */
2205 		while (subcnt--)
2206 		{
2207 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2208 				(((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
2209 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2210 				(((((list[i].right)+1) - (list[i].left)) << 16) |
2211 				(((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */
2212 
2213 			i++;
2214 		}
2215 
2216 		/* tell the engine to fetch the commands in the DMA buffer that where not
2217 		 * executed before. */
2218 		nv_start_dma();
2219 	}
2220 
2221 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
2222 	si->engine.threeD.reload = 0xffffffff;
2223 }
2224