xref: /haiku/src/add-ons/accelerants/nvidia/engine/nv_acc_dma.c (revision 0b2dbe7d46ee888392907c60131b7f7652314175)
1 /* NV Acceleration functions */
2 
3 /* Author:
4    Rudolf Cornelissen 8/2003-6/2005.
5 
6    This code was possible thanks to:
7     - the Linux XFree86 NV driver,
8     - the Linux UtahGLX 3D driver.
9 */
10 
11 #define MODULE_BIT 0x00080000
12 
/* 3D command defines (needed for the concurrent overlay/3D 'workaround')
 * note:
 * the workaround consists of two pieces:
 * - we have to issue a 3D drawing command before overlay is activated to prevent
 *   the acceleration engine from crashing;
 * - we have to forego FIFO assignment switching: switching while we use overlay
 *   crashes the acceleration engine as well.
 *
 * Hopefully we can find the _real_ solution for this one day... */
/* Queue a DX5 textured-triangle state block in the DMA command buffer.
 *
 * Issues one nv_acc_cmd_dma() call on NV4_DX5_TEXTURE_TRIANGLE, starting at the
 * COLORKEY method with a size of 7, and then appends exactly seven 32-bit words
 * to si->dma_buffer, advancing si->engine.dma.current once per word.
 *
 * tex_offset, tex_format, tex_filter, blend and control are stored verbatim;
 * the colorkey and fog color words are always written as 0.
 *
 * Expands to a bare compound statement, so 'si' and nv_acc_cmd_dma() must be
 * visible where the macro is used. Each argument is expanded exactly once. */
#define RIVA_STATE3D_05(tex_offset, tex_format, tex_filter, blend, control) \
{ \
	nv_acc_cmd_dma(NV4_DX5_TEXTURE_TRIANGLE, NV4_DX5_TEXTURE_TRIANGLE_COLORKEY, 7); \
	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0;          /* word 0: Colorkey */ \
	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = tex_offset; /* word 1: Offset */ \
	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = tex_format; /* word 2: Format */ \
	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = tex_filter; /* word 3: Filter */ \
	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = blend;      /* word 4: Blend */ \
	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = control;    /* word 5: Control */ \
	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0;          /* word 6: FogColor */ \
}
33 
/* Queue one transformed-and-lit vertex in the DMA command buffer.
 *
 * Issues one nv_acc_cmd_dma() call on NV4_DX5_TEXTURE_TRIANGLE at the
 * TLVERTEX(vtx) method with a size of 8, and then appends exactly eight words
 * to si->dma_buffer, advancing si->engine.dma.current once per word.
 *
 * vtx is the vertex slot handed to NV4_DX5_TEXTURE_TRIANGLE_TLVERTEX();
 * sx and sy are stored as floats (screen X and Y). All remaining fields are
 * fixed: Z = 0.0f, RWH = 1.0f, color = 0, specular = 0, TU = TV = 0.0f.
 *
 * The same buffer is addressed through both float* and uint32* casts so that
 * each field is stored with its own representation.
 *
 * Expands to a bare compound statement, so 'si' and nv_acc_cmd_dma() must be
 * visible where the macro is used. Each argument is expanded exactly once. */
#define RIVA_VERTEX3D_05(vtx, sx, sy) \
{ \
	nv_acc_cmd_dma(NV4_DX5_TEXTURE_TRIANGLE, NV4_DX5_TEXTURE_TRIANGLE_TLVERTEX(vtx), 8); \
	((float *)(si->dma_buffer))[si->engine.dma.current++] = sx;   /* word 0: ScreenX */ \
	((float *)(si->dma_buffer))[si->engine.dma.current++] = sy;   /* word 1: ScreenY */ \
	((float *)(si->dma_buffer))[si->engine.dma.current++] = 0.0f; /* word 2: ScreenZ */ \
	((float *)(si->dma_buffer))[si->engine.dma.current++] = 1.0f; /* word 3: RWH */ \
	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0;    /* word 4: Color */ \
	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0;    /* word 5: Specular */ \
	((float *)(si->dma_buffer))[si->engine.dma.current++] = 0.0f; /* word 6: TU */ \
	((float *)(si->dma_buffer))[si->engine.dma.current++] = 0.0f; /* word 7: TV */ \
}
46 
/* Queue a draw-primitive command that renders a quad from four vertex slots.
 *
 * Issues one nv_acc_cmd_dma() call on NV4_DX5_TEXTURE_TRIANGLE at the
 * TLVDRAWPRIM(0) method with a size of 1, and then appends a single 32-bit word
 * to si->dma_buffer. That word packs six vertex indices, 4 bits apart:
 *   bits  0-11: q0, q1, q2
 *   bits 12-23: q0, q2, q3
 * The shifts only leave room for 4 bits per index; nothing here masks the
 * arguments, so callers are expected to pass values that fit.
 *
 * NOTE: q0 and q2 are each expanded twice, so arguments must not have side
 * effects.
 *
 * Expands to a bare compound statement, so 'si' and nv_acc_cmd_dma() must be
 * visible where the macro is used. */
#define RIVA_DRAWQUAD3D_05(q0, q1, q2, q3) \
{ \
	nv_acc_cmd_dma(NV4_DX5_TEXTURE_TRIANGLE, NV4_DX5_TEXTURE_TRIANGLE_TLVDRAWPRIM(0), 1); \
	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = \
		(((q3)<<20)|((q2)<<16)|((q0)<<12)|((q2)<<8)|((q1)<<4)|(q0)); /* TLVDrawPrim */ \
}
53 
54 
55 #include "nv_std.h"
56 
57 /*acceleration notes*/
58 
59 /*functions Be's app_server uses:
60 fill span (horizontal only)
61 fill rectangle (these 2 are very similar)
62 invert rectangle
63 blit
64 */
65 
66 static void nv_init_for_3D_dma(void);
67 static void nv_start_dma(void);
68 static status_t nv_acc_fifofree_dma(uint16 cmd_size);
69 static void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size);
70 static void nv_acc_set_ch_dma(uint16 ch, uint32 handle);
71 
72 /* used to track engine DMA stalls */
73 static uint8 err;
74 
75 /* wait until engine completely idle */
76 status_t nv_acc_wait_idle_dma()
77 {
78 	/* we'd better check for timeouts on the DMA engine as it's theoretically
79 	 * breakable by malfunctioning software */
80 	uint16 cnt = 0;
81 
82 	/* wait until all upcoming commands are in execution at least. Do this until
83 	 * we hit a timeout; abort if we failed at least three times before:
84 	 * if DMA stalls, we have to forget about it alltogether at some point, or
85 	 * the system will almost come to a complete halt.. */
86 	/* note:
87 	 * it doesn't matter which FIFO channel's DMA registers we access, they are in
88 	 * fact all the same set. It also doesn't matter if the channel was assigned a
89 	 * command or not. */
90 	while ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET) != (si->engine.dma.put << 2)) &&
91 			(cnt < 10000) && (err < 3))
92 	{
93 		/* snooze a bit so I do not hammer the bus */
94 		snooze (100);
95 		cnt++;
96 	}
97 
98 	/* log timeout if we had one */
99 	if (cnt == 10000)
100 	{
101 		if (err < 3) err++;
102 		LOG(4,("ACC_DMA: wait_idle; DMA timeout #%d, engine trouble!\n", err));
103 	}
104 
105 	/* wait until execution completed */
106 	while (ACCR(STATUS))
107 	{
108 		/* snooze a bit so I do not hammer the bus */
109 		snooze (100);
110 	}
111 
112 	return B_OK;
113 }
114 
115 /* AFAIK this must be done for every new screenmode.
116  * Engine required init. */
117 status_t nv_acc_init_dma()
118 {
119 	uint32 cnt;
120 	uint32 surf_depth, cmd_depth;
121 	/* reset the engine DMA stalls counter */
122 	err = 0;
123 
124 	/* a hanging engine only recovers from a complete power-down/power-up cycle */
125 	NV_REG32(NV32_PWRUPCTRL) = 0x13110011;
126 	snooze(1000);
127 	NV_REG32(NV32_PWRUPCTRL) = 0x13111111;
128 
129 	/* don't try this on NV20 and later.. */
130 	if (si->ps.card_arch < NV20A)
131 	{
132 		/* actively reset the PGRAPH registerset (acceleration engine) */
133 		for (cnt = 0x00400000; cnt < 0x00402000; cnt +=4)
134 		{
135 			NV_REG32(cnt) = 0x00000000;
136 		}
137 	}
138 
139 	/* setup PTIMER: */
140 	//fixme? how about NV28 setup as just after coldstarting? (see nv_info.c)
141 	/* set timer numerator to 8 (in b0-15) */
142 	ACCW(PT_NUMERATOR, 0x00000008);
143 	/* set timer denominator to 3 (in b0-15) */
144 	ACCW(PT_DENOMINATR, 0x00000003);
145 
146 	/* disable timer-alarm INT requests (b0) */
147 	ACCW(PT_INTEN, 0x00000000);
148 	/* reset timer-alarm INT status bit (b0) */
149 	ACCW(PT_INTSTAT, 0xffffffff);
150 
151 	/* enable PRAMIN write access on pre NV10 before programming it! */
152 	if (si->ps.card_arch == NV04A)
153 	{
154 		/* set framebuffer config: type = notiling, PRAMIN write access enabled */
155 		NV_REG32(NV32_PFB_CONFIG_0) = 0x00001114;
156 	}
157 	else
158 	{
159 		/* setup acc engine 'source' tile adressranges */
160 		ACCW(NV10_FBTIL0AD, 0);
161 		ACCW(NV10_FBTIL1AD, 0);
162 		ACCW(NV10_FBTIL2AD, 0);
163 		ACCW(NV10_FBTIL3AD, 0);
164 		ACCW(NV10_FBTIL4AD, 0);
165 		ACCW(NV10_FBTIL5AD, 0);
166 		ACCW(NV10_FBTIL6AD, 0);
167 		ACCW(NV10_FBTIL7AD, 0);
168 		ACCW(NV10_FBTIL0ED, (si->ps.memory_size - 1));
169 		ACCW(NV10_FBTIL1ED, (si->ps.memory_size - 1));
170 		ACCW(NV10_FBTIL2ED, (si->ps.memory_size - 1));
171 		ACCW(NV10_FBTIL3ED, (si->ps.memory_size - 1));
172 		ACCW(NV10_FBTIL4ED, (si->ps.memory_size - 1));
173 		ACCW(NV10_FBTIL5ED, (si->ps.memory_size - 1));
174 		ACCW(NV10_FBTIL6ED, (si->ps.memory_size - 1));
175 		ACCW(NV10_FBTIL7ED, (si->ps.memory_size - 1));
176 	}
177 
178 	/*** PRAMIN ***/
179 	/* first clear the entire RAMHT (hash-table) space to a defined state. It turns
180 	 * out at least NV11 will keep the previously programmed handles over resets and
181 	 * power-outages upto about 15 seconds!! Faulty entries might well hang the
182 	 * engine (confirmed on NV11).
183 	 * Note:
184 	 * this behaviour is not very strange: even very old DRAM chips are known to be
185 	 * able to do this, even though you should refresh them every few milliseconds or
186 	 * so. (Large memory cell capacitors, though different cells vary a lot in their
187 	 * capacity.)
188 	 * Of course data validity is not certain by a long shot over this large
189 	 * amount of time.. */
190 	for(cnt = 0; cnt < 0x0400; cnt++)
191 		NV_REG32(NVACC_HT_HANDL_00 + (cnt << 2)) = 0;
192 	/* RAMHT (hash-table) space SETUP FIFO HANDLES */
193 	/* note:
194 	 * 'instance' tells you where the engine command is stored in 'PR_CTXx_x' sets
195 	 * below: instance being b4-19 with baseadress NV_PRAMIN_CTX_0 (0x00700000).
196 	 * That command is linked to the handle noted here. This handle is then used to
197 	 * tell the FIFO to which engine command it is connected!
198 	 * (CTX registers are actually a sort of RAM space.) */
199 	if (si->ps.card_arch >= NV40A)
200 	{
201 		/* (first set) */
202 		ACCW(HT_HANDL_00, (0x80000000 | NV10_CONTEXT_SURFACES_2D)); /* 32bit handle (not used) */
203 		ACCW(HT_VALUE_00, 0x0010114c); /* instance $114c, engine = acc engine, CHID = $00 */
204 
205 		ACCW(HT_HANDL_01, (0x80000000 | NV_IMAGE_BLIT)); /* 32bit handle */
206 		ACCW(HT_VALUE_01, 0x00101148); /* instance $1148, engine = acc engine, CHID = $00 */
207 
208 		ACCW(HT_HANDL_02, (0x80000000 | NV4_GDI_RECTANGLE_TEXT)); /* 32bit handle */
209 		ACCW(HT_VALUE_02, 0x0010114a); /* instance $114a, engine = acc engine, CHID = $00 */
210 
211 		/* (second set) */
212 		ACCW(HT_HANDL_10, (0x80000000 | NV_ROP5_SOLID)); /* 32bit handle */
213 		ACCW(HT_VALUE_10, 0x00101142); /* instance $1142, engine = acc engine, CHID = $00 */
214 
215 		ACCW(HT_HANDL_11, (0x80000000 | NV_IMAGE_BLACK_RECTANGLE)); /* 32bit handle */
216 		ACCW(HT_VALUE_11, 0x00101144); /* instance $1144, engine = acc engine, CHID = $00 */
217 
218 		ACCW(HT_HANDL_12, (0x80000000 | NV_IMAGE_PATTERN)); /* 32bit handle */
219 		ACCW(HT_VALUE_12, 0x00101146); /* instance $1146, engine = acc engine, CHID = $00 */
220 	}
221 	else
222 	{
223 		/* (first set) */
224 		ACCW(HT_HANDL_00, (0x80000000 | NV4_SURFACE)); /* 32bit handle */
225 		ACCW(HT_VALUE_00, 0x80011145); /* instance $1145, engine = acc engine, CHID = $00 */
226 
227 		ACCW(HT_HANDL_01, (0x80000000 | NV_IMAGE_BLIT)); /* 32bit handle */
228 		ACCW(HT_VALUE_01, 0x80011146); /* instance $1146, engine = acc engine, CHID = $00 */
229 
230 		ACCW(HT_HANDL_02, (0x80000000 | NV4_GDI_RECTANGLE_TEXT)); /* 32bit handle */
231 		ACCW(HT_VALUE_02, 0x80011147); /* instance $1147, engine = acc engine, CHID = $00 */
232 
233 		ACCW(HT_HANDL_03, (0x80000000 | NV4_CONTEXT_SURFACES_ARGB_ZS)); /* 32bit handle (3D) */
234 		ACCW(HT_VALUE_03, 0x80011148); /* instance $1148, engine = acc engine, CHID = $00 */
235 
236 		/* NV4_ and NV10_DX5_TEXTURE_TRIANGLE should be identical */
237 		ACCW(HT_HANDL_04, (0x80000000 | NV4_DX5_TEXTURE_TRIANGLE)); /* 32bit handle (3D) */
238 		ACCW(HT_VALUE_04, 0x80011149); /* instance $1149, engine = acc engine, CHID = $00 */
239 
240 		/* NV4_ and NV10_DX6_MULTI_TEXTURE_TRIANGLE should be identical */
241 		ACCW(HT_HANDL_05, (0x80000000 | NV4_DX6_MULTI_TEXTURE_TRIANGLE)); /* 32bit handle (not used) */
242 		ACCW(HT_VALUE_05, 0x8001114a); /* instance $114a, engine = acc engine, CHID = $00 */
243 
244 		ACCW(HT_HANDL_06, (0x80000000 | NV1_RENDER_SOLID_LIN)); /* 32bit handle (not used) */
245 		ACCW(HT_VALUE_06, 0x8001114b); /* instance $114b, engine = acc engine, CHID = $00 */
246 
247 		/* (second set) */
248 		ACCW(HT_HANDL_10, (0x80000000 | NV_ROP5_SOLID)); /* 32bit handle */
249 		ACCW(HT_VALUE_10, 0x80011142); /* instance $1142, engine = acc engine, CHID = $00 */
250 
251 		ACCW(HT_HANDL_11, (0x80000000 | NV_IMAGE_BLACK_RECTANGLE)); /* 32bit handle */
252 		ACCW(HT_VALUE_11, 0x80011143); /* instance $1143, engine = acc engine, CHID = $00 */
253 
254 		ACCW(HT_HANDL_12, (0x80000000 | NV_IMAGE_PATTERN)); /* 32bit handle */
255 		ACCW(HT_VALUE_12, 0x80011144); /* instance $1144, engine = acc engine, CHID = $00 */
256 	}
257 
258 	/* program CTX registers: CTX1 is mostly done later (colorspace dependant) */
259 	/* note:
260 	 * CTX determines which HT handles point to what engine commands. */
261 	/* note also:
262 	 * CTX registers are in fact in the same GPU internal RAM space as the engine's
263 	 * hashtable. This means that stuff programmed in here also survives resets and
264 	 * power-outages! (confirmed NV11) */
265 	if (si->ps.card_arch >= NV40A)
266 	{
267 		/* setup a DMA define for use by command defines below. */
268 		ACCW(PR_CTX0_R, 0x00003000); /* DMA page table present and of linear type;
269 									  * DMA target node is NVM (non-volatile memory?)
270 									  * (instead of doing PCI or AGP transfers) */
271 		ACCW(PR_CTX1_R, (si->ps.memory_size - 1)); /* DMA limit: size is all cardRAM */
272 		ACCW(PR_CTX2_R, ((0x00000000 & 0xfffff000) | 0x00000002));
273 									 /* DMA access type is READ_AND_WRITE;
274 									  * memory starts at start of cardRAM (b12-31):
275 									  * It's adress needs to be at a 4kb boundary! */
276 		ACCW(PR_CTX3_R, 0x00000002); /* unknown (looks like this is rubbish/not needed?) */
277 		/* setup set '0' for cmd NV_ROP5_SOLID */
278 		ACCW(PR_CTX0_0, 0x02080043); /* NVclass $043, patchcfg ROP_AND, nv10+: little endian */
279 		ACCW(PR_CTX1_0, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
280 		ACCW(PR_CTX2_0, 0x00000000); /* DMA0 and DMA1 instance invalid */
281 		ACCW(PR_CTX3_0, 0x00000000); /* method traps disabled */
282 		ACCW(PR_CTX0_1, 0x00000000); /* extra */
283 		ACCW(PR_CTX1_1, 0x00000000); /* extra */
284 		/* setup set '1' for cmd NV_IMAGE_BLACK_RECTANGLE */
285 		ACCW(PR_CTX0_2, 0x02080019); /* NVclass $019, patchcfg ROP_AND, nv10+: little endian */
286 		ACCW(PR_CTX1_2, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
287 		ACCW(PR_CTX2_2, 0x00000000); /* DMA0 and DMA1 instance invalid */
288 		ACCW(PR_CTX3_2, 0x00000000); /* method traps disabled */
289 		ACCW(PR_CTX0_3, 0x00000000); /* extra */
290 		ACCW(PR_CTX1_3, 0x00000000); /* extra */
291 		/* setup set '2' for cmd NV_IMAGE_PATTERN */
292 		ACCW(PR_CTX0_4, 0x02080018); /* NVclass $018, patchcfg ROP_AND, nv10+: little endian */
293 		ACCW(PR_CTX1_4, 0x02000000); /* colorspace not set, notify instance is $0200 (b16-31) */
294 		ACCW(PR_CTX2_4, 0x00000000); /* DMA0 and DMA1 instance invalid */
295 		ACCW(PR_CTX3_4, 0x00000000); /* method traps disabled */
296 		ACCW(PR_CTX0_5, 0x00000000); /* extra */
297 		ACCW(PR_CTX1_5, 0x00000000); /* extra */
298 		/* setup set '4' for cmd NV_IMAGE_BLIT */
299 		ACCW(PR_CTX0_6, 0x0208005f); /* NVclass $05f, patchcfg ROP_AND, nv10+: little endian */
300 		ACCW(PR_CTX1_6, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
301 		ACCW(PR_CTX2_6, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
302 		ACCW(PR_CTX3_6, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
303 		ACCW(PR_CTX0_7, 0x00000000); /* extra */
304 		ACCW(PR_CTX1_7, 0x00000000); /* extra */
305 		/* setup set '5' for cmd NV4_GDI_RECTANGLE_TEXT */
306 		ACCW(PR_CTX0_8, 0x0208004a); /* NVclass $04a, patchcfg ROP_AND, nv10+: little endian */
307 		ACCW(PR_CTX1_8, 0x02000000); /* colorspace not set, notify instance is $0200 (b16-31) */
308 		ACCW(PR_CTX2_8, 0x00000000); /* DMA0 and DMA1 instance invalid */
309 		ACCW(PR_CTX3_8, 0x00000000); /* method traps disabled */
310 		ACCW(PR_CTX0_9, 0x00000000); /* extra */
311 		ACCW(PR_CTX1_9, 0x00000000); /* extra */
312 		/* setup set '6' for cmd NV10_CONTEXT_SURFACES_2D */
313 		ACCW(PR_CTX0_A, 0x02080062); /* NVclass $062, nv10+: little endian */
314 		ACCW(PR_CTX1_A, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
315 		ACCW(PR_CTX2_A, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
316 		ACCW(PR_CTX3_A, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
317 		ACCW(PR_CTX0_B, 0x00000000); /* extra */
318 		ACCW(PR_CTX1_B, 0x00000000); /* extra */
319 		/* setup DMA set pointed at by PF_CACH1_DMAI */
320 		ACCW(PR_CTX0_C, 0x00003002); /* DMA page table present and of linear type;
321 									  * DMA class is $002 (b0-11);
322 									  * DMA target node is NVM (non-volatile memory?)
323 									  * (instead of doing PCI or AGP transfers) */
324 		ACCW(PR_CTX1_C, 0x00007fff); /* DMA limit: tablesize is 32k bytes */
325 		ACCW(PR_CTX2_C, (((si->ps.memory_size - 1) & 0xffff8000) | 0x00000002));
326 									 /* DMA access type is READ_AND_WRITE;
327 									  * table is located at end of cardRAM (b12-31):
328 									  * It's adress needs to be at a 4kb boundary! */
329 	}
330 	else
331 	{
332 		/* setup a DMA define for use by command defines below. */
333 		ACCW(PR_CTX0_R, 0x00003000); /* DMA page table present and of linear type;
334 									  * DMA target node is NVM (non-volatile memory?)
335 									  * (instead of doing PCI or AGP transfers) */
336 		ACCW(PR_CTX1_R, (si->ps.memory_size - 1)); /* DMA limit: size is all cardRAM */
337 		ACCW(PR_CTX2_R, ((0x00000000 & 0xfffff000) | 0x00000002));
338 									 /* DMA access type is READ_AND_WRITE;
339 									  * memory starts at start of cardRAM (b12-31):
340 									  * It's adress needs to be at a 4kb boundary! */
341 		ACCW(PR_CTX3_R, 0x00000002); /* unknown (looks like this is rubbish/not needed?) */
342 		/* setup set '0' for cmd NV_ROP5_SOLID */
343 		ACCW(PR_CTX0_0, 0x01008043); /* NVclass $043, patchcfg ROP_AND, nv10+: little endian */
344 		ACCW(PR_CTX1_0, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
345 		ACCW(PR_CTX2_0, 0x00000000); /* DMA0 and DMA1 instance invalid */
346 		ACCW(PR_CTX3_0, 0x00000000); /* method traps disabled */
347 		/* setup set '1' for cmd NV_IMAGE_BLACK_RECTANGLE */
348 		ACCW(PR_CTX0_1, 0x01008019); /* NVclass $019, patchcfg ROP_AND, nv10+: little endian */
349 		ACCW(PR_CTX1_1, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
350 		ACCW(PR_CTX2_1, 0x00000000); /* DMA0 and DMA1 instance invalid */
351 		ACCW(PR_CTX3_1, 0x00000000); /* method traps disabled */
352 		/* setup set '2' for cmd NV_IMAGE_PATTERN */
353 		ACCW(PR_CTX0_2, 0x01008018); /* NVclass $018, patchcfg ROP_AND, nv10+: little endian */
354 		ACCW(PR_CTX1_2, 0x00000002); /* colorspace not set, notify instance is $0200 (b16-31) */
355 		ACCW(PR_CTX2_2, 0x00000000); /* DMA0 and DMA1 instance invalid */
356 		ACCW(PR_CTX3_2, 0x00000000); /* method traps disabled */
357 		/* setup set '3' for ... */
358 		if(si->ps.card_arch >= NV10A)
359 		{
360 			/* ... cmd NV10_CONTEXT_SURFACES_2D */
361 			ACCW(PR_CTX0_3, 0x01008062); /* NVclass $062, nv10+: little endian */
362 		}
363 		else
364 		{
365 			/* ... cmd NV4_SURFACE */
366 			ACCW(PR_CTX0_3, 0x01008042); /* NVclass $042, nv10+: little endian */
367 		}
368 		ACCW(PR_CTX1_3, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
369 		ACCW(PR_CTX2_3, 0x11401140); /* DMA0 instance is $1140, DMA1 instance invalid */
370 		ACCW(PR_CTX3_3, 0x00000000); /* method trap 0 is $1140, trap 1 disabled */
371 		/* setup set '4' for cmd NV_IMAGE_BLIT */
372 		ACCW(PR_CTX0_4, 0x0100805f); /* NVclass $05f, patchcfg ROP_AND, nv10+: little endian */
373 		ACCW(PR_CTX1_4, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
374 		ACCW(PR_CTX2_4, 0x11401140); /* DMA0 instance is $1140, DMA1 instance invalid */
375 		ACCW(PR_CTX3_4, 0x00000000); /* method trap 0 is $1140, trap 1 disabled */
376 		/* setup set '5' for cmd NV4_GDI_RECTANGLE_TEXT */
377 		ACCW(PR_CTX0_5, 0x0100804a); /* NVclass $04a, patchcfg ROP_AND, nv10+: little endian */
378 		ACCW(PR_CTX1_5, 0x00000002); /* colorspace not set, notify instance is $0200 (b16-31) */
379 		ACCW(PR_CTX2_5, 0x00000000); /* DMA0 and DMA1 instance invalid */
380 		ACCW(PR_CTX3_5, 0x00000000); /* method traps disabled */
381 		/* setup set '6' ... */
382 		if (si->ps.card_arch >= NV10A)
383 		{
384 			/* ... for cmd NV10_CONTEXT_SURFACES_ARGB_ZS */
385 			ACCW(PR_CTX0_6, 0x00000093); /* NVclass $093, nv10+: little endian */
386 		}
387 		else
388 		{
389 			/* ... for cmd NV4_CONTEXT_SURFACES_ARGB_ZS */
390 			ACCW(PR_CTX0_6, 0x00000053); /* NVclass $053, nv10+: little endian */
391 		}
392 		ACCW(PR_CTX1_6, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
393 		ACCW(PR_CTX2_6, 0x11401140); /* DMA0, DMA1 instance = $1140 */
394 		ACCW(PR_CTX3_6, 0x00000000); /* method traps disabled */
395 		/* setup set '7' ... */
396 		if (si->ps.card_arch >= NV10A)
397 		{
398 			/* ... for cmd NV10_DX5_TEXTURE_TRIANGLE */
399 			ACCW(PR_CTX0_7, 0x0300a094); /* NVclass $094, patchcfg ROP_AND, userclip enable,
400 										  * context surface0 valid, nv10+: little endian */
401 		}
402 		else
403 		{
404 			/* ... for cmd NV4_DX5_TEXTURE_TRIANGLE */
405 			ACCW(PR_CTX0_7, 0x0300a054); /* NVclass $054, patchcfg ROP_AND, userclip enable,
406 										  * context surface0 valid */
407 		}
408 		ACCW(PR_CTX1_7, 0x00000d01); /* format is A8RGB24, MSB mono */
409 		ACCW(PR_CTX2_7, 0x11401140); /* DMA0, DMA1 instance = $1140 */
410 		ACCW(PR_CTX3_7, 0x00000000); /* method traps disabled */
411 		/* setup set '8' ... */
412 		if (si->ps.card_arch >= NV10A)
413 		{
414 			/* ... for cmd NV10_DX6_MULTI_TEXTURE_TRIANGLE (not used) */
415 			ACCW(PR_CTX0_8, 0x0300a095); /* NVclass $095, patchcfg ROP_AND, userclip enable,
416 										  * context surface0 valid, nv10+: little endian */
417 		}
418 		else
419 		{
420 			/* ... for cmd NV4_DX6_MULTI_TEXTURE_TRIANGLE (not used) */
421 			ACCW(PR_CTX0_8, 0x0300a055); /* NVclass $055, patchcfg ROP_AND, userclip enable,
422 										  * context surface0 valid */
423 		}
424 		ACCW(PR_CTX1_8, 0x00000d01); /* format is A8RGB24, MSB mono */
425 		ACCW(PR_CTX2_8, 0x11401140); /* DMA0, DMA1 instance = $1140 */
426 		ACCW(PR_CTX3_8, 0x00000000); /* method traps disabled */
427 		/* setup set '9' for cmd NV1_RENDER_SOLID_LIN (not used) */
428 		ACCW(PR_CTX0_9, 0x0300a01c); /* NVclass $01c, patchcfg ROP_AND, userclip enable,
429 									  * context surface0 valid, nv10+: little endian */
430 		ACCW(PR_CTX1_9, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
431 		ACCW(PR_CTX2_9, 0x11401140); /* DMA0, DMA1 instance = $1140 */
432 		ACCW(PR_CTX3_9, 0x00000000); /* method traps disabled */
433 		/* setup DMA set pointed at by PF_CACH1_DMAI */
434 		if (si->engine.agp_mode)
435 		{
436 			/* DMA page table present and of linear type;
437 			 * DMA class is $002 (b0-11);
438 			 * DMA target node is AGP */
439 			ACCW(PR_CTX0_A, 0x00033002);
440 		}
441 		else
442 		{
443 			/* DMA page table present and of linear type;
444 			 * DMA class is $002 (b0-11);
445 			 * DMA target node is PCI */
446 			ACCW(PR_CTX0_A, 0x00023002);
447 		}
448 		ACCW(PR_CTX1_A, 0x000fffff); /* DMA limit: tablesize is 1M bytes */
449 		ACCW(PR_CTX2_A, (((uint32)((uint8 *)(si->dma_buffer_pci))) | 0x00000002));
450 									 /* DMA access type is READ_AND_WRITE;
451 									  * table is located in main system RAM (b12-31):
452 									  * It's adress needs to be at a 4kb boundary! */
453 
454 //3D stuff:
455 /*
456 	rud's (temp.) notes:
457 	(problem: 3D driver renders in 32bit whatever the frontbuffer space in DMA mode.)
458 	- the colorspace dependant info under 'acc engine' also sets the outcome for the
459 	  3D add-on. I don't know yet if the 3D render funcs render in the frontbuffer
460 	  space and the back-to-front blit isn't set (stays in 32bit!) (likely),
461 	  or if the 3D funcs render always in 32bit space and back-to-front blit color-
462 	  space converts... I'll try to nail this down at some point.
463 	- the colorspace dependant info under 'pramin' is needed to get the 3D related
464 	  surface commands up and running. An alternate solution would probably be calling
465 	  the surface command with the colorspace set.
466 */
467 		switch(si->dm.space)
468 		{
469 		case B_CMAP8:
470 			/* acc engine */
471 			ACCW(FORMATS, 0x00001010);
472 			if (si->ps.card_arch < NV30A)
473 				/* set depth 0-5: $1 = Y8 */
474 				ACCW(BPIXEL, 0x00111111);
475 			else
476 				/* set depth 0-1: $1 = Y8, $2 = X1R5G5B5_Z1R5G5B5 */
477 				ACCW(BPIXEL, 0x00000021);
478 			ACCW(STRD_FMT, 0x03020202);
479 			/* PRAMIN */
480 			if (si->ps.card_arch == NV04A)
481 				ACCW(PR_CTX1_6, 0x00000302); /* format is X24Y8, LSB mono */
482 			else
483 				ACCW(PR_CTX1_6, 0x00000000); /* format is invalid */
484 			ACCW(PR_CTX1_9, 0x00000302); /* format is X24Y8, LSB mono */
485 			break;
486 		case B_RGB15_LITTLE:
487 			/* acc engine */
488 			ACCW(FORMATS, 0x00002071);
489 			if (si->ps.card_arch < NV30A)
490 				/* set depth 0-5: $2 = X1R5G5B5_Z1R5G5B5, $6 = Y16 */
491 				ACCW(BPIXEL, 0x00226222);
492 			else
493 				/* set depth 0-1: $2 = X1R5G5B5_Z1R5G5B5, $4 = A1R5G5B5 */
494 				ACCW(BPIXEL, 0x00000042);
495 			ACCW(STRD_FMT, 0x09080808);
496 			/* PRAMIN */
497 			ACCW(PR_CTX1_6, 0x00000902); /* format is X17RGB15, LSB mono */
498 			ACCW(PR_CTX1_9, 0x00000902); /* format is X17RGB15, LSB mono */
499 			break;
500 		case B_RGB16_LITTLE:
501 			/* acc engine */
502 			ACCW(FORMATS, 0x000050C2);
503 			if (si->ps.card_arch < NV30A)
504 				/* set depth 0-5: $5 = R5G6B5, $6 = Y16 */
505 				ACCW(BPIXEL, 0x00556555);
506 			else
507 				/* set depth 0-1: $5 = R5G6B5, $a = X1A7R8G8B8_O1A7R8G8B8 */
508 				ACCW(BPIXEL, 0x000000a5);
509 			if (si->ps.card_arch == NV04A)
510 				ACCW(STRD_FMT, 0x0c0b0b0b);
511 			else
512 				ACCW(STRD_FMT, 0x000b0b0c);
513 			/* PRAMIN */
514 			ACCW(PR_CTX1_6, 0x00000c02); /* format is X16RGB16, LSB mono */
515 			ACCW(PR_CTX1_9, 0x00000c02); /* format is X16RGB16, LSB mono */
516 			break;
517 		case B_RGB32_LITTLE:
518 		case B_RGBA32_LITTLE:
519 			/* acc engine */
520 			ACCW(FORMATS, 0x000070e5);
521 			if (si->ps.card_arch < NV30A)
522 				/* set depth 0-5: $7 = X8R8G8B8_Z8R8G8B8, $d = Y32 */
523 				ACCW(BPIXEL, 0x0077d777);
524 			else
525 				/* set depth 0-1: $7 = X8R8G8B8_Z8R8G8B8, $e = V8YB8U8YA8 */
526 				ACCW(BPIXEL, 0x000000e7);
527 			ACCW(STRD_FMT, 0x0e0d0d0d);
528 			/* PRAMIN */
529 			ACCW(PR_CTX1_6, 0x00000e02); /* format is X8RGB24, LSB mono */
530 			ACCW(PR_CTX1_9, 0x00000e02); /* format is X8RGB24, LSB mono */
531 			break;
532 		default:
533 			LOG(8,("ACC: init, invalid bit depth\n"));
534 			return B_ERROR;
535 		}
536 //end 3D stuff.
537 	}
538 
539 	if (si->ps.card_arch == NV04A)
540 	{
541 		/* do a explicit engine reset */
542 		ACCW(DEBUG0, 0x000001ff);
543 
544 		/* init some function blocks */
545 		ACCW(DEBUG0, 0x1230c000);
546 		ACCW(DEBUG1, 0x72111101);
547 		ACCW(DEBUG2, 0x11d5f071);
548 		ACCW(DEBUG3, 0x0004ff31);
549 		/* init OP methods */
550 		ACCW(DEBUG3, 0x4004ff31);
551 
552 		/* disable all acceleration engine INT reguests */
553 		ACCW(ACC_INTE, 0x00000000);
554 		/* reset all acceration engine INT status bits */
555 		ACCW(ACC_INTS, 0xffffffff);
556 		/* context control enabled */
557 		ACCW(NV04_CTX_CTRL, 0x10010100);
558 		/* all acceleration buffers, pitches and colors are valid */
559 		ACCW(NV04_ACC_STAT, 0xffffffff);
560 		/* enable acceleration engine command FIFO */
561 		ACCW(FIFO_EN, 0x00000001);
562 
563 		/* setup location of active screen in framebuffer */
564 		ACCW(OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
565 		ACCW(OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
566 		/* setup accesible card memory range */
567 		ACCW(BLIMIT0, (si->ps.memory_size - 1));
568 		ACCW(BLIMIT1, (si->ps.memory_size - 1));
569 
570 		/* pattern shape value = 8x8, 2 color */
571 		//fixme: not needed, unless the engine has a hardware fault (setting via cmd)!
572 		//ACCW(PAT_SHP, 0x00000000);
573 		/* Pgraph Beta AND value (fraction) b23-30 */
574 		ACCW(BETA_AND_VAL, 0xffffffff);
575 	}
576 	else
577 	{
578 		/* do a explicit engine reset */
579 		ACCW(DEBUG0, 0xffffffff);
580 		ACCW(DEBUG0, 0x00000000);
581 		/* disable all acceleration engine INT reguests */
582 		ACCW(ACC_INTE, 0x00000000);
583 		/* reset all acceration engine INT status bits */
584 		ACCW(ACC_INTS, 0xffffffff);
585 		/* context control enabled */
586 		ACCW(NV10_CTX_CTRL, 0x10010100);
587 		/* all acceleration buffers, pitches and colors are valid */
588 		ACCW(NV10_ACC_STAT, 0xffffffff);
589 		/* enable acceleration engine command FIFO */
590 		ACCW(FIFO_EN, 0x00000001);
591 		/* setup surface type:
592 		 * b1-0 = %01 = surface type is non-swizzle;
593 		 * this is needed to enable 3D on NV1x (confirmed) and maybe others? */
594 		ACCW(NV10_SURF_TYP, ((ACCR(NV10_SURF_TYP)) & 0x0007ff00));
595 		ACCW(NV10_SURF_TYP, ((ACCR(NV10_SURF_TYP)) | 0x00020101));
596 	}
597 
598 	if (si->ps.card_arch == NV10A)
599 	{
600 		/* init some function blocks */
601 		ACCW(DEBUG1, 0x00118700);
602 		/* DEBUG2 has a big influence on 3D speed for NV15 (confirmed) */
603 		ACCW(DEBUG2, 0x24f82ad9);
604 		ACCW(DEBUG3, 0x55de0030);
605 
606 		/* copy tile setup stuff from 'source' to acc engine */
607 		for (cnt = 0; cnt < 32; cnt++)
608 		{
609 			NV_REG32(NVACC_NV10_TIL0AD + (cnt << 2)) =
610 				NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
611 		}
612 
613 		/* setup location of active screen in framebuffer */
614 		ACCW(OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
615 		ACCW(OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
616 		/* setup accesible card memory range */
617 		ACCW(BLIMIT0, (si->ps.memory_size - 1));
618 		ACCW(BLIMIT1, (si->ps.memory_size - 1));
619 
620 		/* pattern shape value = 8x8, 2 color */
621 		//fixme: not needed, unless the engine has a hardware fault (setting via cmd)!
622 		//ACCW(PAT_SHP, 0x00000000);
623 		/* Pgraph Beta AND value (fraction) b23-30 */
624 		ACCW(BETA_AND_VAL, 0xffffffff);
625 	}
626 
627 	if (si->ps.card_arch >= NV20A)
628 	{
629 		switch (si->ps.card_arch)
630 		{
631 		case NV40A:
632 			/* init some function blocks */
633 			ACCW(DEBUG1, 0x401287c0);
634 			ACCW(DEBUG3, 0x60de8051);
635 			/* disable specific functions, but enable SETUP_SPARE2 register */
636 			ACCW(NV10_DEBUG4, 0x00008000);
637 			/* set limit_viol_pix_adress(?): more likely something unknown.. */
638 			ACCW(NV25_WHAT0, 0x00be3c5f);
639 
640 			/* unknown.. */
641 			switch (si->ps.card_type)
642 			{
643 			case NV40:
644 			case NV45:
645 				ACCW(NV40_WHAT0, 0x83280fff);
646 				ACCW(NV40_WHAT1, 0x000000a0);
647 				ACCW(NV40_WHAT2, 0x0078e366);
648 				ACCW(NV40_WHAT3, 0x0000014c);
649 				break;
650 			case NV41:
651 				ACCW(NV40P_WHAT0, 0x83280eff);
652 				ACCW(NV40P_WHAT1, 0x000000a0);
653 				ACCW(NV40P_WHAT2, 0x007596ff);
654 				ACCW(NV40P_WHAT3, 0x00000108);
655 				break;
656 			case NV43:
657 				ACCW(NV40P_WHAT0, 0x83280eff);
658 				ACCW(NV40P_WHAT1, 0x000000a0);
659 				ACCW(NV40P_WHAT2, 0x0072cb77);
660 				ACCW(NV40P_WHAT3, 0x00000108);
661 				break;
662 			case NV44:
663 				ACCW(NV40P_WHAT0, 0x83280eff);
664 				ACCW(NV40P_WHAT1, 0x000000a0);
665 
666 				NV_REG32(NV32_NV44_WHAT10) = NV_REG32(NV32_NV10STRAPINFO);
667 				NV_REG32(NV32_NV44_WHAT11) = 0x00000000;
668 				NV_REG32(NV32_NV44_WHAT12) = 0x00000000;
669 				NV_REG32(NV32_NV44_WHAT13) = NV_REG32(NV32_NV10STRAPINFO);
670 
671 				ACCW(NV44_WHAT2, 0x00000000);
672 				ACCW(NV44_WHAT3, 0x00000000);
673 				break;
674 			default:
675 				ACCW(NV40P_WHAT0, 0x83280eff);
676 				ACCW(NV40P_WHAT1, 0x000000a0);
677 				break;
678 			}
679 
680 			ACCW(NV10_TIL3PT, 0x2ffff800);
681 			ACCW(NV10_TIL3ST, 0x00006000);
682 			ACCW(NV4X_WHAT1, 0x01000000);
683 			/* engine data source DMA instance = $1140 */
684 			ACCW(NV4X_DMA_SRC, 0x00001140);
685 			break;
686 		case NV30A:
687 			/* init some function blocks, but most is unknown.. */
688 			ACCW(DEBUG1, 0x40108700);
689 			ACCW(NV25_WHAT1, 0x00140000);
690 			ACCW(DEBUG3, 0xf00e0431);
691 			ACCW(NV10_DEBUG4, 0x00008000);
692 			ACCW(NV25_WHAT0, 0xf04b1f36);
693 			ACCW(NV20_WHAT3, 0x1002d888);
694 			ACCW(NV25_WHAT2, 0x62ff007f);
695 			break;
696 		case NV20A:
697 			/* init some function blocks, but most is unknown.. */
698 			ACCW(DEBUG1, 0x00118700);
699 			ACCW(DEBUG3, 0xf20e0431);
700 			ACCW(NV10_DEBUG4, 0x00000000);
701 			ACCW(NV20_WHAT1, 0x00000040);
702 			if (si->ps.card_type < NV25)
703 			{
704 				ACCW(NV20_WHAT2, 0x00080000);
705 				ACCW(NV10_DEBUG5, 0x00000005);
706 				ACCW(NV20_WHAT3, 0x45caa208);
707 				ACCW(NV20_WHAT4, 0x24000000);
708 				ACCW(NV20_WHAT5, 0x00000040);
709 
710 				/* copy some fixed RAM(?) configuration info(?) to some indexed registers: */
711 				/* b16-24 is select; b2-13 is adress in 32-bit words */
712 				ACCW(RDI_INDEX, 0x00e00038);
713 				/* data is 32-bit */
714 				ACCW(RDI_DATA, 0x00000030);
715 				/* copy some fixed RAM(?) configuration info(?) to some indexed registers: */
716 				/* b16-24 is select; b2-13 is adress in 32-bit words */
717 				ACCW(RDI_INDEX, 0x00e10038);
718 				/* data is 32-bit */
719 				ACCW(RDI_DATA, 0x00000030);
720 			}
721 			else
722 			{
723 				ACCW(NV25_WHAT1, 0x00080000);
724 				ACCW(NV25_WHAT0, 0x304b1fb6);
725 				ACCW(NV20_WHAT3, 0x18b82880);
726 				ACCW(NV20_WHAT4, 0x44000000);
727 				ACCW(NV20_WHAT5, 0x40000080);
728 				ACCW(NV25_WHAT2, 0x000000ff);
729 			}
730 			break;
731 		}
732 
733 		/* NV20A, NV30A and NV40A: */
734 		/* copy tile setup stuff from 'source' to acc engine (pattern colorRAM?) */
735 		for (cnt = 0; cnt < 32; cnt++)
736 		{
737 			NV_REG32(NVACC_NV20_WHAT0 + (cnt << 2)) =
738 				NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
739 		}
740 
741 		if (si->ps.card_arch >= NV40A)
742 		{
743 			if ((si->ps.card_type == NV40) || (si->ps.card_type == NV45))
744 			{
745 				/* copy some RAM configuration info(?) */
746  				ACCW(NV20_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
747 				ACCW(NV20_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
748 				ACCW(NV40_WHAT_T2, NV_REG32(NV32_PFB_CONFIG_0));
749 				ACCW(NV40_WHAT_T3, NV_REG32(NV32_PFB_CONFIG_1));
750 
751 				/* setup location of active screen in framebuffer */
752 				ACCW(NV20_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
753 				ACCW(NV20_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
754 				/* setup accesible card memory range */
755 				ACCW(NV20_BLIMIT6, (si->ps.memory_size - 1));
756 				ACCW(NV20_BLIMIT7, (si->ps.memory_size - 1));
757 			}
758 			else
759 			{
760 				/* copy some RAM configuration info(?) */
761 				ACCW(NV40P_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
762 				ACCW(NV40P_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
763 				ACCW(NV40P_WHAT_T2, NV_REG32(NV32_PFB_CONFIG_0));
764 				ACCW(NV40P_WHAT_T3, NV_REG32(NV32_PFB_CONFIG_1));
765 
766 				/* setup location of active screen in framebuffer */
767 				ACCW(NV40P_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
768 				ACCW(NV40P_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
769 				/* setup accesible card memory range */
770 				ACCW(NV40P_BLIMIT6, (si->ps.memory_size - 1));
771 				ACCW(NV40P_BLIMIT7, (si->ps.memory_size - 1));
772 			}
773 		}
774 		else /* NV20A and NV30A: */
775 		{
776 			/* copy some RAM configuration info(?) */
777 			ACCW(NV20_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
778 			ACCW(NV20_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
779 			/* copy some RAM configuration info(?) to some indexed registers: */
780 			/* b16-24 is select; b2-13 is adress in 32-bit words */
781 			ACCW(RDI_INDEX, 0x00ea0000);
782 			/* data is 32-bit */
783 			ACCW(RDI_DATA, NV_REG32(NV32_PFB_CONFIG_0));
784 			/* b16-24 is select; b2-13 is adress in 32-bit words */
785 			ACCW(RDI_INDEX, 0x00ea0004);
786 			/* data is 32-bit */
787 			ACCW(RDI_DATA, NV_REG32(NV32_PFB_CONFIG_1));
788 
789 			/* setup location of active screen in framebuffer */
790 			ACCW(NV20_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
791 			ACCW(NV20_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
792 			/* setup accesible card memory range */
793 			ACCW(NV20_BLIMIT6, (si->ps.memory_size - 1));
794 			ACCW(NV20_BLIMIT7, (si->ps.memory_size - 1));
795 		}
796 
797 		/* NV20A, NV30A and NV40A: */
798 		/* setup some acc engine tile stuff */
799 		ACCW(NV10_TIL2AD, 0x00000000);
800 		ACCW(NV10_TIL0ED, 0xffffffff);
801 	}
802 
803 	/* all cards: */
804 	/* setup clipping: rect size is 32768 x 32768, probably max. setting */
805 	/* note:
806 	 * can also be done via the NV_IMAGE_BLACK_RECTANGLE engine command. */
807 	ACCW(ABS_UCLP_XMIN, 0x00000000);
808 	ACCW(ABS_UCLP_YMIN, 0x00000000);
809 	ACCW(ABS_UCLP_XMAX, 0x00007fff);
810 	ACCW(ABS_UCLP_YMAX, 0x00007fff);
811 
812 	/*** PFIFO ***/
813 	/* (setup caches) */
814 	/* disable caches reassign */
815 	ACCW(PF_CACHES, 0x00000000);
816 	/* PFIFO mode: channel 0 is in DMA mode, channels 1 - 32 are in PIO mode */
817 	ACCW(PF_MODE, 0x00000001);
818 	/* cache1 push0 access disabled */
819 	ACCW(PF_CACH1_PSH0, 0x00000000);
820 	/* cache1 pull0 access disabled */
821 	ACCW(PF_CACH1_PUL0, 0x00000000);
822 	/* cache1 push1 mode = DMA */
823 	if (si->ps.card_arch >= NV40A)
824 		ACCW(PF_CACH1_PSH1, 0x00010000);
825 	else
826 		ACCW(PF_CACH1_PSH1, 0x00000100);
827 	/* cache1 DMA Put offset = 0 (b2-28) */
828 	ACCW(PF_CACH1_DMAP, 0x00000000);
829 	/* cache1 DMA Get offset = 0 (b2-28) */
830 	ACCW(PF_CACH1_DMAG, 0x00000000);
831 	/* cache1 DMA instance adress = $114e (b0-15);
832 	 * instance being b4-19 with baseadress NV_PRAMIN_CTX_0 (0x00700000). */
833 	/* note:
834 	 * should point to a DMA definition in CTX register space (which is sort of RAM).
835 	 * This define tells the engine where the DMA cmd buffer is and what it's size is.
836 	 * Inside that cmd buffer you'll find the actual issued engine commands. */
837 	if (si->ps.card_arch >= NV40A)
838 		ACCW(PF_CACH1_DMAI, 0x0000114e);
839 	else
840 		ACCW(PF_CACH1_DMAI, 0x0000114c);
841 	/* cache0 push0 access disabled */
842 	ACCW(PF_CACH0_PSH0, 0x00000000);
843 	/* cache0 pull0 access disabled */
844 	ACCW(PF_CACH0_PUL0, 0x00000000);
845 	/* RAM HT (hash table) baseadress = $10000 (b4-8), size = 4k,
846 	 * search = 128 (is byte offset between hash 'sets') */
847 	/* note:
848 	 * so HT base is $00710000, last is $00710fff.
849 	 * In this space you define the engine command handles (HT_HANDL_XX), which
850 	 * in turn points to the defines in CTX register space (which is sort of RAM) */
851 	ACCW(PF_RAMHT, 0x03000100);
852 	/* RAM FC baseadress = $11000 (b3-8) (size is fixed to 0.5k(?)) */
853 	/* note:
854 	 * so FC base is $00711000, last is $007111ff. (not used?) */
855 	ACCW(PF_RAMFC, 0x00000110);
856 	/* RAM RO baseadress = $11200 (b1-8), size = 0.5k */
857 	/* note:
858 	 * so RO base is $00711200, last is $007113ff. (not used?) */
859 	/* note also:
860 	 * This means(?) the PRAMIN CTX registers are accessible from base $00711400. */
861 	ACCW(PF_RAMRO, 0x00000112);
862 	/* PFIFO size: ch0-15 = 512 bytes, ch16-31 = 124 bytes */
863 	ACCW(PF_SIZE, 0x0000ffff);
864 	/* cache1 hash instance = $ffff (b0-15) */
865 	ACCW(PF_CACH1_HASH, 0x0000ffff);
866 	/* disable all PFIFO INTs */
867 	ACCW(PF_INTEN, 0x00000000);
868 	/* reset all PFIFO INT status bits */
869 	ACCW(PF_INTSTAT, 0xffffffff);
870 	/* cache0 pull0 engine = acceleration engine (graphics) */
871 	ACCW(PF_CACH0_PUL1, 0x00000001);
872 	/* cache1 DMA control: disable some stuff */
873 	ACCW(PF_CACH1_DMAC, 0x00000000);
874 	/* cache1 engine 0 upto/including 7 is software (could also be graphics or DVD) */
875 	ACCW(PF_CACH1_ENG, 0x00000000);
876 	/* cache1 DMA fetch: trigger at 128 bytes, size is 32 bytes, max requests is 15,
877 	 * use little endian */
878 	ACCW(PF_CACH1_DMAF, 0x000f0078);
879 	/* cache1 DMA push: b0 = 1: access is enabled */
880 	ACCW(PF_CACH1_DMAS, 0x00000001);
881 	/* cache1 push0 access enabled */
882 	ACCW(PF_CACH1_PSH0, 0x00000001);
883 	/* cache1 pull0 access enabled */
884 	ACCW(PF_CACH1_PUL0, 0x00000001);
885 	/* cache1 pull1 engine = acceleration engine (graphics) */
886 	ACCW(PF_CACH1_PUL1, 0x00000001);
887 	/* enable PFIFO caches reassign */
888 	ACCW(PF_CACHES, 0x00000001);
889 
890 	/* setup 3D specifics */
891 	nv_init_for_3D_dma();
892 
893 	/*** init acceleration engine command info ***/
894 	/* set object handles */
895 	/* note:
896 	 * probably depending on some other setup, there are 8 or 32 FIFO channels
897 	 * available. Assuming the current setup only has 8 channels because the 'rest'
898 	 * isn't setup here... */
899 	si->engine.fifo.handle[0] = NV_ROP5_SOLID;
900 	si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
901 	si->engine.fifo.handle[2] = NV_IMAGE_PATTERN;
902 	si->engine.fifo.handle[3] = NV4_SURFACE; /* NV10_CONTEXT_SURFACES_2D is identical */
903 	si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
904 	si->engine.fifo.handle[5] = NV4_GDI_RECTANGLE_TEXT;
905 //fixme: nolonger switching FIFO assignment for 3D as doing that causes trouble when
906 //overlay is concurrently active!!!!
907 //we can forego switching for now as we had FIFO CH6 still unused...
908 //(note btw: switching has no noticable slowdown: measured 0.2% with Quake2)
909 	si->engine.fifo.handle[6] = NV4_CONTEXT_SURFACES_ARGB_ZS;//NV1_RENDER_SOLID_LIN;
910 	si->engine.fifo.handle[7] = NV4_DX5_TEXTURE_TRIANGLE;
911 	/* preset no FIFO channels assigned to cmd's */
912 	for (cnt = 0; cnt < 0x20; cnt++)
913 	{
914 		si->engine.fifo.ch_ptr[cnt] = 0;
915 	}
916 	/* set handle's pointers to their assigned FIFO channels */
917 	/* note:
918 	 * b0-1 aren't used as adressbits. Using b0 to indicate a valid pointer. */
919 	for (cnt = 0; cnt < 0x08; cnt++)
920 	{
921 		si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
922 												(0x00000001 + (cnt * 0x00002000));
923 	}
924 
925 	/*** init DMA command buffer info ***/
926 	if (si->ps.card_arch >= NV40A) //main mem DMA buf on pre-NV40
927 	{
928 		si->dma_buffer = (void *)((char *)si->framebuffer +
929 			((si->ps.memory_size - 1) & 0xffff8000));
930 	}
931 	LOG(4,("ACC_DMA: command buffer is at adress $%08x\n",
932 		((uint32)(si->dma_buffer))));
933 	/* we have issued no DMA cmd's to the engine yet */
934 	si->engine.dma.put = 0;
935 	/* the current first free adress in the DMA buffer is at offset 0 */
936 	si->engine.dma.current = 0;
937 	/* the DMA buffer can hold 8k 32-bit words (it's 32kb in size),
938 	 * or 256k 32-bit words (1Mb in size) dependant on architecture (for now) */
939 	/* note:
940 	 * one word is reserved at the end of the DMA buffer to be able to instruct the
941 	 * engine to do a buffer wrap-around!
942 	 * (DMA opcode 'noninc method': issue word $20000000.) */
943 	if (si->ps.card_arch < NV40A)
944 		si->engine.dma.max = ((1 * 1024 * 1024) >> 2) - 1;
945 	else
946 		si->engine.dma.max = 8192 - 1;
947 	/* note the current free space we have left in the DMA buffer */
948 	si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;
949 
950 	/*** init FIFO via DMA command buffer. ***/
951 	/* wait for room in fifo for new FIFO assigment cmds if needed: */
952 	if (si->ps.card_arch >= NV40A)
953 	{
954 		if (nv_acc_fifofree_dma(12) != B_OK) return B_ERROR;
955 	}
956 	else
957 	{
958 		if (nv_acc_fifofree_dma(16) != B_OK) return B_ERROR;
959 	}
960 
961 	/* program new FIFO assignments */
962 	/* Raster OPeration: */
963 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
964 	/* Clip: */
965 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
966 	/* Pattern: */
967 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH2, si->engine.fifo.handle[2]);
968 	/* 2D Surfaces: */
969 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
970 	/* Blit: */
971 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
972 	/* Bitmap: */
973 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH5, si->engine.fifo.handle[5]);
974 	if (si->ps.card_arch < NV40A)
975 	{
976 		/* 3D surfaces: (3D related only) */
977 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH6, si->engine.fifo.handle[6]);
978 		/* Textured Triangle: (3D only) */
979 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH7, si->engine.fifo.handle[7]);
980 	}
981 
982 	/*** Set pixel width ***/
983 	switch(si->dm.space)
984 	{
985 	case B_CMAP8:
986 		surf_depth = 0x00000001;
987 		cmd_depth = 0x00000003;
988 		break;
989 	case B_RGB15_LITTLE:
990 	case B_RGB16_LITTLE:
991 		surf_depth = 0x00000004;
992 		cmd_depth = 0x00000001;
993 		break;
994 	case B_RGB32_LITTLE:
995 	case B_RGBA32_LITTLE:
996 		surf_depth = 0x00000006;
997 		cmd_depth = 0x00000003;
998 		break;
999 	default:
1000 		LOG(8,("ACC_DMA: init, invalid bit depth\n"));
1001 		return B_ERROR;
1002 	}
1003 
1004 	/* wait for room in fifo for surface setup cmd if needed */
1005 	if (nv_acc_fifofree_dma(5) != B_OK) return B_ERROR;
1006 	/* now setup 2D surface (writing 5 32bit words) */
1007 	nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 4);
1008 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = surf_depth; /* Format */
1009 	/* setup screen pitch */
1010 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1011 		((si->fbc.bytes_per_row & 0x0000ffff) | (si->fbc.bytes_per_row << 16)); /* Pitch */
1012 	/* setup screen location */
1013 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1014 		((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer); /* OffsetSource */
1015 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1016 		((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer); /* OffsetDest */
1017 
1018 	/* wait for room in fifo for pattern colordepth setup cmd if needed */
1019 	if (nv_acc_fifofree_dma(2) != B_OK) return B_ERROR;
1020 	/* set pattern colordepth (writing 2 32bit words) */
1021 	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETCOLORFORMAT, 1);
1022 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1023 
1024 	/* wait for room in fifo for bitmap colordepth setup cmd if needed */
1025 	if (nv_acc_fifofree_dma(2) != B_OK) return B_ERROR;
1026 	/* set bitmap colordepth (writing 2 32bit words) */
1027 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_SETCOLORFORMAT, 1);
1028 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1029 
1030 	/* Load our pattern into the engine: */
1031 	/* wait for room in fifo for pattern cmd if needed. */
1032 	if (nv_acc_fifofree_dma(7) != B_OK) return B_ERROR;
1033 	/* now setup pattern (writing 7 32bit words) */
1034 	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETSHAPE, 1);
1035 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* SetShape: 0 = 8x8, 1 = 64x1, 2 = 1x64 */
1036 	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETCOLOR0, 4);
1037 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetColor0 */
1038 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetColor1 */
1039 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetPattern[0] */
1040 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetPattern[1] */
1041 
1042 	/* concurrent overlay/3D 'workaround':
1043 	 * we _must_ execute a 3D command before overlay is started to prevent a hard
1044 	 * engine crash! Drawing a small rectangle (Z-only) containing rubbish. */
1045 	/* note:
1046 	 * 3D only works on pre-NV20 currently... */
1047 	if (si->ps.card_arch < NV20A)
1048 	{
1049 		/* wait for room in fifo for 3D 'workaround' cmd if needed */
1050 		if (nv_acc_fifofree_dma(50) != B_OK) return B_ERROR;
1051 
1052 		/* setup fake 3D surfaces: */
1053 		nv_acc_cmd_dma(NV4_CONTEXT_SURFACES_ARGB_ZS, NV4_CONTEXT_SURFACES_ARGB_ZS_PITCH, 3);
1054 		/* Set minimum pitch (granularity) required by hardware */
1055 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =	64 | (64 << 16); /* Pitches */
1056 		/* Place colorbuffer in Desktop */
1057 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1058 			((uint32)si->fbc.frame_buffer - (uint32)si->framebuffer); /* SetOffsetColor */
1059 		/* Place Z-buffer in Desktop */
1060 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1061 			((uint32)si->fbc.frame_buffer - (uint32)si->framebuffer); /* SetOffsetZeta */
1062 
1063 		/* Set a valid 3D state (write Z-buffer only): texture is in Desktop */
1064 		RIVA_STATE3D_05(((uint32)si->fbc.frame_buffer - (uint32)si->framebuffer),
1065 			0x11221551, 0x11000000, 0x21100162, 0x41186800);
1066 		/* Enter a small two dimensional quad */
1067 		RIVA_VERTEX3D_05(0, 0, 0);
1068 		RIVA_VERTEX3D_05(1, 16, 0);
1069 		RIVA_VERTEX3D_05(2, 16, 16);
1070 		RIVA_VERTEX3D_05(3, 0, 16);
1071 		/* Render quad */
1072 		RIVA_DRAWQUAD3D_05(0, 1, 2, 3);
1073 	}
1074 
1075 	/* tell the engine to fetch and execute all (new) commands in the DMA buffer */
1076 	nv_start_dma();
1077 
1078 	return B_OK;
1079 }
1080 
1081 static void nv_init_for_3D_dma(void)
1082 {
1083 	/* setup PGRAPH unknown registers and modify (pre-cleared) pipe stuff for 3D use */
1084 	if (si->ps.card_arch >= NV10A)
1085 	{
1086 		/* setup unknown PGRAPH stuff */
1087 		ACCW(PGWHAT_00, 0x00000000);
1088 		ACCW(PGWHAT_01, 0x00000000);
1089 		ACCW(PGWHAT_02, 0x00000000);
1090 		ACCW(PGWHAT_03, 0x00000000);
1091 
1092 		ACCW(PGWHAT_04, 0x00001000);
1093 		ACCW(PGWHAT_05, 0x00001000);
1094 		ACCW(PGWHAT_06, 0x4003ff80);
1095 
1096 		ACCW(PGWHAT_07, 0x00000000);
1097 		ACCW(PGWHAT_08, 0x00000000);
1098 		ACCW(PGWHAT_09, 0x00000000);
1099 		ACCW(PGWHAT_0A, 0x00000000);
1100 		ACCW(PGWHAT_0B, 0x00000000);
1101 
1102 		ACCW(PGWHAT_0C, 0x00080008);
1103 		ACCW(PGWHAT_0D, 0x00080008);
1104 
1105 		ACCW(PGWHAT_0E, 0x00000000);
1106 		ACCW(PGWHAT_0F, 0x00000000);
1107 		ACCW(PGWHAT_10, 0x00000000);
1108 		ACCW(PGWHAT_11, 0x00000000);
1109 		ACCW(PGWHAT_12, 0x00000000);
1110 		ACCW(PGWHAT_13, 0x00000000);
1111 		ACCW(PGWHAT_14, 0x00000000);
1112 		ACCW(PGWHAT_15, 0x00000000);
1113 		ACCW(PGWHAT_16, 0x00000000);
1114 		ACCW(PGWHAT_17, 0x00000000);
1115 		ACCW(PGWHAT_18, 0x00000000);
1116 
1117 		ACCW(PGWHAT_19, 0x10000000);
1118 
1119 		ACCW(PGWHAT_1A, 0x00000000);
1120 		ACCW(PGWHAT_1B, 0x00000000);
1121 		ACCW(PGWHAT_1C, 0x00000000);
1122 		ACCW(PGWHAT_1D, 0x00000000);
1123 		ACCW(PGWHAT_1E, 0x00000000);
1124 		ACCW(PGWHAT_1F, 0x00000000);
1125 		ACCW(PGWHAT_20, 0x00000000);
1126 		ACCW(PGWHAT_21, 0x00000000);
1127 
1128 		ACCW(PGWHAT_22, 0x08000000);
1129 
1130 		ACCW(PGWHAT_23, 0x00000000);
1131 		ACCW(PGWHAT_24, 0x00000000);
1132 		ACCW(PGWHAT_25, 0x00000000);
1133 		ACCW(PGWHAT_26, 0x00000000);
1134 
1135 		ACCW(PGWHAT_27, 0x4b7fffff);
1136 
1137 		ACCW(PGWHAT_28, 0x00000000);
1138 		ACCW(PGWHAT_29, 0x00000000);
1139 		ACCW(PGWHAT_2A, 0x00000000);
1140 
1141 		/* setup window clipping */
1142 		/* b0-11 = min; b16-27 = max.
1143 		 * note:
1144 		 * probably two's complement values, so setting to max range here:
1145 		 * which would be -2048 upto/including +2047. */
1146 		/* horizontal */
1147 		ACCW(WINCLIP_H_0, 0x07ff0800);
1148 		ACCW(WINCLIP_H_1, 0x07ff0800);
1149 		ACCW(WINCLIP_H_2, 0x07ff0800);
1150 		ACCW(WINCLIP_H_3, 0x07ff0800);
1151 		ACCW(WINCLIP_H_4, 0x07ff0800);
1152 		ACCW(WINCLIP_H_5, 0x07ff0800);
1153 		ACCW(WINCLIP_H_6, 0x07ff0800);
1154 		ACCW(WINCLIP_H_7, 0x07ff0800);
1155 		/* vertical */
1156 		ACCW(WINCLIP_V_0, 0x07ff0800);
1157 		ACCW(WINCLIP_V_1, 0x07ff0800);
1158 		ACCW(WINCLIP_V_2, 0x07ff0800);
1159 		ACCW(WINCLIP_V_3, 0x07ff0800);
1160 		ACCW(WINCLIP_V_4, 0x07ff0800);
1161 		ACCW(WINCLIP_V_5, 0x07ff0800);
1162 		ACCW(WINCLIP_V_6, 0x07ff0800);
1163 		ACCW(WINCLIP_V_7, 0x07ff0800);
1164 
1165 		/* setup (initialize) pipe:
1166 		 * needed to get valid 3D rendering on (at least) NV1x cards. Without this
1167 		 * those cards produce rubbish instead of 3D, although the engine itself keeps
1168 		 * running and 2D stays OK. */
1169 
1170 		/* set eyetype to local, lightning etc. is off */
1171 		ACCW(NV10_XFMOD0, 0x10000000);
1172 		/* disable all lights */
1173 		ACCW(NV10_XFMOD1, 0x00000000);
1174 
1175 		/* note: upon writing data into the PIPEDAT register, the PIPEADR is
1176 		 * probably auto-incremented! */
1177 		/* (pipe adress = b2-16, pipe data = b0-31) */
1178 		/* note: pipe adresses IGRAPH registers? */
1179 		ACCW(NV10_PIPEADR, 0x00006740);
1180 		ACCW(NV10_PIPEDAT, 0x00000000);
1181 		ACCW(NV10_PIPEDAT, 0x00000000);
1182 		ACCW(NV10_PIPEDAT, 0x00000000);
1183 		ACCW(NV10_PIPEDAT, 0x3f800000);
1184 
1185 		ACCW(NV10_PIPEADR, 0x00006750);
1186 		ACCW(NV10_PIPEDAT, 0x40000000);
1187 		ACCW(NV10_PIPEDAT, 0x40000000);
1188 		ACCW(NV10_PIPEDAT, 0x40000000);
1189 		ACCW(NV10_PIPEDAT, 0x40000000);
1190 
1191 		ACCW(NV10_PIPEADR, 0x00006760);
1192 		ACCW(NV10_PIPEDAT, 0x00000000);
1193 		ACCW(NV10_PIPEDAT, 0x00000000);
1194 		ACCW(NV10_PIPEDAT, 0x3f800000);
1195 		ACCW(NV10_PIPEDAT, 0x00000000);
1196 
1197 		ACCW(NV10_PIPEADR, 0x00006770);
1198 		ACCW(NV10_PIPEDAT, 0xc5000000);
1199 		ACCW(NV10_PIPEDAT, 0xc5000000);
1200 		ACCW(NV10_PIPEDAT, 0x00000000);
1201 		ACCW(NV10_PIPEDAT, 0x00000000);
1202 
1203 		ACCW(NV10_PIPEADR, 0x00006780);
1204 		ACCW(NV10_PIPEDAT, 0x00000000);
1205 		ACCW(NV10_PIPEDAT, 0x00000000);
1206 		ACCW(NV10_PIPEDAT, 0x3f800000);
1207 		ACCW(NV10_PIPEDAT, 0x00000000);
1208 
1209 		ACCW(NV10_PIPEADR, 0x000067a0);
1210 		ACCW(NV10_PIPEDAT, 0x3f800000);
1211 		ACCW(NV10_PIPEDAT, 0x3f800000);
1212 		ACCW(NV10_PIPEDAT, 0x3f800000);
1213 		ACCW(NV10_PIPEDAT, 0x3f800000);
1214 
1215 		ACCW(NV10_PIPEADR, 0x00006ab0);
1216 		ACCW(NV10_PIPEDAT, 0x3f800000);
1217 		ACCW(NV10_PIPEDAT, 0x3f800000);
1218 		ACCW(NV10_PIPEDAT, 0x3f800000);
1219 
1220 		ACCW(NV10_PIPEADR, 0x00006ac0);
1221 		ACCW(NV10_PIPEDAT, 0x00000000);
1222 		ACCW(NV10_PIPEDAT, 0x00000000);
1223 		ACCW(NV10_PIPEDAT, 0x00000000);
1224 
1225 		ACCW(NV10_PIPEADR, 0x00006c10);
1226 		ACCW(NV10_PIPEDAT, 0xbf800000);
1227 
1228 		ACCW(NV10_PIPEADR, 0x00007030);
1229 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1230 
1231 		ACCW(NV10_PIPEADR, 0x00007040);
1232 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1233 
1234 		ACCW(NV10_PIPEADR, 0x00007050);
1235 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1236 
1237 		ACCW(NV10_PIPEADR, 0x00007060);
1238 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1239 
1240 		ACCW(NV10_PIPEADR, 0x00007070);
1241 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1242 
1243 		ACCW(NV10_PIPEADR, 0x00007080);
1244 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1245 
1246 		ACCW(NV10_PIPEADR, 0x00007090);
1247 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1248 
1249 		ACCW(NV10_PIPEADR, 0x000070a0);
1250 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1251 
1252 		ACCW(NV10_PIPEADR, 0x00006a80);
1253 		ACCW(NV10_PIPEDAT, 0x00000000);
1254 		ACCW(NV10_PIPEDAT, 0x00000000);
1255 		ACCW(NV10_PIPEDAT, 0x3f800000);
1256 
1257 		ACCW(NV10_PIPEADR, 0x00006aa0);
1258 		ACCW(NV10_PIPEDAT, 0x00000000);
1259 		ACCW(NV10_PIPEDAT, 0x00000000);
1260 		ACCW(NV10_PIPEDAT, 0x00000000);
1261 
1262 		ACCW(NV10_PIPEADR, 0x00000040);
1263 		ACCW(NV10_PIPEDAT, 0x00000005);
1264 
1265 		ACCW(NV10_PIPEADR, 0x00006400);
1266 		ACCW(NV10_PIPEDAT, 0x3f800000);
1267 		ACCW(NV10_PIPEDAT, 0x3f800000);
1268 		ACCW(NV10_PIPEDAT, 0x4b7fffff);
1269 		ACCW(NV10_PIPEDAT, 0x00000000);
1270 
1271 		ACCW(NV10_PIPEADR, 0x00006410);
1272 		ACCW(NV10_PIPEDAT, 0xc5000000);
1273 		ACCW(NV10_PIPEDAT, 0xc5000000);
1274 		ACCW(NV10_PIPEDAT, 0x00000000);
1275 		ACCW(NV10_PIPEDAT, 0x00000000);
1276 
1277 		ACCW(NV10_PIPEADR, 0x00006420);
1278 		ACCW(NV10_PIPEDAT, 0x00000000);
1279 		ACCW(NV10_PIPEDAT, 0x00000000);
1280 		ACCW(NV10_PIPEDAT, 0x00000000);
1281 		ACCW(NV10_PIPEDAT, 0x00000000);
1282 
1283 		ACCW(NV10_PIPEADR, 0x00006430);
1284 		ACCW(NV10_PIPEDAT, 0x00000000);
1285 		ACCW(NV10_PIPEDAT, 0x00000000);
1286 		ACCW(NV10_PIPEDAT, 0x00000000);
1287 		ACCW(NV10_PIPEDAT, 0x00000000);
1288 
1289 		ACCW(NV10_PIPEADR, 0x000064c0);
1290 		ACCW(NV10_PIPEDAT, 0x3f800000);
1291 		ACCW(NV10_PIPEDAT, 0x3f800000);
1292 		ACCW(NV10_PIPEDAT, 0x477fffff);
1293 		ACCW(NV10_PIPEDAT, 0x3f800000);
1294 
1295 		ACCW(NV10_PIPEADR, 0x000064d0);
1296 		ACCW(NV10_PIPEDAT, 0xc5000000);
1297 		ACCW(NV10_PIPEDAT, 0xc5000000);
1298 		ACCW(NV10_PIPEDAT, 0x00000000);
1299 		ACCW(NV10_PIPEDAT, 0x00000000);
1300 
1301 		ACCW(NV10_PIPEADR, 0x000064e0);
1302 		ACCW(NV10_PIPEDAT, 0xc4fff000);
1303 		ACCW(NV10_PIPEDAT, 0xc4fff000);
1304 		ACCW(NV10_PIPEDAT, 0x00000000);
1305 		ACCW(NV10_PIPEDAT, 0x00000000);
1306 
1307 		ACCW(NV10_PIPEADR, 0x000064f0);
1308 		ACCW(NV10_PIPEDAT, 0x00000000);
1309 		ACCW(NV10_PIPEDAT, 0x00000000);
1310 		ACCW(NV10_PIPEDAT, 0x00000000);
1311 		ACCW(NV10_PIPEDAT, 0x00000000);
1312 
1313 		/* turn lightning on */
1314 		ACCW(NV10_XFMOD0, 0x30000000);
1315 		/* set light 1 to infinite type, other lights remain off */
1316 		ACCW(NV10_XFMOD1, 0x00000004);
1317 
1318 		/* Z-buffer state is:
1319 		 * initialized, set to: 'fixed point' (integer?); Z-buffer; 16bits depth */
1320 		/* note:
1321 		 * other options possible are: floating point; 24bits depth; W-buffer(?) */
1322 		ACCW(GLOB_STAT_0, 0x10000000);
1323 		/* set DMA instance 2 and 3 to be invalid */
1324 		ACCW(GLOB_STAT_1, 0x00000000);
1325 	}
1326 }
1327 
1328 static void nv_start_dma(void)
1329 {
1330 	uint32 dummy;
1331 
1332 	if (si->engine.dma.current != si->engine.dma.put)
1333 	{
1334 		si->engine.dma.put = si->engine.dma.current;
1335 		/* flush used caches so we know for sure the DMA cmd buffer received all data. */
1336 		if (si->ps.card_arch < NV40A)
1337 		{
1338 			/* some CPU's support out-of-order processing (WinChip/Cyrix). Flush them. */
1339 			__asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
1340 			/* read a non-cached adress to flush the cash */
1341 			dummy = ACCR(STATUS);
1342 		}
1343 		else
1344 		{
1345 			/* dummy read the first adress of the framebuffer to flush MTRR-WC buffers */
1346 			dummy = *((volatile uint32 *)(si->framebuffer));
1347 		}
1348 
1349 		/* actually start DMA to execute all commands now in buffer */
1350 		/* note:
1351 		 * it doesn't matter which FIFO channel's DMA registers we access, they are in
1352 		 * fact all the same set. It also doesn't matter if the channel was assigned a
1353 		 * command or not. */
1354 		/* note also:
1355 		 * NV_GENERAL_DMAPUT is a write-only register on some cards (confirmed NV11). */
1356 		NV_REG32(NVACC_FIFO + NV_GENERAL_DMAPUT) = (si->engine.dma.put << 2);
1357 	}
1358 }
1359 
1360 /* this routine does not check the engine's internal hardware FIFO, but the DMA
1361  * command buffer. You can see this as a FIFO as well, that feeds the hardware FIFO.
1362  * The hardware FIFO state is checked by the DMA hardware automatically. */
1363 static status_t nv_acc_fifofree_dma(uint16 cmd_size)
1364 {
1365 	uint32 dmaget;
1366 
1367 	/* we'd better check for timeouts on the DMA engine as it's theoretically
1368 	 * breakable by malfunctioning software */
1369 	uint16 cnt = 0;
1370 
1371 	/* check if the DMA buffer has enough room for the command.
1372 	 * note:
1373 	 * engine.dma.free is 'cached' */
1374 	while ((si->engine.dma.free < cmd_size) && (cnt < 10000) && (err < 3))
1375 	{
1376 		/* see where the engine is currently fetching from the buffer */
1377 		/* note:
1378 		 * read this only once in the code as accessing registers is relatively slow */
1379 		/* note also:
1380 		 * it doesn't matter which FIFO channel's DMA registers we access, they are in
1381 		 * fact all the same set. It also doesn't matter if the channel was assigned a
1382 		 * command or not. */
1383 		dmaget = ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET)) >> 2);
1384 
1385 		/* update timeout counter: on NV11 on a Pentium4 2.8Ghz max reached count
1386 		 * using BeRoMeter 1.2.6 was about 600; so counting 10000 before generating
1387 		 * a timeout should definately do it. Snooze()-ing cannot be done without a
1388 		 * serious speed penalty, even if done for only 1 microSecond. */
1389 		cnt++;
1390 
1391 		/* where's the engine fetching viewed from us issuing? */
1392 		if (si->engine.dma.put >= dmaget)
1393 		{
1394 			/* engine is fetching 'behind us', the last piece of the buffer is free */
1395 
1396 			/* note the 'updated' free space we have in the DMA buffer */
1397 			si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;
1398 			/* if it's enough after all we exit this routine immediately. Else: */
1399 			if (si->engine.dma.free < cmd_size)
1400 			{
1401 				/* not enough room left, so instruct DMA engine to reset the buffer
1402 				 * when it's reaching the end of it */
1403 				((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x20000000;
1404 				/* reset our buffer pointer, so new commands will be placed at the
1405 				 * beginning of the buffer. */
1406 				si->engine.dma.current = 0;
1407 				/* tell the engine to fetch the remaining command(s) in the DMA buffer
1408 				 * that where not executed before. */
1409 				nv_start_dma();
1410 
1411 				/* NOW the engine is fetching 'in front of us', so the first piece
1412 				 * of the buffer is free */
1413 
1414 				/* note the updated current free space we have in the DMA buffer */
1415 				si->engine.dma.free = dmaget - si->engine.dma.current;
1416 				/* mind this pittfall:
1417 				 * Leave some room between where the engine is fetching and where we
1418 				 * put new commands. Otherwise the engine will crash on heavy loads.
1419 				 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
1420 				 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
1421 				 * Note:
1422 				 * The engine is DMA triggered for fetching chunks every 128 bytes,
1423 				 * maybe this is the reason for this behaviour.
1424 				 * Note also:
1425 				 * it looks like the space that needs to be kept free is coupled
1426 				 * with the size of the DMA buffer. */
1427 				if (si->engine.dma.free < 256)
1428 					si->engine.dma.free = 0;
1429 				else
1430 					si->engine.dma.free -= 256;
1431 			}
1432 		}
1433 		else
1434 		{
1435 			/* engine is fetching 'in front of us', so the first piece of the buffer
1436 			 * is free */
1437 
1438 			/* note the updated current free space we have in the DMA buffer */
1439 			si->engine.dma.free = dmaget - si->engine.dma.current;
1440 			/* mind this pittfall:
1441 			 * Leave some room between where the engine is fetching and where we
1442 			 * put new commands. Otherwise the engine will crash on heavy loads.
1443 			 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
1444 			 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
1445 			 * Note:
1446 			 * The engine is DMA triggered for fetching chunks every 128 bytes,
1447 			 * maybe this is the reason for this behaviour.
1448 			 * Note also:
1449 			 * it looks like the space that needs to be kept free is coupled
1450 			 * with the size of the DMA buffer. */
1451 			if (si->engine.dma.free < 256)
1452 				si->engine.dma.free = 0;
1453 			else
1454 				si->engine.dma.free -= 256;
1455 		}
1456 	}
1457 
1458 	/* log timeout if we had one */
1459 	if (cnt == 10000)
1460 	{
1461 		if (err < 3) err++;
1462 		LOG(4,("ACC_DMA: fifofree; DMA timeout #%d, engine trouble!\n", err));
1463 	}
1464 
1465 	/* we must make the acceleration routines abort or the driver will hang! */
1466 	if (err >= 3) return B_ERROR;
1467 
1468 	return B_OK;
1469 }
1470 
1471 static void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size)
1472 {
1473 	/* NV_FIFO_DMA_OPCODE: set number of cmd words (b18 - 28); set FIFO offset for
1474 	 * first cmd word (b2 - 15); set DMA opcode = method (b29 - 31).
1475 	 * a 'NOP' is the opcode word $00000000. */
1476 	/* note:
1477 	 * possible DMA opcodes:
1478 	 * b'000' is 'method' (execute cmd);
1479 	 * b'001' is 'jump';
1480 	 * b'002' is 'noninc method' (execute buffer wrap-around);
1481 	 * b'003' is 'call': return is executed by opcode word $00020000 (b17 = 1). */
1482 	/* note also:
1483 	 * this system uses auto-increments for the FIFO offset adresses. Make sure
1484 	 * to set a new adress if a gap exists between the previous one and the new one. */
1485 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = ((size << 18) |
1486 		((si->engine.fifo.ch_ptr[cmd] + offset) & 0x0000fffc));
1487 
1488 	/* space left after issuing the current command is the cmd AND it's arguments less */
1489 	si->engine.dma.free -= (size + 1);
1490 }
1491 
1492 static void nv_acc_set_ch_dma(uint16 ch, uint32 handle)
1493 {
1494 	/* issue FIFO channel assign cmd */
1495 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = ((1 << 18) | ch);
1496 	/* set new assignment */
1497 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = (0x80000000 | handle);
1498 
1499 	/* space left after issuing the current command is the cmd AND it's arguments less */
1500 	si->engine.dma.free -= 2;
1501 }
1502 
1503 void nv_acc_assert_fifo_dma(void)
1504 {
1505 	/* does every engine cmd this accelerant needs have a FIFO channel? */
1506 	//fixme: can probably be optimized for both speed and channel selection...
1507 	if (!si->engine.fifo.ch_ptr[NV_ROP5_SOLID] ||
1508 		!si->engine.fifo.ch_ptr[NV_IMAGE_BLACK_RECTANGLE] ||
1509 		!si->engine.fifo.ch_ptr[NV_IMAGE_PATTERN] ||
1510 		!si->engine.fifo.ch_ptr[NV4_SURFACE] ||
1511 		!si->engine.fifo.ch_ptr[NV_IMAGE_BLIT] ||
1512 		!si->engine.fifo.ch_ptr[NV4_GDI_RECTANGLE_TEXT])
1513 	{
1514 		uint16 cnt;
1515 
1516 		/* free the FIFO channels we want from the currently assigned cmd's */
1517 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[0]] = 0;
1518 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[1]] = 0;
1519 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[2]] = 0;
1520 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[3]] = 0;
1521 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[4]] = 0;
1522 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[5]] = 0;
1523 
1524 		/* set new object handles */
1525 		si->engine.fifo.handle[0] = NV_ROP5_SOLID;
1526 		si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
1527 		si->engine.fifo.handle[2] = NV_IMAGE_PATTERN;
1528 		si->engine.fifo.handle[3] = NV4_SURFACE;
1529 		si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
1530 		si->engine.fifo.handle[5] = NV4_GDI_RECTANGLE_TEXT;
1531 
1532 		/* set handle's pointers to their assigned FIFO channels */
1533 		/* note:
1534 		 * b0-1 aren't used as adressbits. Using b0 to indicate a valid pointer. */
1535 		for (cnt = 0; cnt < 0x08; cnt++)
1536 		{
1537 			si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
1538 				(0x00000001 + (cnt * 0x00002000));
1539 		}
1540 
1541 		/* wait for room in fifo for new FIFO assigment cmds if needed. */
1542 		if (nv_acc_fifofree_dma(12) != B_OK) return;
1543 
1544 		/* program new FIFO assignments */
1545 		/* Raster OPeration: */
1546 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
1547 		/* Clip: */
1548 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
1549 		/* Pattern: */
1550 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH2, si->engine.fifo.handle[2]);
1551 		/* 2D Surface: */
1552 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
1553 		/* Blit: */
1554 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
1555 		/* Bitmap: */
1556 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH5, si->engine.fifo.handle[5]);
1557 
1558 		/* tell the engine to fetch and execute all (new) commands in the DMA buffer */
1559 		nv_start_dma();
1560 	}
1561 }
1562 
/*
	note:
	moved acceleration 'top-level' routines to be integrated in the engine:
	it is costly to call the engine for every single function within a loop!
	(measured with BeRoMeter 1.2.6: up to 15% speed increase on all CPUs.)

	note also:
	splitting up each command list into sublists (see routines below) prevents
	a lot more nested calls, further increasing the speed by up to 70%.

	finally:
	sending the sublist to just one single engine command increases the speed
	even further, by up to another 10%. This can't be done for blits though, as
	this engine command's hardware does not support multiple objects.
*/
1578 
1579 /* screen to screen blit - i.e. move windows around and scroll within them. */
1580 void SCREEN_TO_SCREEN_BLIT_DMA(engine_token *et, blit_params *list, uint32 count)
1581 {
1582 	uint32 i = 0;
1583 	uint16 subcnt;
1584 
1585 	/*** init acc engine for blit function ***/
1586 	/* ROP registers (Raster OPeration):
1587 	 * wait for room in fifo for ROP cmd if needed. */
1588 	if (nv_acc_fifofree_dma(2) != B_OK) return;
1589 	/* now setup ROP (writing 2 32bit words) for GXcopy */
1590 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
1591 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
1592 
1593 	/*** do each blit ***/
1594 	/* Note:
1595 	 * blit-copy direction is determined inside nvidia hardware: no setup needed */
1596 	while (count)
1597 	{
1598 		/* break up the list in sublists to minimize calls, while making sure long
1599 		 * lists still get executed without trouble */
1600 		subcnt = 32;
1601 		if (count < 32) subcnt = count;
1602 		count -= subcnt;
1603 
1604 		/* wait for room in fifo for blit cmd if needed. */
1605 		if (nv_acc_fifofree_dma(4 * subcnt) != B_OK) return;
1606 
1607 		while (subcnt--)
1608 		{
1609 			/* now setup blit (writing 4 32bit words) */
1610 			nv_acc_cmd_dma(NV_IMAGE_BLIT, NV_IMAGE_BLIT_SOURCEORG, 3);
1611 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1612 				(((list[i].src_top) << 16) | (list[i].src_left)); /* SourceOrg */
1613 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1614 				(((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */
1615 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1616 				((((list[i].height) + 1) << 16) | ((list[i].width) + 1)); /* HeightWidth */
1617 
1618 			i++;
1619 		}
1620 
1621 		/* tell the engine to fetch the commands in the DMA buffer that where not
1622 		 * executed before. */
1623 		nv_start_dma();
1624 	}
1625 
1626 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1627 	si->engine.threeD.reload = 0xffffffff;
1628 }
1629 
1630 /* rectangle fill - i.e. workspace and window background color */
1631 void FILL_RECTANGLE_DMA(engine_token *et, uint32 colorIndex, fill_rect_params *list, uint32 count)
1632 {
1633 	uint32 i = 0;
1634 	uint16 subcnt;
1635 
1636 	/*** init acc engine for fill function ***/
1637 	/* ROP registers (Raster OPeration):
1638 	 * wait for room in fifo for ROP and bitmap cmd if needed. */
1639 	if (nv_acc_fifofree_dma(4) != B_OK) return;
1640 	/* now setup ROP (writing 2 32bit words) for GXcopy */
1641 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
1642 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
1643 	/* now setup fill color (writing 2 32bit words) */
1644 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1645 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = colorIndex; /* Color1A */
1646 
1647 	/*** draw each rectangle ***/
1648 	while (count)
1649 	{
1650 		/* break up the list in sublists to minimize calls, while making sure long
1651 		 * lists still get executed without trouble */
1652 		subcnt = 32;
1653 		if (count < 32) subcnt = count;
1654 		count -= subcnt;
1655 
1656 		/* wait for room in fifo for bitmap cmd if needed. */
1657 		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
1658 
1659 		/* issue fill command once... */
1660 		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
1661 		/* ... and send multiple rects (engine cmd supports 32 max) */
1662 		while (subcnt--)
1663 		{
1664 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1665 				(((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
1666 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1667 				(((((list[i].right)+1) - (list[i].left)) << 16) |
1668 				(((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */
1669 
1670 			i++;
1671 		}
1672 
1673 		/* tell the engine to fetch the commands in the DMA buffer that where not
1674 		 * executed before. */
1675 		nv_start_dma();
1676 	}
1677 
1678 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1679 	si->engine.threeD.reload = 0xffffffff;
1680 }
1681 
1682 /* span fill - i.e. (selected) menuitem background color (Dano) */
1683 void FILL_SPAN_DMA(engine_token *et, uint32 colorIndex, uint16 *list, uint32 count)
1684 {
1685 	uint32 i = 0;
1686 	uint16 subcnt;
1687 
1688 	/*** init acc engine for fill function ***/
1689 	/* ROP registers (Raster OPeration):
1690 	 * wait for room in fifo for ROP and bitmap cmd if needed. */
1691 	if (nv_acc_fifofree_dma(4) != B_OK) return;
1692 	/* now setup ROP (writing 2 32bit words) for GXcopy */
1693 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
1694 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
1695 	/* now setup fill color (writing 2 32bit words) */
1696 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1697 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = colorIndex; /* Color1A */
1698 
1699 	/*** draw each span ***/
1700 	while (count)
1701 	{
1702 		/* break up the list in sublists to minimize calls, while making sure long
1703 		 * lists still get executed without trouble */
1704 		subcnt = 32;
1705 		if (count < 32) subcnt = count;
1706 		count -= subcnt;
1707 
1708 		/* wait for room in fifo for bitmap cmd if needed. */
1709 		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
1710 
1711 		/* issue fill command once... */
1712 		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
1713 		/* ... and send multiple rects (spans) (engine cmd supports 32 max) */
1714 		while (subcnt--)
1715 		{
1716 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1717 				(((list[i+1]) << 16) | ((list[i]) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
1718 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1719 				((((list[i+2]+1) - (list[i+1])) << 16) | 0x00000001); /* Unclipped Rect 0 WidthHeight */
1720 
1721 			i+=3;
1722 		}
1723 
1724 		/* tell the engine to fetch the commands in the DMA buffer that where not
1725 		 * executed before. */
1726 		nv_start_dma();
1727 	}
1728 
1729 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1730 	si->engine.threeD.reload = 0xffffffff;
1731 }
1732 
1733 /* rectangle invert - i.e. text cursor and text selection */
1734 void INVERT_RECTANGLE_DMA(engine_token *et, fill_rect_params *list, uint32 count)
1735 {
1736 	uint32 i = 0;
1737 	uint16 subcnt;
1738 
1739 	/*** init acc engine for invert function ***/
1740 	/* ROP registers (Raster OPeration):
1741 	 * wait for room in fifo for ROP and bitmap cmd if needed. */
1742 	if (nv_acc_fifofree_dma(4) != B_OK) return;
1743 	/* now setup ROP (writing 2 32bit words) for GXinvert */
1744 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
1745 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x55; /* SetRop5 */
1746 	/* now reset fill color (writing 2 32bit words) */
1747 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1748 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */
1749 
1750 	/*** invert each rectangle ***/
1751 	while (count)
1752 	{
1753 		/* break up the list in sublists to minimize calls, while making sure long
1754 		 * lists still get executed without trouble */
1755 		subcnt = 32;
1756 		if (count < 32) subcnt = count;
1757 		count -= subcnt;
1758 
1759 		/* wait for room in fifo for bitmap cmd if needed. */
1760 		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
1761 
1762 		/* issue fill command once... */
1763 		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
1764 		/* ... and send multiple rects (engine cmd supports 32 max) */
1765 		while (subcnt--)
1766 		{
1767 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1768 				(((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
1769 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1770 				(((((list[i].right)+1) - (list[i].left)) << 16) |
1771 				(((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */
1772 
1773 			i++;
1774 		}
1775 
1776 		/* tell the engine to fetch the commands in the DMA buffer that where not
1777 		 * executed before. */
1778 		nv_start_dma();
1779 	}
1780 
1781 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1782 	si->engine.threeD.reload = 0xffffffff;
1783 }
1784