xref: /haiku/src/add-ons/accelerants/nvidia/engine/nv_acc_dma.c (revision a4f6a81235ca2522c01f532de13cad9b729d4029)
1 /* NV Acceleration functions */
2 
3 /* Author:
4    Rudolf Cornelissen 8/2003-2/2006.
5 
6    This code was possible thanks to:
7     - the Linux XFree86 NV driver,
8     - the Linux UtahGLX 3D driver.
9 */
10 
11 #define MODULE_BIT 0x00080000
12 
13 #include "nv_std.h"
14 
15 /*acceleration notes*/
16 
17 /*functions Be's app_server uses:
18 fill span (horizontal only)
19 fill rectangle (these 2 are very similar)
20 invert rectangle
21 blit
22 */
23 
24 static void nv_init_for_3D_dma(void);
25 static void nv_start_dma(void);
26 static status_t nv_acc_fifofree_dma(uint16 cmd_size);
27 static void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size);
28 static void nv_acc_set_ch_dma(uint16 ch, uint32 handle);
29 
30 /* used to track engine DMA stalls */
31 static uint8 err;
32 
33 /* wait until engine completely idle */
34 status_t nv_acc_wait_idle_dma()
35 {
36 	/* we'd better check for timeouts on the DMA engine as it's theoretically
37 	 * breakable by malfunctioning software */
38 	uint16 cnt = 0;
39 
40 	/* wait until all upcoming commands are in execution at least. Do this until
41 	 * we hit a timeout; abort if we failed at least three times before:
42 	 * if DMA stalls, we have to forget about it alltogether at some point, or
43 	 * the system will almost come to a complete halt.. */
44 	/* note:
45 	 * it doesn't matter which FIFO channel's DMA registers we access, they are in
46 	 * fact all the same set. It also doesn't matter if the channel was assigned a
47 	 * command or not. */
48 	while ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET) != (si->engine.dma.put << 2)) &&
49 			(cnt < 10000) && (err < 3))
50 	{
51 		/* snooze a bit so I do not hammer the bus */
52 		snooze (100);
53 		cnt++;
54 	}
55 
56 	/* log timeout if we had one */
57 	if (cnt == 10000)
58 	{
59 		if (err < 3) err++;
60 		LOG(4,("ACC_DMA: wait_idle; DMA timeout #%d, engine trouble!\n", err));
61 	}
62 
63 	/* wait until execution completed */
64 	while (ACCR(STATUS))
65 	{
66 		/* snooze a bit so I do not hammer the bus */
67 		snooze (100);
68 	}
69 
70 	return B_OK;
71 }
72 
73 /* AFAIK this must be done for every new screenmode.
74  * Engine required init. */
75 status_t nv_acc_init_dma()
76 {
77 	uint32 cnt, tmp;
78 	uint32 surf_depth, cmd_depth;
79 	/* reset the engine DMA stalls counter */
80 	err = 0;
81 
82 	/* a hanging engine only recovers from a complete power-down/power-up cycle */
83 	NV_REG32(NV32_PWRUPCTRL) = 0x13110011;
84 	snooze(1000);
85 	NV_REG32(NV32_PWRUPCTRL) = 0x13111111;
86 
87 	/* don't try this on NV20 and later.. */
88 	/* note:
89 	 * the specific register that's responsible for the speedfix on NV18 is
90 	 * $00400ed8: bit 6 needs to be zero for fastest rendering (confirmed). */
91 	/* note also:
92 	 * on NV28 the following ranges could be reset (confirmed):
93 	 * $00400000 upto/incl. $004002fc;
94 	 * $00400400 upto/incl. $004017fc;
95 	 * $0040180c upto/incl. $00401948;
96 	 * $00401994 upto/incl. $00401a80;
97 	 * $00401a94 upto/incl. $00401ffc.
98 	 * The intermediate ranges hang the engine upon resetting. */
99 	if (si->ps.card_arch < NV20A)
100 	{
101 		/* actively reset the PGRAPH registerset (acceleration engine) */
102 		for (cnt = 0x00400000; cnt < 0x00402000; cnt +=4)
103 		{
104 			NV_REG32(cnt) = 0x00000000;
105 		}
106 	}
107 
108 	/* setup PTIMER: */
109 	//fixme? how about NV28 setup as just after coldstarting? (see nv_info.c)
110 	/* set timer numerator to 8 (in b0-15) */
111 	ACCW(PT_NUMERATOR, 0x00000008);
112 	/* set timer denominator to 3 (in b0-15) */
113 	ACCW(PT_DENOMINATR, 0x00000003);
114 
115 	/* disable timer-alarm INT requests (b0) */
116 	ACCW(PT_INTEN, 0x00000000);
117 	/* reset timer-alarm INT status bit (b0) */
118 	ACCW(PT_INTSTAT, 0xffffffff);
119 
120 	/* enable PRAMIN write access on pre NV10 before programming it! */
121 	if (si->ps.card_arch == NV04A)
122 	{
123 		/* set framebuffer config: type = notiling, PRAMIN write access enabled */
124 		NV_REG32(NV32_PFB_CONFIG_0) = 0x00001114;
125 	}
126 	else
127 	{
128 		/* setup acc engine 'source' tile adressranges */
129 		if ((si->ps.card_type <= NV40) || (si->ps.card_type == NV45))
130 		{
131 			ACCW(NV10_FBTIL0AD, 0);
132 			ACCW(NV10_FBTIL1AD, 0);
133 			ACCW(NV10_FBTIL2AD, 0);
134 			ACCW(NV10_FBTIL3AD, 0);
135 			ACCW(NV10_FBTIL4AD, 0);
136 			ACCW(NV10_FBTIL5AD, 0);
137 			ACCW(NV10_FBTIL6AD, 0);
138 			ACCW(NV10_FBTIL7AD, 0);
139 			ACCW(NV10_FBTIL0ED, (si->ps.memory_size - 1));
140 			ACCW(NV10_FBTIL1ED, (si->ps.memory_size - 1));
141 			ACCW(NV10_FBTIL2ED, (si->ps.memory_size - 1));
142 			ACCW(NV10_FBTIL3ED, (si->ps.memory_size - 1));
143 			ACCW(NV10_FBTIL4ED, (si->ps.memory_size - 1));
144 			ACCW(NV10_FBTIL5ED, (si->ps.memory_size - 1));
145 			ACCW(NV10_FBTIL6ED, (si->ps.memory_size - 1));
146 			ACCW(NV10_FBTIL7ED, (si->ps.memory_size - 1));
147 		}
148 		else
149 		{
150 			/* NV41, 43, 44, 47 */
151 			ACCW(NV41_FBTIL0AD, 0);
152 			ACCW(NV41_FBTIL1AD, 0);
153 			ACCW(NV41_FBTIL2AD, 0);
154 			ACCW(NV41_FBTIL3AD, 0);
155 			ACCW(NV41_FBTIL4AD, 0);
156 			ACCW(NV41_FBTIL5AD, 0);
157 			ACCW(NV41_FBTIL6AD, 0);
158 			ACCW(NV41_FBTIL7AD, 0);
159 			ACCW(NV41_FBTIL8AD, 0);
160 			ACCW(NV41_FBTIL9AD, 0);
161 			ACCW(NV41_FBTILAAD, 0);
162 			ACCW(NV41_FBTILBAD, 0);
163 			ACCW(NV41_FBTIL0ED, (si->ps.memory_size - 1));
164 			ACCW(NV41_FBTIL1ED, (si->ps.memory_size - 1));
165 			ACCW(NV41_FBTIL2ED, (si->ps.memory_size - 1));
166 			ACCW(NV41_FBTIL3ED, (si->ps.memory_size - 1));
167 			ACCW(NV41_FBTIL4ED, (si->ps.memory_size - 1));
168 			ACCW(NV41_FBTIL5ED, (si->ps.memory_size - 1));
169 			ACCW(NV41_FBTIL6ED, (si->ps.memory_size - 1));
170 			ACCW(NV41_FBTIL7ED, (si->ps.memory_size - 1));
171 			ACCW(NV41_FBTIL8ED, (si->ps.memory_size - 1));
172 			ACCW(NV41_FBTIL9ED, (si->ps.memory_size - 1));
173 			ACCW(NV41_FBTILAED, (si->ps.memory_size - 1));
174 			ACCW(NV41_FBTILBED, (si->ps.memory_size - 1));
175 
176 			if (si->ps.card_type == NV47)
177 			/* or ID == 0x01dx or ID == 0x029x: but no cards defined yet */
178 			{
179 				ACCW(NV47_FBTILCAD, 0);
180 				ACCW(NV47_FBTILDAD, 0);
181 				ACCW(NV47_FBTILEAD, 0);
182 				ACCW(NV47_FBTILCED, (si->ps.memory_size - 1));
183 				ACCW(NV47_FBTILDED, (si->ps.memory_size - 1));
184 				ACCW(NV47_FBTILEED, (si->ps.memory_size - 1));
185 			}
186 		}
187 	}
188 
189 	/*** PRAMIN ***/
190 	/* first clear the entire RAMHT (hash-table) space to a defined state. It turns
191 	 * out at least NV11 will keep the previously programmed handles over resets and
192 	 * power-outages upto about 15 seconds!! Faulty entries might well hang the
193 	 * engine (confirmed on NV11).
194 	 * Note:
195 	 * this behaviour is not very strange: even very old DRAM chips are known to be
196 	 * able to do this, even though you should refresh them every few milliseconds or
197 	 * so. (Large memory cell capacitors, though different cells vary a lot in their
198 	 * capacity.)
199 	 * Of course data validity is not certain by a long shot over this large
200 	 * amount of time.. */
201 	for(cnt = 0; cnt < 0x0400; cnt++)
202 		NV_REG32(NVACC_HT_HANDL_00 + (cnt << 2)) = 0;
203 	/* RAMHT (hash-table) space SETUP FIFO HANDLES */
204 	/* note:
205 	 * 'instance' tells you where the engine command is stored in 'PR_CTXx_x' sets
206 	 * below: instance being b4-19 with baseadress NV_PRAMIN_CTX_0 (0x00700000).
207 	 * That command is linked to the handle noted here. This handle is then used to
208 	 * tell the FIFO to which engine command it is connected!
209 	 * (CTX registers are actually a sort of RAM space.) */
210 	if (si->ps.card_arch >= NV40A)
211 	{
212 		/* (first set) */
213 		ACCW(HT_HANDL_00, (0x80000000 | NV10_CONTEXT_SURFACES_2D)); /* 32bit handle (not used) */
214 		ACCW(HT_VALUE_00, 0x0010114c); /* instance $114c, engine = acc engine, CHID = $00 */
215 
216 		ACCW(HT_HANDL_01, (0x80000000 | NV_IMAGE_BLIT)); /* 32bit handle */
217 		ACCW(HT_VALUE_01, 0x00101148); /* instance $1148, engine = acc engine, CHID = $00 */
218 
219 		ACCW(HT_HANDL_02, (0x80000000 | NV4_GDI_RECTANGLE_TEXT)); /* 32bit handle */
220 		ACCW(HT_VALUE_02, 0x0010114a); /* instance $114a, engine = acc engine, CHID = $00 */
221 
222 		/* (second set) */
223 		ACCW(HT_HANDL_10, (0x80000000 | NV_ROP5_SOLID)); /* 32bit handle */
224 		ACCW(HT_VALUE_10, 0x00101142); /* instance $1142, engine = acc engine, CHID = $00 */
225 
226 		ACCW(HT_HANDL_11, (0x80000000 | NV_IMAGE_BLACK_RECTANGLE)); /* 32bit handle */
227 		ACCW(HT_VALUE_11, 0x00101144); /* instance $1144, engine = acc engine, CHID = $00 */
228 
229 		ACCW(HT_HANDL_12, (0x80000000 | NV_IMAGE_PATTERN)); /* 32bit handle */
230 		ACCW(HT_VALUE_12, 0x00101146); /* instance $1146, engine = acc engine, CHID = $00 */
231 
232 		ACCW(HT_HANDL_13, (0x80000000 | NV_SCALED_IMAGE_FROM_MEMORY)); /* 32bit handle */
233 		ACCW(HT_VALUE_13, 0x0010114e); /* instance $114e, engine = acc engine, CHID = $00 */
234 	}
235 	else
236 	{
237 		/* (first set) */
238 		ACCW(HT_HANDL_00, (0x80000000 | NV4_SURFACE)); /* 32bit handle */
239 		ACCW(HT_VALUE_00, 0x80011145); /* instance $1145, engine = acc engine, CHID = $00 */
240 
241 		ACCW(HT_HANDL_01, (0x80000000 | NV_IMAGE_BLIT)); /* 32bit handle */
242 		ACCW(HT_VALUE_01, 0x80011146); /* instance $1146, engine = acc engine, CHID = $00 */
243 
244 		ACCW(HT_HANDL_02, (0x80000000 | NV4_GDI_RECTANGLE_TEXT)); /* 32bit handle */
245 		ACCW(HT_VALUE_02, 0x80011147); /* instance $1147, engine = acc engine, CHID = $00 */
246 
247 		ACCW(HT_HANDL_03, (0x80000000 | NV4_CONTEXT_SURFACES_ARGB_ZS)); /* 32bit handle (3D) */
248 		ACCW(HT_VALUE_03, 0x80011148); /* instance $1148, engine = acc engine, CHID = $00 */
249 
250 		/* NV4_ and NV10_DX5_TEXTURE_TRIANGLE should be identical */
251 		ACCW(HT_HANDL_04, (0x80000000 | NV4_DX5_TEXTURE_TRIANGLE)); /* 32bit handle (3D) */
252 		ACCW(HT_VALUE_04, 0x80011149); /* instance $1149, engine = acc engine, CHID = $00 */
253 
254 		/* NV4_ and NV10_DX6_MULTI_TEXTURE_TRIANGLE should be identical */
255 		ACCW(HT_HANDL_05, (0x80000000 | NV4_DX6_MULTI_TEXTURE_TRIANGLE)); /* 32bit handle (not used) */
256 		ACCW(HT_VALUE_05, 0x8001114a); /* instance $114a, engine = acc engine, CHID = $00 */
257 
258 		ACCW(HT_HANDL_06, (0x80000000 | NV1_RENDER_SOLID_LIN)); /* 32bit handle (not used) */
259 		ACCW(HT_VALUE_06, 0x8001114c); /* instance $114c, engine = acc engine, CHID = $00 */
260 
261 		/* (second set) */
262 		ACCW(HT_HANDL_10, (0x80000000 | NV_ROP5_SOLID)); /* 32bit handle */
263 		ACCW(HT_VALUE_10, 0x80011142); /* instance $1142, engine = acc engine, CHID = $00 */
264 
265 		ACCW(HT_HANDL_11, (0x80000000 | NV_IMAGE_BLACK_RECTANGLE)); /* 32bit handle */
266 		ACCW(HT_VALUE_11, 0x80011143); /* instance $1143, engine = acc engine, CHID = $00 */
267 
268 		ACCW(HT_HANDL_12, (0x80000000 | NV_IMAGE_PATTERN)); /* 32bit handle */
269 		ACCW(HT_VALUE_12, 0x80011144); /* instance $1144, engine = acc engine, CHID = $00 */
270 
271 		ACCW(HT_HANDL_13, (0x80000000 | NV_SCALED_IMAGE_FROM_MEMORY)); /* 32bit handle */
272 		ACCW(HT_VALUE_13, 0x8001114b); /* instance $114b, engine = acc engine, CHID = $00 */
273 	}
274 
275 	/* program CTX registers: CTX1 is mostly done later (colorspace dependant) */
276 	/* note:
277 	 * CTX determines which HT handles point to what engine commands. */
278 	/* note also:
279 	 * CTX registers are in fact in the same GPU internal RAM space as the engine's
280 	 * hashtable. This means that stuff programmed in here also survives resets and
281 	 * power-outages! (confirmed NV11) */
282 	if (si->ps.card_arch >= NV40A)
283 	{
284 		/* setup a DMA define for use by command defines below. */
285 		ACCW(PR_CTX0_R, 0x00003000); /* DMA page table present and of linear type;
286 									  * DMA target node is NVM (non-volatile memory?)
287 									  * (instead of doing PCI or AGP transfers) */
288 		ACCW(PR_CTX1_R, (si->ps.memory_size - 1)); /* DMA limit: size is all cardRAM */
289 		ACCW(PR_CTX2_R, ((0x00000000 & 0xfffff000) | 0x00000002));
290 									 /* DMA access type is READ_AND_WRITE;
291 									  * memory starts at start of cardRAM (b12-31):
292 									  * It's adress needs to be at a 4kb boundary! */
293 		ACCW(PR_CTX3_R, 0x00000002); /* unknown (looks like this is rubbish/not needed?) */
294 		/* setup set '0' for cmd NV_ROP5_SOLID */
295 		ACCW(PR_CTX0_0, 0x02080043); /* NVclass $043, patchcfg ROP_AND, nv10+: little endian */
296 		ACCW(PR_CTX1_0, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
297 		ACCW(PR_CTX2_0, 0x00000000); /* DMA0 and DMA1 instance invalid */
298 		ACCW(PR_CTX3_0, 0x00000000); /* method traps disabled */
299 		ACCW(PR_CTX0_1, 0x00000000); /* extra */
300 		ACCW(PR_CTX1_1, 0x00000000); /* extra */
301 		/* setup set '1' for cmd NV_IMAGE_BLACK_RECTANGLE */
302 		ACCW(PR_CTX0_2, 0x02080019); /* NVclass $019, patchcfg ROP_AND, nv10+: little endian */
303 		ACCW(PR_CTX1_2, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
304 		ACCW(PR_CTX2_2, 0x00000000); /* DMA0 and DMA1 instance invalid */
305 		ACCW(PR_CTX3_2, 0x00000000); /* method traps disabled */
306 		ACCW(PR_CTX0_3, 0x00000000); /* extra */
307 		ACCW(PR_CTX1_3, 0x00000000); /* extra */
308 		/* setup set '2' for cmd NV_IMAGE_PATTERN */
309 		ACCW(PR_CTX0_4, 0x02080018); /* NVclass $018, patchcfg ROP_AND, nv10+: little endian */
310 		ACCW(PR_CTX1_4, 0x02000000); /* colorspace not set, notify instance is $0200 (b16-31) */
311 		ACCW(PR_CTX2_4, 0x00000000); /* DMA0 and DMA1 instance invalid */
312 		ACCW(PR_CTX3_4, 0x00000000); /* method traps disabled */
313 		ACCW(PR_CTX0_5, 0x00000000); /* extra */
314 		ACCW(PR_CTX1_5, 0x00000000); /* extra */
315 		/* setup set '4' for cmd NV_IMAGE_BLIT */
316 		ACCW(PR_CTX0_6, 0x0208005f); /* NVclass $05f, patchcfg ROP_AND, nv10+: little endian */
317 		ACCW(PR_CTX1_6, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
318 		ACCW(PR_CTX2_6, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
319 		ACCW(PR_CTX3_6, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
320 		ACCW(PR_CTX0_7, 0x00000000); /* extra */
321 		ACCW(PR_CTX1_7, 0x00000000); /* extra */
322 		/* setup set '5' for cmd NV4_GDI_RECTANGLE_TEXT */
323 		ACCW(PR_CTX0_8, 0x0208004a); /* NVclass $04a, patchcfg ROP_AND, nv10+: little endian */
324 		ACCW(PR_CTX1_8, 0x02000000); /* colorspace not set, notify instance is $0200 (b16-31) */
325 		ACCW(PR_CTX2_8, 0x00000000); /* DMA0 and DMA1 instance invalid */
326 		ACCW(PR_CTX3_8, 0x00000000); /* method traps disabled */
327 		ACCW(PR_CTX0_9, 0x00000000); /* extra */
328 		ACCW(PR_CTX1_9, 0x00000000); /* extra */
329 		/* setup set '6' for cmd NV10_CONTEXT_SURFACES_2D */
330 		ACCW(PR_CTX0_A, 0x02080062); /* NVclass $062, nv10+: little endian */
331 		ACCW(PR_CTX1_A, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
332 		ACCW(PR_CTX2_A, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
333 		ACCW(PR_CTX3_A, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
334 		ACCW(PR_CTX0_B, 0x00000000); /* extra */
335 		ACCW(PR_CTX1_B, 0x00000000); /* extra */
336 		/* setup set '7' for cmd NV_SCALED_IMAGE_FROM_MEMORY */
337 		ACCW(PR_CTX0_C, 0x02080077); /* NVclass $077, nv10+: little endian */
338 		ACCW(PR_CTX1_C, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
339 		ACCW(PR_CTX2_C, 0x00001140); /* DMA0 instance is $1140, DMA1 instance invalid */
340 		ACCW(PR_CTX3_C, 0x00001140); /* method trap 0 is $1140, trap 1 disabled */
341 		ACCW(PR_CTX0_D, 0x00000000); /* extra */
342 		ACCW(PR_CTX1_D, 0x00000000); /* extra */
343 		/* setup DMA set pointed at by PF_CACH1_DMAI */
344 		ACCW(PR_CTX0_E, 0x00003002); /* DMA page table present and of linear type;
345 									  * DMA class is $002 (b0-11);
346 									  * DMA target node is NVM (non-volatile memory?)
347 									  * (instead of doing PCI or AGP transfers) */
348 		ACCW(PR_CTX1_E, 0x00007fff); /* DMA limit: tablesize is 32k bytes */
349 		ACCW(PR_CTX2_E, (((si->ps.memory_size - 1) & 0xffff8000) | 0x00000002));
350 									 /* DMA access type is READ_AND_WRITE;
351 									  * table is located at end of cardRAM (b12-31):
352 									  * It's adress needs to be at a 4kb boundary! */
353 	}
354 	else
355 	{
356 		/* setup a DMA define for use by command defines below. */
357 		ACCW(PR_CTX0_R, 0x00003000); /* DMA page table present and of linear type;
358 									  * DMA target node is NVM (non-volatile memory?)
359 									  * (instead of doing PCI or AGP transfers) */
360 		ACCW(PR_CTX1_R, (si->ps.memory_size - 1)); /* DMA limit: size is all cardRAM */
361 		ACCW(PR_CTX2_R, ((0x00000000 & 0xfffff000) | 0x00000002));
362 									 /* DMA access type is READ_AND_WRITE;
363 									  * memory starts at start of cardRAM (b12-31):
364 									  * It's adress needs to be at a 4kb boundary! */
365 		ACCW(PR_CTX3_R, 0x00000002); /* unknown (looks like this is rubbish/not needed?) */
366 		/* setup set '0' for cmd NV_ROP5_SOLID */
367 		ACCW(PR_CTX0_0, 0x01008043); /* NVclass $043, patchcfg ROP_AND, nv10+: little endian */
368 		ACCW(PR_CTX1_0, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
369 		ACCW(PR_CTX2_0, 0x00000000); /* DMA0 and DMA1 instance invalid */
370 		ACCW(PR_CTX3_0, 0x00000000); /* method traps disabled */
371 		/* setup set '1' for cmd NV_IMAGE_BLACK_RECTANGLE */
372 		ACCW(PR_CTX0_1, 0x01008019); /* NVclass $019, patchcfg ROP_AND, nv10+: little endian */
373 		ACCW(PR_CTX1_1, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
374 		ACCW(PR_CTX2_1, 0x00000000); /* DMA0 and DMA1 instance invalid */
375 		ACCW(PR_CTX3_1, 0x00000000); /* method traps disabled */
376 		/* setup set '2' for cmd NV_IMAGE_PATTERN */
377 		ACCW(PR_CTX0_2, 0x01008018); /* NVclass $018, patchcfg ROP_AND, nv10+: little endian */
378 		ACCW(PR_CTX1_2, 0x00000002); /* colorspace not set, notify instance is $0200 (b16-31) */
379 		ACCW(PR_CTX2_2, 0x00000000); /* DMA0 and DMA1 instance invalid */
380 		ACCW(PR_CTX3_2, 0x00000000); /* method traps disabled */
381 		/* setup set '3' for ... */
382 		if(si->ps.card_arch >= NV10A)
383 		{
384 			/* ... cmd NV10_CONTEXT_SURFACES_2D */
385 			ACCW(PR_CTX0_3, 0x01008062); /* NVclass $062, nv10+: little endian */
386 		}
387 		else
388 		{
389 			/* ... cmd NV4_SURFACE */
390 			ACCW(PR_CTX0_3, 0x01008042); /* NVclass $042, nv10+: little endian */
391 		}
392 		ACCW(PR_CTX1_3, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
393 		ACCW(PR_CTX2_3, 0x11401140); /* DMA0 instance is $1140, DMA1 instance invalid */
394 		ACCW(PR_CTX3_3, 0x00000000); /* method trap 0 is $1140, trap 1 disabled */
395 		/* setup set '4' for cmd NV_IMAGE_BLIT */
396 		ACCW(PR_CTX0_4, 0x0100805f); /* NVclass $05f, patchcfg ROP_AND, nv10+: little endian */
397 		ACCW(PR_CTX1_4, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
398 		ACCW(PR_CTX2_4, 0x11401140); /* DMA0 instance is $1140, DMA1 instance invalid */
399 		ACCW(PR_CTX3_4, 0x00000000); /* method trap 0 is $1140, trap 1 disabled */
400 		/* setup set '5' for cmd NV4_GDI_RECTANGLE_TEXT */
401 		ACCW(PR_CTX0_5, 0x0100804a); /* NVclass $04a, patchcfg ROP_AND, nv10+: little endian */
402 		ACCW(PR_CTX1_5, 0x00000002); /* colorspace not set, notify instance is $0200 (b16-31) */
403 		ACCW(PR_CTX2_5, 0x00000000); /* DMA0 and DMA1 instance invalid */
404 		ACCW(PR_CTX3_5, 0x00000000); /* method traps disabled */
405 		/* setup set '6' ... */
406 		if (si->ps.card_arch >= NV10A)
407 		{
408 			/* ... for cmd NV10_CONTEXT_SURFACES_ARGB_ZS */
409 			ACCW(PR_CTX0_6, 0x00000093); /* NVclass $093, nv10+: little endian */
410 		}
411 		else
412 		{
413 			/* ... for cmd NV4_CONTEXT_SURFACES_ARGB_ZS */
414 			ACCW(PR_CTX0_6, 0x00000053); /* NVclass $053, nv10+: little endian */
415 		}
416 		ACCW(PR_CTX1_6, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
417 		ACCW(PR_CTX2_6, 0x11401140); /* DMA0, DMA1 instance = $1140 */
418 		ACCW(PR_CTX3_6, 0x00000000); /* method traps disabled */
419 		/* setup set '7' ... */
420 		if (si->ps.card_arch >= NV10A)
421 		{
422 			/* ... for cmd NV10_DX5_TEXTURE_TRIANGLE */
423 			ACCW(PR_CTX0_7, 0x0300a094); /* NVclass $094, patchcfg ROP_AND, userclip enable,
424 										  * context surface0 valid, nv10+: little endian */
425 		}
426 		else
427 		{
428 			/* ... for cmd NV4_DX5_TEXTURE_TRIANGLE */
429 			ACCW(PR_CTX0_7, 0x0300a054); /* NVclass $054, patchcfg ROP_AND, userclip enable,
430 										  * context surface0 valid */
431 		}
432 		ACCW(PR_CTX1_7, 0x00000d01); /* format is A8RGB24, MSB mono */
433 		ACCW(PR_CTX2_7, 0x11401140); /* DMA0, DMA1 instance = $1140 */
434 		ACCW(PR_CTX3_7, 0x00000000); /* method traps disabled */
435 		/* setup set '8' ... */
436 		if (si->ps.card_arch >= NV10A)
437 		{
438 			/* ... for cmd NV10_DX6_MULTI_TEXTURE_TRIANGLE (not used) */
439 			ACCW(PR_CTX0_8, 0x0300a095); /* NVclass $095, patchcfg ROP_AND, userclip enable,
440 										  * context surface0 valid, nv10+: little endian */
441 		}
442 		else
443 		{
444 			/* ... for cmd NV4_DX6_MULTI_TEXTURE_TRIANGLE (not used) */
445 			ACCW(PR_CTX0_8, 0x0300a055); /* NVclass $055, patchcfg ROP_AND, userclip enable,
446 										  * context surface0 valid */
447 		}
448 		ACCW(PR_CTX1_8, 0x00000d01); /* format is A8RGB24, MSB mono */
449 		ACCW(PR_CTX2_8, 0x11401140); /* DMA0, DMA1 instance = $1140 */
450 		ACCW(PR_CTX3_8, 0x00000000); /* method traps disabled */
451 		/* setup set '9' for cmd NV_SCALED_IMAGE_FROM_MEMORY */
452 		ACCW(PR_CTX0_9, 0x01018077); /* NVclass $077, patchcfg SRC_COPY,
453 									  * context surface0 valid, nv10+: little endian */
454 		ACCW(PR_CTX1_9, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
455 		ACCW(PR_CTX2_9, 0x11401140); /* DMA0, DMA1 instance = $1140 */
456 		ACCW(PR_CTX3_9, 0x00000000); /* method traps disabled */
457 		/* setup set 'A' for cmd NV1_RENDER_SOLID_LIN (not used) */
458 		ACCW(PR_CTX0_A, 0x0300a01c); /* NVclass $01c, patchcfg ROP_AND, userclip enable,
459 									  * context surface0 valid, nv10+: little endian */
460 		ACCW(PR_CTX1_A, 0x00000000); /* colorspace not set, notify instance invalid (b16-31) */
461 		ACCW(PR_CTX2_A, 0x11401140); /* DMA0, DMA1 instance = $1140 */
462 		ACCW(PR_CTX3_A, 0x00000000); /* method traps disabled */
463 		/* setup DMA set pointed at by PF_CACH1_DMAI */
464 		if (si->engine.agp_mode)
465 		{
466 			/* DMA page table present and of linear type;
467 			 * DMA class is $002 (b0-11);
468 			 * DMA target node is AGP */
469 			ACCW(PR_CTX0_B, 0x00033002);
470 		}
471 		else
472 		{
473 			/* DMA page table present and of linear type;
474 			 * DMA class is $002 (b0-11);
475 			 * DMA target node is PCI */
476 			ACCW(PR_CTX0_B, 0x00023002);
477 		}
478 		ACCW(PR_CTX1_B, 0x000fffff); /* DMA limit: tablesize is 1M bytes */
479 		ACCW(PR_CTX2_B, (((uint32)((uint8 *)(si->dma_buffer_pci))) | 0x00000002));
480 									 /* DMA access type is READ_AND_WRITE;
481 									  * table is located in main system RAM (b12-31):
482 									  * It's adress needs to be at a 4kb boundary! */
483 
484 //3D stuff:
485 /*
486 	rud's (temp.) notes:
487 	(problem: 3D driver renders in 32bit whatever the frontbuffer space in DMA mode.)
488 	- the colorspace dependant info under 'acc engine' also sets the outcome for the
489 	  3D add-on. I don't know yet if the 3D render funcs render in the frontbuffer
490 	  space and the back-to-front blit isn't set (stays in 32bit!) (likely),
491 	  or if the 3D funcs render always in 32bit space and back-to-front blit color-
492 	  space converts... I'll try to nail this down at some point.
493 	- the colorspace dependant info under 'pramin' is needed to get the 3D related
494 	  surface commands up and running. An alternate solution would probably be calling
495 	  the surface command with the colorspace set.
496 */
497 		switch(si->dm.space)
498 		{
499 		case B_CMAP8:
500 			/* acc engine */
501 			ACCW(FORMATS, 0x00001010);
502 			if (si->ps.card_arch < NV30A)
503 				/* set depth 0-5: $1 = Y8 */
504 				ACCW(BPIXEL, 0x00111111);
505 			else
506 				/* set depth 0-1: $1 = Y8, $2 = X1R5G5B5_Z1R5G5B5 */
507 				ACCW(BPIXEL, 0x00000021);
508 			ACCW(STRD_FMT, 0x03020202);
509 			/* PRAMIN */
510 			if (si->ps.card_arch == NV04A)
511 				ACCW(PR_CTX1_6, 0x00000302); /* format is X24Y8, LSB mono */
512 			else
513 				ACCW(PR_CTX1_6, 0x00000000); /* format is invalid */
514 			ACCW(PR_CTX1_A, 0x00000302); /* format is X24Y8, LSB mono */
515 			break;
516 		case B_RGB15_LITTLE:
517 			/* acc engine */
518 			ACCW(FORMATS, 0x00002071);
519 			if (si->ps.card_arch < NV30A)
520 				/* set depth 0-5: $2 = X1R5G5B5_Z1R5G5B5, $6 = Y16 */
521 				ACCW(BPIXEL, 0x00226222);
522 			else
523 				/* set depth 0-1: $2 = X1R5G5B5_Z1R5G5B5, $4 = A1R5G5B5 */
524 				ACCW(BPIXEL, 0x00000042);
525 			ACCW(STRD_FMT, 0x09080808);
526 			/* PRAMIN */
527 			ACCW(PR_CTX1_6, 0x00000902); /* format is X17RGB15, LSB mono */
528 			ACCW(PR_CTX1_A, 0x00000902); /* format is X17RGB15, LSB mono */
529 			break;
530 		case B_RGB16_LITTLE:
531 			/* acc engine */
532 			ACCW(FORMATS, 0x000050C2);
533 			if (si->ps.card_arch < NV30A)
534 				/* set depth 0-5: $5 = R5G6B5, $6 = Y16 */
535 				ACCW(BPIXEL, 0x00556555);
536 			else
537 				/* set depth 0-1: $5 = R5G6B5, $a = X1A7R8G8B8_O1A7R8G8B8 */
538 				ACCW(BPIXEL, 0x000000a5);
539 			if (si->ps.card_arch == NV04A)
540 				ACCW(STRD_FMT, 0x0c0b0b0b);
541 			else
542 				ACCW(STRD_FMT, 0x000b0b0c);
543 			/* PRAMIN */
544 			ACCW(PR_CTX1_6, 0x00000c02); /* format is X16RGB16, LSB mono */
545 			ACCW(PR_CTX1_A, 0x00000c02); /* format is X16RGB16, LSB mono */
546 			break;
547 		case B_RGB32_LITTLE:
548 		case B_RGBA32_LITTLE:
549 			/* acc engine */
550 			ACCW(FORMATS, 0x000070e5);
551 			if (si->ps.card_arch < NV30A)
552 				/* set depth 0-5: $7 = X8R8G8B8_Z8R8G8B8, $d = Y32 */
553 				ACCW(BPIXEL, 0x0077d777);
554 			else
555 				/* set depth 0-1: $7 = X8R8G8B8_Z8R8G8B8, $e = V8YB8U8YA8 */
556 				ACCW(BPIXEL, 0x000000e7);
557 			ACCW(STRD_FMT, 0x0e0d0d0d);
558 			/* PRAMIN */
559 			ACCW(PR_CTX1_6, 0x00000e02); /* format is X8RGB24, LSB mono */
560 			ACCW(PR_CTX1_A, 0x00000e02); /* format is X8RGB24, LSB mono */
561 			break;
562 		default:
563 			LOG(8,("ACC: init, invalid bit depth\n"));
564 			return B_ERROR;
565 		}
566 //end 3D stuff.
567 	}
568 
569 	if (si->ps.card_arch == NV04A)
570 	{
571 		/* do a explicit engine reset */
572 		ACCW(DEBUG0, 0x000001ff);
573 
574 		/* init some function blocks */
575 		ACCW(DEBUG0, 0x1230c000);
576 		ACCW(DEBUG1, 0x72111101);
577 		ACCW(DEBUG2, 0x11d5f071);
578 		ACCW(DEBUG3, 0x0004ff31);
579 		/* init OP methods */
580 		ACCW(DEBUG3, 0x4004ff31);
581 
582 		/* disable all acceleration engine INT reguests */
583 		ACCW(ACC_INTE, 0x00000000);
584 		/* reset all acceration engine INT status bits */
585 		ACCW(ACC_INTS, 0xffffffff);
586 		/* context control enabled */
587 		ACCW(NV04_CTX_CTRL, 0x10010100);
588 		/* all acceleration buffers, pitches and colors are valid */
589 		ACCW(NV04_ACC_STAT, 0xffffffff);
590 		/* enable acceleration engine command FIFO */
591 		ACCW(FIFO_EN, 0x00000001);
592 
593 		/* setup location of active screen in framebuffer */
594 		ACCW(OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
595 		ACCW(OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
596 		/* setup accesible card memory range */
597 		ACCW(BLIMIT0, (si->ps.memory_size - 1));
598 		ACCW(BLIMIT1, (si->ps.memory_size - 1));
599 
600 		/* pattern shape value = 8x8, 2 color */
601 		//fixme: not needed, unless the engine has a hardware fault (setting via cmd)!
602 		//ACCW(PAT_SHP, 0x00000000);
603 		/* Pgraph Beta AND value (fraction) b23-30 */
604 		ACCW(BETA_AND_VAL, 0xffffffff);
605 	}
606 	else
607 	{
608 		/* do a explicit engine reset */
609 		ACCW(DEBUG0, 0xffffffff);
610 		ACCW(DEBUG0, 0x00000000);
611 		/* disable all acceleration engine INT reguests */
612 		ACCW(ACC_INTE, 0x00000000);
613 		/* reset all acceration engine INT status bits */
614 		ACCW(ACC_INTS, 0xffffffff);
615 		/* context control enabled */
616 		ACCW(NV10_CTX_CTRL, 0x10010100);
617 		/* all acceleration buffers, pitches and colors are valid */
618 		ACCW(NV10_ACC_STAT, 0xffffffff);
619 		/* enable acceleration engine command FIFO */
620 		ACCW(FIFO_EN, 0x00000001);
621 		/* setup surface type:
622 		 * b1-0 = %01 = surface type is non-swizzle;
623 		 * this is needed to enable 3D on NV1x (confirmed) and maybe others? */
624 		ACCW(NV10_SURF_TYP, ((ACCR(NV10_SURF_TYP)) & 0x0007ff00));
625 		ACCW(NV10_SURF_TYP, ((ACCR(NV10_SURF_TYP)) | 0x00020101));
626 	}
627 
628 	if (si->ps.card_arch == NV10A)
629 	{
630 		/* init some function blocks */
631 		ACCW(DEBUG1, 0x00118700);
632 		/* DEBUG2 has a big influence on 3D speed for NV15 (confirmed) */
633 		ACCW(DEBUG2, 0x24f82ad9);
634 		ACCW(DEBUG3, 0x55de0030);
635 
636 		/* copy tile setup stuff from 'source' to acc engine */
637 		for (cnt = 0; cnt < 32; cnt++)
638 		{
639 			NV_REG32(NVACC_NV10_TIL0AD + (cnt << 2)) =
640 				NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
641 		}
642 
643 		/* setup location of active screen in framebuffer */
644 		ACCW(OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
645 		ACCW(OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
646 		/* setup accesible card memory range */
647 		ACCW(BLIMIT0, (si->ps.memory_size - 1));
648 		ACCW(BLIMIT1, (si->ps.memory_size - 1));
649 
650 		/* pattern shape value = 8x8, 2 color */
651 		//fixme: not needed, unless the engine has a hardware fault (setting via cmd)!
652 		//ACCW(PAT_SHP, 0x00000000);
653 		/* Pgraph Beta AND value (fraction) b23-30 */
654 		ACCW(BETA_AND_VAL, 0xffffffff);
655 	}
656 
657 	if (si->ps.card_arch >= NV20A)
658 	{
659 		switch (si->ps.card_arch)
660 		{
661 		case NV40A:
662 			/* init some function blocks */
663 			ACCW(DEBUG1, 0x401287c0);
664 			ACCW(DEBUG3, 0x60de8051);
665 			/* disable specific functions, but enable SETUP_SPARE2 register */
666 			ACCW(NV10_DEBUG4, 0x00008000);
667 			/* set limit_viol_pix_adress(?): more likely something unknown.. */
668 			ACCW(NV25_WHAT0, 0x00be3c5f);
669 
670 			/* setup some unknown serially accessed registers (?) */
671 			tmp = (NV_REG32(NV32_NV4X_WHAT0) & 0x000000ff);
672 			for (cnt = 0; (tmp && !(tmp & 0x00000001)); tmp >>= 1, cnt++);
673 			{
674 				ACCW(NV4X_WHAT2, cnt);
675 			}
676 
677 			/* unknown.. */
678 			switch (si->ps.card_type)
679 			{
680 			case NV40:
681 			case NV45:
682 			/* and NV48: but these are pgm'd as NV45 currently */
683 				ACCW(NV40_WHAT0, 0x83280fff);
684 				ACCW(NV40_WHAT1, 0x000000a0);
685 				ACCW(NV40_WHAT2, 0x0078e366);
686 				ACCW(NV40_WHAT3, 0x0000014c);
687 				break;
688 			case NV41:
689 			/* and ID == 0x012x: but no cards defined yet */
690 				ACCW(NV40P_WHAT0, 0x83280eff);
691 				ACCW(NV40P_WHAT1, 0x000000a0);
692 				ACCW(NV40P_WHAT2, 0x007596ff);
693 				ACCW(NV40P_WHAT3, 0x00000108);
694 				break;
695 			case NV43:
696 				ACCW(NV40P_WHAT0, 0x83280eff);
697 				ACCW(NV40P_WHAT1, 0x000000a0);
698 				ACCW(NV40P_WHAT2, 0x0072cb77);
699 				ACCW(NV40P_WHAT3, 0x00000108);
700 				break;
701 			case NV44:
702 			/* and ID == 0x01dx: but no cards defined yet */
703 				ACCW(NV40P_WHAT0, 0x83280eff);
704 				ACCW(NV40P_WHAT1, 0x000000a0);
705 
706 				NV_REG32(NV32_NV44_WHAT10) = NV_REG32(NV32_NV10STRAPINFO);
707 				NV_REG32(NV32_NV44_WHAT11) = 0x00000000;
708 				NV_REG32(NV32_NV44_WHAT12) = 0x00000000;
709 				NV_REG32(NV32_NV44_WHAT13) = NV_REG32(NV32_NV10STRAPINFO);
710 
711 				ACCW(NV44_WHAT2, 0x00000000);
712 				ACCW(NV44_WHAT3, 0x00000000);
713 				break;
714 /*			case NV44 type 2:
715 				//fixme if needed: doesn't seem to need the strapinfo thing..
716 				ACCW(NV40P_WHAT0, 0x83280eff);
717 				ACCW(NV40P_WHAT1, 0x000000a0);
718 
719 				ACCW(NV44_WHAT2, 0x00000000);
720 				ACCW(NV44_WHAT3, 0x00000000);
721 				break;
722 */			case NV47:
723 			/* and ID == 0x029x: but no cards defined yet */
724 				ACCW(NV40P_WHAT0, 0x83280eff);
725 				ACCW(NV40P_WHAT1, 0x000000a0);
726 				ACCW(NV40P_WHAT2, 0x07830610);
727 				ACCW(NV40P_WHAT3, 0x0000016a);
728 				break;
729 			default:
730 				ACCW(NV40P_WHAT0, 0x83280eff);
731 				ACCW(NV40P_WHAT1, 0x000000a0);
732 				break;
733 			}
734 
735 			ACCW(NV10_TIL3PT, 0x2ffff800);
736 			ACCW(NV10_TIL3ST, 0x00006000);
737 			ACCW(NV4X_WHAT1, 0x01000000);
738 			/* engine data source DMA instance = $1140 */
739 			ACCW(NV4X_DMA_SRC, 0x00001140);
740 			break;
741 		case NV30A:
742 			/* init some function blocks, but most is unknown.. */
743 			ACCW(DEBUG1, 0x40108700);
744 			ACCW(NV25_WHAT1, 0x00140000);
745 			ACCW(DEBUG3, 0xf00e0431);
746 			ACCW(NV10_DEBUG4, 0x00008000);
747 			ACCW(NV25_WHAT0, 0xf04b1f36);
748 			ACCW(NV20_WHAT3, 0x1002d888);
749 			ACCW(NV25_WHAT2, 0x62ff007f);
750 			break;
751 		case NV20A:
752 			/* init some function blocks, but most is unknown.. */
753 			ACCW(DEBUG1, 0x00118700);
754 			ACCW(DEBUG3, 0xf20e0431);
755 			ACCW(NV10_DEBUG4, 0x00000000);
756 			ACCW(NV20_WHAT1, 0x00000040);
757 			if (si->ps.card_type < NV25)
758 			{
759 				ACCW(NV20_WHAT2, 0x00080000);
760 				ACCW(NV10_DEBUG5, 0x00000005);
761 				ACCW(NV20_WHAT3, 0x45caa208);
762 				ACCW(NV20_WHAT4, 0x24000000);
763 				ACCW(NV20_WHAT5, 0x00000040);
764 
765 				/* copy some fixed RAM(?) configuration info(?) to some indexed registers: */
766 				/* b16-24 is select; b2-13 is adress in 32-bit words */
767 				ACCW(RDI_INDEX, 0x00e00038);
768 				/* data is 32-bit */
769 				ACCW(RDI_DATA, 0x00000030);
770 				/* copy some fixed RAM(?) configuration info(?) to some indexed registers: */
771 				/* b16-24 is select; b2-13 is adress in 32-bit words */
772 				ACCW(RDI_INDEX, 0x00e10038);
773 				/* data is 32-bit */
774 				ACCW(RDI_DATA, 0x00000030);
775 			}
776 			else
777 			{
778 				ACCW(NV25_WHAT1, 0x00080000);
779 				ACCW(NV25_WHAT0, 0x304b1fb6);
780 				ACCW(NV20_WHAT3, 0x18b82880);
781 				ACCW(NV20_WHAT4, 0x44000000);
782 				ACCW(NV20_WHAT5, 0x40000080);
783 				ACCW(NV25_WHAT2, 0x000000ff);
784 			}
785 			break;
786 		}
787 
788 		/* NV20A, NV30A and NV40A: */
789 		/* copy tile setup stuff from previous setup 'source' to acc engine
790 		 * (pattern colorRAM?) */
791 		if ((si->ps.card_type <= NV40) || (si->ps.card_type == NV45))
792 		{
793 			for (cnt = 0; cnt < 32; cnt++)
794 			{
795 				/* copy NV10_FBTIL0AD upto/including NV10_FBTIL7ST */
796 				NV_REG32(NVACC_NV20_WHAT0 + (cnt << 2)) =
797 					NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
798 
799 				/* copy NV10_FBTIL0AD upto/including NV10_FBTIL7ST */
800 				NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
801 					NV_REG32(NVACC_NV10_FBTIL0AD + (cnt << 2));
802 			}
803 		}
804 		else
805 		{
806 			/* NV41, 43, 44, 47 */
807 			if (si->ps.card_type == NV47)
808 			/* or ID == 0x01dx or ID == 0x029x: but no cards defined yet */
809 			{
810 				for (cnt = 0; cnt < 60; cnt++)
811 				{
812 					/* copy NV41_FBTIL0AD upto/including NV47_FBTILEST */
813 					NV_REG32(NVACC_NV41_WHAT0 + (cnt << 2)) =
814 						NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
815 
816 					/* copy NV41_FBTIL0AD upto/including NV47_FBTILEST */
817 					NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
818 						NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
819 				}
820 			}
821 			else
822 			{
823 				/* NV41, 43, 44 */
824 				for (cnt = 0; cnt < 48; cnt++)
825 				{
826 					/* copy NV41_FBTIL0AD upto/including NV41_FBTILBST */
827 					NV_REG32(NVACC_NV20_WHAT0 + (cnt << 2)) =
828 						NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
829 
830 					if (si->ps.card_type != NV44)
831 					{
832 						/* copy NV41_FBTIL0AD upto/including NV41_FBTILBST */
833 						NV_REG32(NVACC_NV20_2_WHAT0 + (cnt << 2)) =
834 							NV_REG32(NVACC_NV41_FBTIL0AD + (cnt << 2));
835 					}
836 				}
837 			}
838 		}
839 
840 		if (si->ps.card_arch >= NV40A)
841 		{
842 			if ((si->ps.card_type == NV40) || (si->ps.card_type == NV45))
843 			{
844 				/* copy some RAM configuration info(?) */
845  				ACCW(NV20_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
846 				ACCW(NV20_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
847 				ACCW(NV40_WHAT_T2, NV_REG32(NV32_PFB_CONFIG_0));
848 				ACCW(NV40_WHAT_T3, NV_REG32(NV32_PFB_CONFIG_1));
849 
850 				/* setup location of active screen in framebuffer */
851 				ACCW(NV20_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
852 				ACCW(NV20_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
853 				/* setup accesible card memory range */
854 				ACCW(NV20_BLIMIT6, (si->ps.memory_size - 1));
855 				ACCW(NV20_BLIMIT7, (si->ps.memory_size - 1));
856 			}
857 			else
858 			{
859 				/* NV41, 43, 44, 47 */
860 
861 				/* copy some RAM configuration info(?) */
862 				if (si->ps.card_type == NV47)
863 				/* or ID == 0x01dx or ID == 0x029x: but no cards defined yet */
864 				{
865 					ACCW(NV47_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
866 					ACCW(NV47_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
867 				}
868 				else
869 				{
870 					/* NV41, 43, 44 */
871 					ACCW(NV40P_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
872 					ACCW(NV40P_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
873 				}
874 				ACCW(NV40P_WHAT_T2, NV_REG32(NV32_PFB_CONFIG_0));
875 				ACCW(NV40P_WHAT_T3, NV_REG32(NV32_PFB_CONFIG_1));
876 
877 				/* setup location of active screen in framebuffer */
878 				ACCW(NV40P_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
879 				ACCW(NV40P_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
880 				/* setup accesible card memory range */
881 				ACCW(NV40P_BLIMIT6, (si->ps.memory_size - 1));
882 				ACCW(NV40P_BLIMIT7, (si->ps.memory_size - 1));
883 			}
884 		}
885 		else /* NV20A and NV30A: */
886 		{
887 			/* copy some RAM configuration info(?) */
888 			ACCW(NV20_WHAT_T0, NV_REG32(NV32_PFB_CONFIG_0));
889 			ACCW(NV20_WHAT_T1, NV_REG32(NV32_PFB_CONFIG_1));
890 			/* copy some RAM configuration info(?) to some indexed registers: */
891 			/* b16-24 is select; b2-13 is adress in 32-bit words */
892 			ACCW(RDI_INDEX, 0x00ea0000);
893 			/* data is 32-bit */
894 			ACCW(RDI_DATA, NV_REG32(NV32_PFB_CONFIG_0));
895 			/* b16-24 is select; b2-13 is adress in 32-bit words */
896 			ACCW(RDI_INDEX, 0x00ea0004);
897 			/* data is 32-bit */
898 			ACCW(RDI_DATA, NV_REG32(NV32_PFB_CONFIG_1));
899 
900 			/* setup location of active screen in framebuffer */
901 			ACCW(NV20_OFFSET0, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
902 			ACCW(NV20_OFFSET1, ((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer));
903 			/* setup accesible card memory range */
904 			ACCW(NV20_BLIMIT6, (si->ps.memory_size - 1));
905 			ACCW(NV20_BLIMIT7, (si->ps.memory_size - 1));
906 		}
907 
908 		/* NV20A, NV30A and NV40A: */
909 		/* setup some acc engine tile stuff */
910 		ACCW(NV10_TIL2AD, 0x00000000);
911 		ACCW(NV10_TIL0ED, 0xffffffff);
912 	}
913 
914 	/* all cards: */
915 	/* setup clipping: rect size is 32768 x 32768, probably max. setting */
916 	/* note:
917 	 * can also be done via the NV_IMAGE_BLACK_RECTANGLE engine command. */
918 	ACCW(ABS_UCLP_XMIN, 0x00000000);
919 	ACCW(ABS_UCLP_YMIN, 0x00000000);
920 	ACCW(ABS_UCLP_XMAX, 0x00007fff);
921 	ACCW(ABS_UCLP_YMAX, 0x00007fff);
922 
923 	/*** PFIFO ***/
924 	/* (setup caches) */
925 	/* disable caches reassign */
926 	ACCW(PF_CACHES, 0x00000000);
927 	/* PFIFO mode: channel 0 is in DMA mode, channels 1 - 32 are in PIO mode */
928 	ACCW(PF_MODE, 0x00000001);
929 	/* cache1 push0 access disabled */
930 	ACCW(PF_CACH1_PSH0, 0x00000000);
931 	/* cache1 pull0 access disabled */
932 	ACCW(PF_CACH1_PUL0, 0x00000000);
933 	/* cache1 push1 mode = DMA */
934 	if (si->ps.card_arch >= NV40A)
935 		ACCW(PF_CACH1_PSH1, 0x00010000);
936 	else
937 		ACCW(PF_CACH1_PSH1, 0x00000100);
938 	/* cache1 DMA Put offset = 0 (b2-28) */
939 	ACCW(PF_CACH1_DMAP, 0x00000000);
940 	/* cache1 DMA Get offset = 0 (b2-28) */
941 	ACCW(PF_CACH1_DMAG, 0x00000000);
942 	/* cache1 DMA instance adress = $114e (b0-15);
943 	 * instance being b4-19 with baseadress NV_PRAMIN_CTX_0 (0x00700000). */
944 	/* note:
945 	 * should point to a DMA definition in CTX register space (which is sort of RAM).
946 	 * This define tells the engine where the DMA cmd buffer is and what it's size is.
947 	 * Inside that cmd buffer you'll find the actual issued engine commands. */
948 	if (si->ps.card_arch >= NV40A)
949 		ACCW(PF_CACH1_DMAI, 0x00001150);
950 	else
951 		ACCW(PF_CACH1_DMAI, 0x0000114d);
952 	/* cache0 push0 access disabled */
953 	ACCW(PF_CACH0_PSH0, 0x00000000);
954 	/* cache0 pull0 access disabled */
955 	ACCW(PF_CACH0_PUL0, 0x00000000);
956 	/* RAM HT (hash table) baseadress = $10000 (b4-8), size = 4k,
957 	 * search = 128 (is byte offset between hash 'sets') */
958 	/* note:
959 	 * so HT base is $00710000, last is $00710fff.
960 	 * In this space you define the engine command handles (HT_HANDL_XX), which
961 	 * in turn points to the defines in CTX register space (which is sort of RAM) */
962 	ACCW(PF_RAMHT, 0x03000100);
963 	/* RAM FC baseadress = $11000 (b3-8) (size is fixed to 0.5k(?)) */
964 	/* note:
965 	 * so FC base is $00711000, last is $007111ff. (not used?) */
966 	ACCW(PF_RAMFC, 0x00000110);
967 	/* RAM RO baseadress = $11200 (b1-8), size = 0.5k */
968 	/* note:
969 	 * so RO base is $00711200, last is $007113ff. (not used?) */
970 	/* note also:
971 	 * This means(?) the PRAMIN CTX registers are accessible from base $00711400. */
972 	ACCW(PF_RAMRO, 0x00000112);
973 	/* PFIFO size: ch0-15 = 512 bytes, ch16-31 = 124 bytes */
974 	ACCW(PF_SIZE, 0x0000ffff);
975 	/* cache1 hash instance = $ffff (b0-15) */
976 	ACCW(PF_CACH1_HASH, 0x0000ffff);
977 	/* disable all PFIFO INTs */
978 	ACCW(PF_INTEN, 0x00000000);
979 	/* reset all PFIFO INT status bits */
980 	ACCW(PF_INTSTAT, 0xffffffff);
981 	/* cache0 pull0 engine = acceleration engine (graphics) */
982 	ACCW(PF_CACH0_PUL1, 0x00000001);
983 	/* cache1 DMA control: disable some stuff */
984 	ACCW(PF_CACH1_DMAC, 0x00000000);
985 	/* cache1 engine 0 upto/including 7 is software (could also be graphics or DVD) */
986 	ACCW(PF_CACH1_ENG, 0x00000000);
987 	/* cache1 DMA fetch: trigger at 128 bytes, size is 32 bytes, max requests is 15,
988 	 * use little endian */
989 	ACCW(PF_CACH1_DMAF, 0x000f0078);
990 	/* cache1 DMA push: b0 = 1: access is enabled */
991 	ACCW(PF_CACH1_DMAS, 0x00000001);
992 	/* cache1 push0 access enabled */
993 	ACCW(PF_CACH1_PSH0, 0x00000001);
994 	/* cache1 pull0 access enabled */
995 	ACCW(PF_CACH1_PUL0, 0x00000001);
996 	/* cache1 pull1 engine = acceleration engine (graphics) */
997 	ACCW(PF_CACH1_PUL1, 0x00000001);
998 	/* enable PFIFO caches reassign */
999 	ACCW(PF_CACHES, 0x00000001);
1000 
1001 	/* setup 3D specifics */
1002 	nv_init_for_3D_dma();
1003 
1004 	/*** init acceleration engine command info ***/
1005 	/* set object handles */
1006 	/* note:
1007 	 * probably depending on some other setup, there are 8 or 32 FIFO channels
1008 	 * available. Assuming the current setup only has 8 channels because the 'rest'
1009 	 * isn't setup here... */
1010 	si->engine.fifo.handle[0] = NV_ROP5_SOLID;
1011 	si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
1012 	si->engine.fifo.handle[2] = NV_IMAGE_PATTERN;
1013 	si->engine.fifo.handle[3] = NV4_SURFACE; /* NV10_CONTEXT_SURFACES_2D is identical */
1014 	si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
1015 	si->engine.fifo.handle[5] = NV4_GDI_RECTANGLE_TEXT;
1016 	si->engine.fifo.handle[6] = NV4_CONTEXT_SURFACES_ARGB_ZS;//NV1_RENDER_SOLID_LIN;
1017 	si->engine.fifo.handle[7] = NV4_DX5_TEXTURE_TRIANGLE;
1018 	/* preset no FIFO channels assigned to cmd's */
1019 	for (cnt = 0; cnt < 0x20; cnt++)
1020 	{
1021 		si->engine.fifo.ch_ptr[cnt] = 0;
1022 	}
1023 	/* set handle's pointers to their assigned FIFO channels */
1024 	/* note:
1025 	 * b0-1 aren't used as adressbits. Using b0 to indicate a valid pointer. */
1026 	for (cnt = 0; cnt < 0x08; cnt++)
1027 	{
1028 		si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
1029 												(0x00000001 + (cnt * 0x00002000));
1030 	}
1031 
1032 	/*** init DMA command buffer info ***/
1033 	if (si->ps.card_arch >= NV40A) //main mem DMA buf on pre-NV40
1034 	{
1035 		si->dma_buffer = (void *)((char *)si->framebuffer +
1036 			((si->ps.memory_size - 1) & 0xffff8000));
1037 	}
1038 	LOG(4,("ACC_DMA: command buffer is at adress $%08x\n",
1039 		((uint32)(si->dma_buffer))));
1040 	/* we have issued no DMA cmd's to the engine yet */
1041 	si->engine.dma.put = 0;
1042 	/* the current first free adress in the DMA buffer is at offset 0 */
1043 	si->engine.dma.current = 0;
1044 	/* the DMA buffer can hold 8k 32-bit words (it's 32kb in size),
1045 	 * or 256k 32-bit words (1Mb in size) dependant on architecture (for now) */
1046 	/* note:
1047 	 * one word is reserved at the end of the DMA buffer to be able to instruct the
1048 	 * engine to do a buffer wrap-around!
1049 	 * (DMA opcode 'noninc method': issue word $20000000.) */
1050 	if (si->ps.card_arch < NV40A)
1051 		si->engine.dma.max = ((1 * 1024 * 1024) >> 2) - 1;
1052 	else
1053 		si->engine.dma.max = 8192 - 1;
1054 	/* note the current free space we have left in the DMA buffer */
1055 	si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;
1056 
1057 	/*** init FIFO via DMA command buffer. ***/
1058 	/* wait for room in fifo for new FIFO assigment cmds if needed: */
1059 	if (si->ps.card_arch >= NV40A)
1060 	{
1061 		if (nv_acc_fifofree_dma(12) != B_OK) return B_ERROR;
1062 	}
1063 	else
1064 	{
1065 		if (nv_acc_fifofree_dma(16) != B_OK) return B_ERROR;
1066 	}
1067 
1068 	/* program new FIFO assignments */
1069 	/* Raster OPeration: */
1070 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
1071 	/* Clip: */
1072 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
1073 	/* Pattern: */
1074 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH2, si->engine.fifo.handle[2]);
1075 	/* 2D Surfaces: */
1076 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
1077 	/* Blit: */
1078 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
1079 	/* Bitmap: */
1080 	nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH5, si->engine.fifo.handle[5]);
1081 	if (si->ps.card_arch < NV40A)
1082 	{
1083 		/* 3D surfaces: (3D related only) */
1084 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH6, si->engine.fifo.handle[6]);
1085 		/* Textured Triangle: (3D only) */
1086 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH7, si->engine.fifo.handle[7]);
1087 	}
1088 
1089 	/*** Set pixel width ***/
1090 	switch(si->dm.space)
1091 	{
1092 	case B_CMAP8:
1093 		surf_depth = 0x00000001;
1094 		cmd_depth = 0x00000003;
1095 		break;
1096 	case B_RGB15_LITTLE:
1097 	case B_RGB16_LITTLE:
1098 		surf_depth = 0x00000004;
1099 		cmd_depth = 0x00000001;
1100 		break;
1101 	case B_RGB32_LITTLE:
1102 	case B_RGBA32_LITTLE:
1103 		surf_depth = 0x00000006;
1104 		cmd_depth = 0x00000003;
1105 		break;
1106 	default:
1107 		LOG(8,("ACC_DMA: init, invalid bit depth\n"));
1108 		return B_ERROR;
1109 	}
1110 
1111 	/* wait for room in fifo for surface setup cmd if needed */
1112 	if (nv_acc_fifofree_dma(5) != B_OK) return B_ERROR;
1113 	/* now setup 2D surface (writing 5 32bit words) */
1114 	nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 4);
1115 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = surf_depth; /* Format */
1116 	/* setup screen pitch */
1117 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1118 		((si->fbc.bytes_per_row & 0x0000ffff) | (si->fbc.bytes_per_row << 16)); /* Pitch */
1119 	/* setup screen location */
1120 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1121 		((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer); /* OffsetSource */
1122 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1123 		((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer); /* OffsetDest */
1124 
1125 	/* wait for room in fifo for pattern colordepth setup cmd if needed */
1126 	if (nv_acc_fifofree_dma(2) != B_OK) return B_ERROR;
1127 	/* set pattern colordepth (writing 2 32bit words) */
1128 	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETCOLORFORMAT, 1);
1129 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1130 
1131 	/* wait for room in fifo for bitmap colordepth setup cmd if needed */
1132 	if (nv_acc_fifofree_dma(2) != B_OK) return B_ERROR;
1133 	/* set bitmap colordepth (writing 2 32bit words) */
1134 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_SETCOLORFORMAT, 1);
1135 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1136 
1137 	/* Load our pattern into the engine: */
1138 	/* wait for room in fifo for pattern cmd if needed. */
1139 	if (nv_acc_fifofree_dma(7) != B_OK) return B_ERROR;
1140 	/* now setup pattern (writing 7 32bit words) */
1141 	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETSHAPE, 1);
1142 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* SetShape: 0 = 8x8, 1 = 64x1, 2 = 1x64 */
1143 	nv_acc_cmd_dma(NV_IMAGE_PATTERN, NV_IMAGE_PATTERN_SETCOLOR0, 4);
1144 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetColor0 */
1145 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetColor1 */
1146 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetPattern[0] */
1147 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xffffffff; /* SetPattern[1] */
1148 
1149 	/* tell the engine to fetch and execute all (new) commands in the DMA buffer */
1150 	nv_start_dma();
1151 
1152 	return B_OK;
1153 }
1154 
1155 static void nv_init_for_3D_dma(void)
1156 {
1157 	/* setup PGRAPH unknown registers and modify (pre-cleared) pipe stuff for 3D use */
1158 	if (si->ps.card_arch >= NV10A)
1159 	{
1160 		/* setup unknown PGRAPH stuff */
1161 		ACCW(PGWHAT_00, 0x00000000);
1162 		ACCW(PGWHAT_01, 0x00000000);
1163 		ACCW(PGWHAT_02, 0x00000000);
1164 		ACCW(PGWHAT_03, 0x00000000);
1165 
1166 		ACCW(PGWHAT_04, 0x00001000);
1167 		ACCW(PGWHAT_05, 0x00001000);
1168 		ACCW(PGWHAT_06, 0x4003ff80);
1169 
1170 		ACCW(PGWHAT_07, 0x00000000);
1171 		ACCW(PGWHAT_08, 0x00000000);
1172 		ACCW(PGWHAT_09, 0x00000000);
1173 		ACCW(PGWHAT_0A, 0x00000000);
1174 		ACCW(PGWHAT_0B, 0x00000000);
1175 
1176 		ACCW(PGWHAT_0C, 0x00080008);
1177 		ACCW(PGWHAT_0D, 0x00080008);
1178 
1179 		ACCW(PGWHAT_0E, 0x00000000);
1180 		ACCW(PGWHAT_0F, 0x00000000);
1181 		ACCW(PGWHAT_10, 0x00000000);
1182 		ACCW(PGWHAT_11, 0x00000000);
1183 		ACCW(PGWHAT_12, 0x00000000);
1184 		ACCW(PGWHAT_13, 0x00000000);
1185 		ACCW(PGWHAT_14, 0x00000000);
1186 		ACCW(PGWHAT_15, 0x00000000);
1187 		ACCW(PGWHAT_16, 0x00000000);
1188 		ACCW(PGWHAT_17, 0x00000000);
1189 		ACCW(PGWHAT_18, 0x00000000);
1190 
1191 		ACCW(PGWHAT_19, 0x10000000);
1192 
1193 		ACCW(PGWHAT_1A, 0x00000000);
1194 		ACCW(PGWHAT_1B, 0x00000000);
1195 		ACCW(PGWHAT_1C, 0x00000000);
1196 		ACCW(PGWHAT_1D, 0x00000000);
1197 		ACCW(PGWHAT_1E, 0x00000000);
1198 		ACCW(PGWHAT_1F, 0x00000000);
1199 		ACCW(PGWHAT_20, 0x00000000);
1200 		ACCW(PGWHAT_21, 0x00000000);
1201 
1202 		ACCW(PGWHAT_22, 0x08000000);
1203 
1204 		ACCW(PGWHAT_23, 0x00000000);
1205 		ACCW(PGWHAT_24, 0x00000000);
1206 		ACCW(PGWHAT_25, 0x00000000);
1207 		ACCW(PGWHAT_26, 0x00000000);
1208 
1209 		ACCW(PGWHAT_27, 0x4b7fffff);
1210 
1211 		ACCW(PGWHAT_28, 0x00000000);
1212 		ACCW(PGWHAT_29, 0x00000000);
1213 		ACCW(PGWHAT_2A, 0x00000000);
1214 
1215 		/* setup window clipping */
1216 		/* b0-11 = min; b16-27 = max.
1217 		 * note:
1218 		 * probably two's complement values, so setting to max range here:
1219 		 * which would be -2048 upto/including +2047. */
1220 		/* horizontal */
1221 		ACCW(WINCLIP_H_0, 0x07ff0800);
1222 		ACCW(WINCLIP_H_1, 0x07ff0800);
1223 		ACCW(WINCLIP_H_2, 0x07ff0800);
1224 		ACCW(WINCLIP_H_3, 0x07ff0800);
1225 		ACCW(WINCLIP_H_4, 0x07ff0800);
1226 		ACCW(WINCLIP_H_5, 0x07ff0800);
1227 		ACCW(WINCLIP_H_6, 0x07ff0800);
1228 		ACCW(WINCLIP_H_7, 0x07ff0800);
1229 		/* vertical */
1230 		ACCW(WINCLIP_V_0, 0x07ff0800);
1231 		ACCW(WINCLIP_V_1, 0x07ff0800);
1232 		ACCW(WINCLIP_V_2, 0x07ff0800);
1233 		ACCW(WINCLIP_V_3, 0x07ff0800);
1234 		ACCW(WINCLIP_V_4, 0x07ff0800);
1235 		ACCW(WINCLIP_V_5, 0x07ff0800);
1236 		ACCW(WINCLIP_V_6, 0x07ff0800);
1237 		ACCW(WINCLIP_V_7, 0x07ff0800);
1238 
1239 		/* setup (initialize) pipe:
1240 		 * needed to get valid 3D rendering on (at least) NV1x cards. Without this
1241 		 * those cards produce rubbish instead of 3D, although the engine itself keeps
1242 		 * running and 2D stays OK. */
1243 
1244 		/* set eyetype to local, lightning etc. is off */
1245 		ACCW(NV10_XFMOD0, 0x10000000);
1246 		/* disable all lights */
1247 		ACCW(NV10_XFMOD1, 0x00000000);
1248 
1249 		/* note: upon writing data into the PIPEDAT register, the PIPEADR is
1250 		 * probably auto-incremented! */
1251 		/* (pipe adress = b2-16, pipe data = b0-31) */
1252 		/* note: pipe adresses IGRAPH registers? */
1253 		ACCW(NV10_PIPEADR, 0x00006740);
1254 		ACCW(NV10_PIPEDAT, 0x00000000);
1255 		ACCW(NV10_PIPEDAT, 0x00000000);
1256 		ACCW(NV10_PIPEDAT, 0x00000000);
1257 		ACCW(NV10_PIPEDAT, 0x3f800000);
1258 
1259 		ACCW(NV10_PIPEADR, 0x00006750);
1260 		ACCW(NV10_PIPEDAT, 0x40000000);
1261 		ACCW(NV10_PIPEDAT, 0x40000000);
1262 		ACCW(NV10_PIPEDAT, 0x40000000);
1263 		ACCW(NV10_PIPEDAT, 0x40000000);
1264 
1265 		ACCW(NV10_PIPEADR, 0x00006760);
1266 		ACCW(NV10_PIPEDAT, 0x00000000);
1267 		ACCW(NV10_PIPEDAT, 0x00000000);
1268 		ACCW(NV10_PIPEDAT, 0x3f800000);
1269 		ACCW(NV10_PIPEDAT, 0x00000000);
1270 
1271 		ACCW(NV10_PIPEADR, 0x00006770);
1272 		ACCW(NV10_PIPEDAT, 0xc5000000);
1273 		ACCW(NV10_PIPEDAT, 0xc5000000);
1274 		ACCW(NV10_PIPEDAT, 0x00000000);
1275 		ACCW(NV10_PIPEDAT, 0x00000000);
1276 
1277 		ACCW(NV10_PIPEADR, 0x00006780);
1278 		ACCW(NV10_PIPEDAT, 0x00000000);
1279 		ACCW(NV10_PIPEDAT, 0x00000000);
1280 		ACCW(NV10_PIPEDAT, 0x3f800000);
1281 		ACCW(NV10_PIPEDAT, 0x00000000);
1282 
1283 		ACCW(NV10_PIPEADR, 0x000067a0);
1284 		ACCW(NV10_PIPEDAT, 0x3f800000);
1285 		ACCW(NV10_PIPEDAT, 0x3f800000);
1286 		ACCW(NV10_PIPEDAT, 0x3f800000);
1287 		ACCW(NV10_PIPEDAT, 0x3f800000);
1288 
1289 		ACCW(NV10_PIPEADR, 0x00006ab0);
1290 		ACCW(NV10_PIPEDAT, 0x3f800000);
1291 		ACCW(NV10_PIPEDAT, 0x3f800000);
1292 		ACCW(NV10_PIPEDAT, 0x3f800000);
1293 
1294 		ACCW(NV10_PIPEADR, 0x00006ac0);
1295 		ACCW(NV10_PIPEDAT, 0x00000000);
1296 		ACCW(NV10_PIPEDAT, 0x00000000);
1297 		ACCW(NV10_PIPEDAT, 0x00000000);
1298 
1299 		ACCW(NV10_PIPEADR, 0x00006c10);
1300 		ACCW(NV10_PIPEDAT, 0xbf800000);
1301 
1302 		ACCW(NV10_PIPEADR, 0x00007030);
1303 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1304 
1305 		ACCW(NV10_PIPEADR, 0x00007040);
1306 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1307 
1308 		ACCW(NV10_PIPEADR, 0x00007050);
1309 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1310 
1311 		ACCW(NV10_PIPEADR, 0x00007060);
1312 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1313 
1314 		ACCW(NV10_PIPEADR, 0x00007070);
1315 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1316 
1317 		ACCW(NV10_PIPEADR, 0x00007080);
1318 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1319 
1320 		ACCW(NV10_PIPEADR, 0x00007090);
1321 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1322 
1323 		ACCW(NV10_PIPEADR, 0x000070a0);
1324 		ACCW(NV10_PIPEDAT, 0x7149f2ca);
1325 
1326 		ACCW(NV10_PIPEADR, 0x00006a80);
1327 		ACCW(NV10_PIPEDAT, 0x00000000);
1328 		ACCW(NV10_PIPEDAT, 0x00000000);
1329 		ACCW(NV10_PIPEDAT, 0x3f800000);
1330 
1331 		ACCW(NV10_PIPEADR, 0x00006aa0);
1332 		ACCW(NV10_PIPEDAT, 0x00000000);
1333 		ACCW(NV10_PIPEDAT, 0x00000000);
1334 		ACCW(NV10_PIPEDAT, 0x00000000);
1335 
1336 		ACCW(NV10_PIPEADR, 0x00000040);
1337 		ACCW(NV10_PIPEDAT, 0x00000005);
1338 
1339 		ACCW(NV10_PIPEADR, 0x00006400);
1340 		ACCW(NV10_PIPEDAT, 0x3f800000);
1341 		ACCW(NV10_PIPEDAT, 0x3f800000);
1342 		ACCW(NV10_PIPEDAT, 0x4b7fffff);
1343 		ACCW(NV10_PIPEDAT, 0x00000000);
1344 
1345 		ACCW(NV10_PIPEADR, 0x00006410);
1346 		ACCW(NV10_PIPEDAT, 0xc5000000);
1347 		ACCW(NV10_PIPEDAT, 0xc5000000);
1348 		ACCW(NV10_PIPEDAT, 0x00000000);
1349 		ACCW(NV10_PIPEDAT, 0x00000000);
1350 
1351 		ACCW(NV10_PIPEADR, 0x00006420);
1352 		ACCW(NV10_PIPEDAT, 0x00000000);
1353 		ACCW(NV10_PIPEDAT, 0x00000000);
1354 		ACCW(NV10_PIPEDAT, 0x00000000);
1355 		ACCW(NV10_PIPEDAT, 0x00000000);
1356 
1357 		ACCW(NV10_PIPEADR, 0x00006430);
1358 		ACCW(NV10_PIPEDAT, 0x00000000);
1359 		ACCW(NV10_PIPEDAT, 0x00000000);
1360 		ACCW(NV10_PIPEDAT, 0x00000000);
1361 		ACCW(NV10_PIPEDAT, 0x00000000);
1362 
1363 		ACCW(NV10_PIPEADR, 0x000064c0);
1364 		ACCW(NV10_PIPEDAT, 0x3f800000);
1365 		ACCW(NV10_PIPEDAT, 0x3f800000);
1366 		ACCW(NV10_PIPEDAT, 0x477fffff);
1367 		ACCW(NV10_PIPEDAT, 0x3f800000);
1368 
1369 		ACCW(NV10_PIPEADR, 0x000064d0);
1370 		ACCW(NV10_PIPEDAT, 0xc5000000);
1371 		ACCW(NV10_PIPEDAT, 0xc5000000);
1372 		ACCW(NV10_PIPEDAT, 0x00000000);
1373 		ACCW(NV10_PIPEDAT, 0x00000000);
1374 
1375 		ACCW(NV10_PIPEADR, 0x000064e0);
1376 		ACCW(NV10_PIPEDAT, 0xc4fff000);
1377 		ACCW(NV10_PIPEDAT, 0xc4fff000);
1378 		ACCW(NV10_PIPEDAT, 0x00000000);
1379 		ACCW(NV10_PIPEDAT, 0x00000000);
1380 
1381 		ACCW(NV10_PIPEADR, 0x000064f0);
1382 		ACCW(NV10_PIPEDAT, 0x00000000);
1383 		ACCW(NV10_PIPEDAT, 0x00000000);
1384 		ACCW(NV10_PIPEDAT, 0x00000000);
1385 		ACCW(NV10_PIPEDAT, 0x00000000);
1386 
1387 		/* turn lightning on */
1388 		ACCW(NV10_XFMOD0, 0x30000000);
1389 		/* set light 1 to infinite type, other lights remain off */
1390 		ACCW(NV10_XFMOD1, 0x00000004);
1391 
1392 		/* Z-buffer state is:
1393 		 * initialized, set to: 'fixed point' (integer?); Z-buffer; 16bits depth */
1394 		/* note:
1395 		 * other options possible are: floating point; 24bits depth; W-buffer */
1396 		ACCW(GLOB_STAT_0, 0x10000000);
1397 		/* set DMA instance 2 and 3 to be invalid */
1398 		ACCW(GLOB_STAT_1, 0x00000000);
1399 	}
1400 }
1401 
1402 static void nv_start_dma(void)
1403 {
1404 	uint32 dummy;
1405 
1406 	if (si->engine.dma.current != si->engine.dma.put)
1407 	{
1408 		si->engine.dma.put = si->engine.dma.current;
1409 		/* flush used caches so we know for sure the DMA cmd buffer received all data. */
1410 		if (si->ps.card_arch < NV40A)
1411 		{
1412 			/* some CPU's support out-of-order processing (WinChip/Cyrix). Flush them. */
1413 			__asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
1414 			/* read a non-cached adress to flush the cash */
1415 			dummy = ACCR(STATUS);
1416 		}
1417 		else
1418 		{
1419 			/* dummy read the first adress of the framebuffer to flush MTRR-WC buffers */
1420 			dummy = *((volatile uint32 *)(si->framebuffer));
1421 		}
1422 
1423 		/* actually start DMA to execute all commands now in buffer */
1424 		/* note:
1425 		 * it doesn't matter which FIFO channel's DMA registers we access, they are in
1426 		 * fact all the same set. It also doesn't matter if the channel was assigned a
1427 		 * command or not. */
1428 		/* note also:
1429 		 * NV_GENERAL_DMAPUT is a write-only register on some cards (confirmed NV11). */
1430 		NV_REG32(NVACC_FIFO + NV_GENERAL_DMAPUT) = (si->engine.dma.put << 2);
1431 	}
1432 }
1433 
1434 /* this routine does not check the engine's internal hardware FIFO, but the DMA
1435  * command buffer. You can see this as a FIFO as well, that feeds the hardware FIFO.
1436  * The hardware FIFO state is checked by the DMA hardware automatically. */
1437 static status_t nv_acc_fifofree_dma(uint16 cmd_size)
1438 {
1439 	uint32 dmaget;
1440 
1441 	/* we'd better check for timeouts on the DMA engine as it's theoretically
1442 	 * breakable by malfunctioning software */
1443 	uint16 cnt = 0;
1444 
1445 	/* check if the DMA buffer has enough room for the command.
1446 	 * note:
1447 	 * engine.dma.free is 'cached' */
1448 	while ((si->engine.dma.free < cmd_size) && (cnt < 10000) && (err < 3))
1449 	{
1450 		/* see where the engine is currently fetching from the buffer */
1451 		/* note:
1452 		 * read this only once in the code as accessing registers is relatively slow */
1453 		/* note also:
1454 		 * it doesn't matter which FIFO channel's DMA registers we access, they are in
1455 		 * fact all the same set. It also doesn't matter if the channel was assigned a
1456 		 * command or not. */
1457 		dmaget = ((NV_REG32(NVACC_FIFO + NV_GENERAL_DMAGET)) >> 2);
1458 
1459 		/* update timeout counter: on NV11 on a Pentium4 2.8Ghz max reached count
1460 		 * using BeRoMeter 1.2.6 was about 600; so counting 10000 before generating
1461 		 * a timeout should definately do it. Snooze()-ing cannot be done without a
1462 		 * serious speed penalty, even if done for only 1 microSecond. */
1463 		cnt++;
1464 
1465 		/* where's the engine fetching viewed from us issuing? */
1466 		if (si->engine.dma.put >= dmaget)
1467 		{
1468 			/* engine is fetching 'behind us', the last piece of the buffer is free */
1469 
1470 			/* note the 'updated' free space we have in the DMA buffer */
1471 			si->engine.dma.free = si->engine.dma.max - si->engine.dma.current;
1472 			/* if it's enough after all we exit this routine immediately. Else: */
1473 			if (si->engine.dma.free < cmd_size)
1474 			{
1475 				/* not enough room left, so instruct DMA engine to reset the buffer
1476 				 * when it's reaching the end of it */
1477 				((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x20000000;
1478 				/* reset our buffer pointer, so new commands will be placed at the
1479 				 * beginning of the buffer. */
1480 				si->engine.dma.current = 0;
1481 				/* tell the engine to fetch the remaining command(s) in the DMA buffer
1482 				 * that where not executed before. */
1483 				nv_start_dma();
1484 
1485 				/* NOW the engine is fetching 'in front of us', so the first piece
1486 				 * of the buffer is free */
1487 
1488 				/* note the updated current free space we have in the DMA buffer */
1489 				si->engine.dma.free = dmaget - si->engine.dma.current;
1490 				/* mind this pittfall:
1491 				 * Leave some room between where the engine is fetching and where we
1492 				 * put new commands. Otherwise the engine will crash on heavy loads.
1493 				 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
1494 				 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
1495 				 * Note:
1496 				 * The engine is DMA triggered for fetching chunks every 128 bytes,
1497 				 * maybe this is the reason for this behaviour.
1498 				 * Note also:
1499 				 * it looks like the space that needs to be kept free is coupled
1500 				 * with the size of the DMA buffer. */
1501 				if (si->engine.dma.free < 256)
1502 					si->engine.dma.free = 0;
1503 				else
1504 					si->engine.dma.free -= 256;
1505 			}
1506 		}
1507 		else
1508 		{
1509 			/* engine is fetching 'in front of us', so the first piece of the buffer
1510 			 * is free */
1511 
1512 			/* note the updated current free space we have in the DMA buffer */
1513 			si->engine.dma.free = dmaget - si->engine.dma.current;
1514 			/* mind this pittfall:
1515 			 * Leave some room between where the engine is fetching and where we
1516 			 * put new commands. Otherwise the engine will crash on heavy loads.
1517 			 * A crash can be forced best in 640x480x32 mode with BeRoMeter 1.2.6.
1518 			 * (confirmed on NV11 and NV43 with less than 256 words forced freespace.)
1519 			 * Note:
1520 			 * The engine is DMA triggered for fetching chunks every 128 bytes,
1521 			 * maybe this is the reason for this behaviour.
1522 			 * Note also:
1523 			 * it looks like the space that needs to be kept free is coupled
1524 			 * with the size of the DMA buffer. */
1525 			if (si->engine.dma.free < 256)
1526 				si->engine.dma.free = 0;
1527 			else
1528 				si->engine.dma.free -= 256;
1529 		}
1530 	}
1531 
1532 	/* log timeout if we had one */
1533 	if (cnt == 10000)
1534 	{
1535 		if (err < 3) err++;
1536 		LOG(4,("ACC_DMA: fifofree; DMA timeout #%d, engine trouble!\n", err));
1537 	}
1538 
1539 	/* we must make the acceleration routines abort or the driver will hang! */
1540 	if (err >= 3) return B_ERROR;
1541 
1542 	return B_OK;
1543 }
1544 
1545 static void nv_acc_cmd_dma(uint32 cmd, uint16 offset, uint16 size)
1546 {
1547 	/* NV_FIFO_DMA_OPCODE: set number of cmd words (b18 - 28); set FIFO offset for
1548 	 * first cmd word (b2 - 15); set DMA opcode = method (b29 - 31).
1549 	 * a 'NOP' is the opcode word $00000000. */
1550 	/* note:
1551 	 * possible DMA opcodes:
1552 	 * b'000' is 'method' (execute cmd);
1553 	 * b'001' is 'jump';
1554 	 * b'002' is 'noninc method' (execute buffer wrap-around);
1555 	 * b'003' is 'call': return is executed by opcode word $00020000 (b17 = 1). */
1556 	/* note also:
1557 	 * this system uses auto-increments for the FIFO offset adresses. Make sure
1558 	 * to set a new adress if a gap exists between the previous one and the new one. */
1559 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = ((size << 18) |
1560 		((si->engine.fifo.ch_ptr[cmd] + offset) & 0x0000fffc));
1561 
1562 	/* space left after issuing the current command is the cmd AND it's arguments less */
1563 	si->engine.dma.free -= (size + 1);
1564 }
1565 
1566 static void nv_acc_set_ch_dma(uint16 ch, uint32 handle)
1567 {
1568 	/* issue FIFO channel assign cmd */
1569 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = ((1 << 18) | ch);
1570 	/* set new assignment */
1571 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = (0x80000000 | handle);
1572 
1573 	/* space left after issuing the current command is the cmd AND it's arguments less */
1574 	si->engine.dma.free -= 2;
1575 }
1576 
/* note:
 * switching fifo channel assignments this way has no noticeable slowdown:
 * measured 0.2% with Quake2. */
1580 void nv_acc_assert_fifo_dma(void)
1581 {
1582 	/* does every engine cmd this accelerant needs have a FIFO channel? */
1583 	//fixme: can probably be optimized for both speed and channel selection...
1584 	if (!si->engine.fifo.ch_ptr[NV_ROP5_SOLID] ||
1585 		!si->engine.fifo.ch_ptr[NV_IMAGE_BLACK_RECTANGLE] ||
1586 		!si->engine.fifo.ch_ptr[NV_IMAGE_PATTERN] ||
1587 		!si->engine.fifo.ch_ptr[NV4_SURFACE] ||
1588 		!si->engine.fifo.ch_ptr[NV_IMAGE_BLIT] ||
1589 		!si->engine.fifo.ch_ptr[NV4_GDI_RECTANGLE_TEXT] ||
1590 		!si->engine.fifo.ch_ptr[NV_SCALED_IMAGE_FROM_MEMORY])
1591 	{
1592 		uint16 cnt;
1593 
1594 		/* free the FIFO channels we want from the currently assigned cmd's */
1595 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[0]] = 0;
1596 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[1]] = 0;
1597 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[2]] = 0;
1598 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[3]] = 0;
1599 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[4]] = 0;
1600 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[5]] = 0;
1601 		si->engine.fifo.ch_ptr[si->engine.fifo.handle[6]] = 0;
1602 
1603 		/* set new object handles */
1604 		si->engine.fifo.handle[0] = NV_ROP5_SOLID;
1605 		si->engine.fifo.handle[1] = NV_IMAGE_BLACK_RECTANGLE;
1606 		si->engine.fifo.handle[2] = NV_IMAGE_PATTERN;
1607 		si->engine.fifo.handle[3] = NV4_SURFACE;
1608 		si->engine.fifo.handle[4] = NV_IMAGE_BLIT;
1609 		si->engine.fifo.handle[5] = NV4_GDI_RECTANGLE_TEXT;
1610 		si->engine.fifo.handle[6] = NV_SCALED_IMAGE_FROM_MEMORY;
1611 
1612 		/* set handle's pointers to their assigned FIFO channels */
1613 		/* note:
1614 		 * b0-1 aren't used as adressbits. Using b0 to indicate a valid pointer. */
1615 		for (cnt = 0; cnt < 0x08; cnt++)
1616 		{
1617 			si->engine.fifo.ch_ptr[(si->engine.fifo.handle[cnt])] =
1618 				(0x00000001 + (cnt * 0x00002000));
1619 		}
1620 
1621 		/* wait for room in fifo for new FIFO assigment cmds if needed. */
1622 		if (nv_acc_fifofree_dma(14) != B_OK) return;
1623 
1624 		/* program new FIFO assignments */
1625 		/* Raster OPeration: */
1626 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH0, si->engine.fifo.handle[0]);
1627 		/* Clip: */
1628 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH1, si->engine.fifo.handle[1]);
1629 		/* Pattern: */
1630 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH2, si->engine.fifo.handle[2]);
1631 		/* 2D Surface: */
1632 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH3, si->engine.fifo.handle[3]);
1633 		/* Blit: */
1634 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH4, si->engine.fifo.handle[4]);
1635 		/* Bitmap: */
1636 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH5, si->engine.fifo.handle[5]);
1637 		/* Scaled and fitered Blit: */
1638 		nv_acc_set_ch_dma(NV_GENERAL_FIFO_CH6, si->engine.fifo.handle[6]);
1639 
1640 		/* tell the engine to fetch and execute all (new) commands in the DMA buffer */
1641 		nv_start_dma();
1642 	}
1643 }
1644 
/*
	note:
	moved acceleration 'top-level' routines to be integrated in the engine:
	it is costly to call the engine for every single function within a loop!
	(measured with BeRoMeter 1.2.6: up to 15% speed increase on all CPUs.)

	note also:
	splitting up each command list into sublists (see routines below) prevents
	a lot more nested calls, further increasing the speed by up to 70%.

	finally:
	sending the sublist to just one single engine command increases speed by
	up to another 10%. This can't be done for blits though, as this engine
	command's hardware does not support multiple objects.
*/
1660 
1661 /* screen to screen blit - i.e. move windows around and scroll within them. */
1662 void SCREEN_TO_SCREEN_BLIT_DMA(engine_token *et, blit_params *list, uint32 count)
1663 {
1664 	uint32 i = 0;
1665 	uint16 subcnt;
1666 
1667 	/*** init acc engine for blit function ***/
1668 	/* ROP registers (Raster OPeration):
1669 	 * wait for room in fifo for ROP cmd if needed. */
1670 	if (nv_acc_fifofree_dma(2) != B_OK) return;
1671 	/* now setup ROP (writing 2 32bit words) for GXcopy */
1672 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
1673 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
1674 
1675 	/*** do each blit ***/
1676 	/* Note:
1677 	 * blit-copy direction is determined inside nvidia hardware: no setup needed */
1678 	while (count)
1679 	{
1680 		/* break up the list in sublists to minimize calls, while making sure long
1681 		 * lists still get executed without trouble */
1682 		subcnt = 32;
1683 		if (count < 32) subcnt = count;
1684 		count -= subcnt;
1685 
1686 		/* wait for room in fifo for blit cmd if needed. */
1687 		if (nv_acc_fifofree_dma(4 * subcnt) != B_OK) return;
1688 
1689 		while (subcnt--)
1690 		{
1691 			/* now setup blit (writing 4 32bit words) */
1692 			nv_acc_cmd_dma(NV_IMAGE_BLIT, NV_IMAGE_BLIT_SOURCEORG, 3);
1693 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1694 				(((list[i].src_top) << 16) | (list[i].src_left)); /* SourceOrg */
1695 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1696 				(((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */
1697 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1698 				((((list[i].height) + 1) << 16) | ((list[i].width) + 1)); /* HeightWidth */
1699 
1700 			i++;
1701 		}
1702 
1703 		/* tell the engine to fetch the commands in the DMA buffer that where not
1704 		 * executed before. */
1705 		nv_start_dma();
1706 	}
1707 
1708 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1709 	si->engine.threeD.reload = 0xffffffff;
1710 }
1711 
1712 /* scaled and filtered screen to screen blit - i.e. video playback without overlay */
1713 /* note: source and destination may not overlap. */
1714 //fixme? checkout NV5 and NV10 version of cmd: faster?? (or is 0x77 a 'autoselect' version?)
1715 void SCREEN_TO_SCREEN_SCALED_FILTERED_BLIT_DMA(engine_token *et, scaled_blit_params *list, uint32 count)
1716 {
1717 	uint32 i = 0;
1718 	uint16 subcnt;
1719 	uint32 cmd_depth;
1720 	uint8 bpp;
1721 
1722 	/*** init acc engine for scaled filtered blit function ***/
1723 	/* Set pixel width */
1724 	switch(si->dm.space)
1725 	{
1726 	case B_RGB15_LITTLE:
1727 		cmd_depth = 0x00000002;
1728 		bpp = 2;
1729 		break;
1730 	case B_RGB16_LITTLE:
1731 		cmd_depth = 0x00000007;
1732 		bpp = 2;
1733 		break;
1734 	case B_RGB32_LITTLE:
1735 	case B_RGBA32_LITTLE:
1736 		cmd_depth = 0x00000004;
1737 		bpp = 4;
1738 		break;
1739 	/* fixme sometime:
1740 	 * we could do the spaces below if this function would be modified to be able
1741 	 * to use a source outside of the desktop, i.e. using offscreen bitmaps... */
1742 	case B_YCbCr422:
1743 		cmd_depth = 0x00000005;
1744 		bpp = 2;
1745 		break;
1746 	case B_YUV422:
1747 		cmd_depth = 0x00000006;
1748 		bpp = 2;
1749 		break;
1750 	default:
1751 		/* note: this function does not support src or dest in the B_CMAP8 space! */
1752 		//fixme: the NV10 version of this cmd supports B_CMAP8 src though... (checkout)
1753 		LOG(8,("ACC_DMA: scaled_filtered_blit, invalid bit depth\n"));
1754 		return;
1755 	}
1756 
1757 	/* modify surface depth settings for 15-bit colorspace so command works as intended */
1758 	if (si->dm.space == B_RGB15_LITTLE)
1759 	{
1760 		/* wait for room in fifo for surface setup cmd if needed */
1761 		if (nv_acc_fifofree_dma(2) != B_OK) return;
1762 		/* now setup 2D surface (writing 1 32bit word) */
1763 		nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
1764 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000002; /* Format */
1765 	}
1766 
1767 	/* TNT1 has fixed operation mode 'SRCcopy' while the rest can be programmed: */
1768 	if (si->ps.card_type != NV04)
1769 	{
1770 		/* wait for room in fifo for cmds if needed. */
1771 		if (nv_acc_fifofree_dma(5) != B_OK) return;
1772 		/* now setup source bitmap colorspace */
1773 		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 2);
1774 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1775 		/* now setup operation mode to SRCcopy */
1776 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000003; /* SetOperation */
1777 	}
1778 	else
1779 	{
1780 		/* wait for room in fifo for cmd if needed. */
1781 		if (nv_acc_fifofree_dma(4) != B_OK) return;
1782 		/* now setup source bitmap colorspace */
1783 		nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SETCOLORFORMAT, 1);
1784 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = cmd_depth; /* SetColorFormat */
1785 		/* TNT1 has fixed operation mode SRCcopy */
1786 	}
1787 	/* now setup fill color (writing 2 32bit words) */
1788 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1789 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */
1790 
1791 	/*** do each blit ***/
1792 	while (count)
1793 	{
1794 		/* break up the list in sublists to minimize calls, while making sure long
1795 		 * lists still get executed without trouble */
1796 		subcnt = 16;
1797 		if (count < 16) subcnt = count;
1798 		count -= subcnt;
1799 
1800 		/* wait for room in fifo for blit cmd if needed. */
1801 		if (nv_acc_fifofree_dma(12 * subcnt) != B_OK) return;
1802 
1803 		while (subcnt--)
1804 		{
1805 			/* now setup blit (writing 12 32bit words) */
1806 			nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCEORG, 6);
1807 			/* setup dest clipping ref for blit (not used) (b0-15 = left, b16-31 = top) */
1808 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0; /* SourceOrg */
1809 			/* setup dest clipping size for blit */
1810 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1811 				(((list[i].dest_height + 1) << 16) | (list[i].dest_width + 1)); /* SourceHeightWidth */
1812 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1813 			/* setup destination location and size for blit */
1814 				(((list[i].dest_top) << 16) | (list[i].dest_left)); /* DestOrg */
1815 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1816 				(((list[i].dest_height + 1) << 16) | (list[i].dest_width + 1)); /* DestHeightWidth */
1817 			//fixme: findout scaling limits... (although the current cmd interface doesn't support them.)
1818 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1819 				(((list[i].src_width + 1) << 20) / (list[i].dest_width + 1)); /* HorInvScale (in 12.20 format) */
1820 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1821 				(((list[i].src_height + 1) << 20) / (list[i].dest_height + 1)); /* VerInvScale (in 12.20 format) */
1822 
1823 			nv_acc_cmd_dma(NV_SCALED_IMAGE_FROM_MEMORY, NV_SCALED_IMAGE_FROM_MEMORY_SOURCESIZE, 4);
1824 			/* setup horizontal and vertical source (fetching) ends.
1825 			 * note:
1826 			 * horizontal granularity is 2 pixels, vertical granularity is 1 pixel.
1827 			 * look at Matrox or Neomagic bes engines code for usage example. */
1828 			//fixme: tested 15, 16 and 32-bit RGB depth, verify other depths...
1829 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1830 				(((list[i].src_height + 1) << 16) |
1831 				 (((list[i].src_width + 1) + 0x0001) & ~0x0001)); /* SourceHeightWidth */
1832 			/* setup source pitch (b0-15). Set 'format origin center' (b16-17) and
1833 			 * select 'format interpolator foh (bilinear filtering)' (b24). */
1834 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1835 				(si->fbc.bytes_per_row | (1 << 16) | (1 << 24)); /* SourcePitch */
1836 			/* setup source surface location */
1837 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1838 				((uint32)((uint8*)si->fbc.frame_buffer - (uint8*)si->framebuffer)) +
1839 				(list[i].src_top * si->fbc.bytes_per_row) +	(list[i].src_left * bpp); /* Offset */
1840 			/* setup source start: first (sub)pixel contributing to output picture */
1841 			/* note:
1842 			 * clipping is not asked for.
1843 			 * look at nVidia NV10+ bes engine code for useage example. */
1844 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1845 				0; /* SourceRef (b0-15 = hor, b16-31 = ver: both in 12.4 format) */
1846 
1847 			i++;
1848 		}
1849 
1850 		/* tell the engine to fetch the commands in the DMA buffer that where not
1851 		 * executed before. */
1852 		nv_start_dma();
1853 	}
1854 
1855 	/* reset surface depth settings so the other engine commands works as intended */
1856 	if (si->dm.space == B_RGB15_LITTLE)
1857 	{
1858 		/* wait for room in fifo for surface setup cmd if needed */
1859 		if (nv_acc_fifofree_dma(2) != B_OK) return;
1860 		/* now setup 2D surface (writing 1 32bit word) */
1861 		nv_acc_cmd_dma(NV4_SURFACE, NV4_SURFACE_FORMAT, 1);
1862 		((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000004; /* Format */
1863 
1864 		/* tell the engine to fetch the commands in the DMA buffer that where not
1865 		 * executed before. */
1866 		nv_start_dma();
1867 	}
1868 
1869 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1870 	si->engine.threeD.reload = 0xffffffff;
1871 }
1872 
1873 /* rectangle fill - i.e. workspace and window background color */
1874 void FILL_RECTANGLE_DMA(engine_token *et, uint32 colorIndex, fill_rect_params *list, uint32 count)
1875 {
1876 	uint32 i = 0;
1877 	uint16 subcnt;
1878 
1879 	/*** init acc engine for fill function ***/
1880 	/* ROP registers (Raster OPeration):
1881 	 * wait for room in fifo for ROP and bitmap cmd if needed. */
1882 	if (nv_acc_fifofree_dma(4) != B_OK) return;
1883 	/* now setup ROP (writing 2 32bit words) for GXcopy */
1884 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
1885 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
1886 	/* now setup fill color (writing 2 32bit words) */
1887 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1888 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = colorIndex; /* Color1A */
1889 
1890 	/*** draw each rectangle ***/
1891 	while (count)
1892 	{
1893 		/* break up the list in sublists to minimize calls, while making sure long
1894 		 * lists still get executed without trouble */
1895 		subcnt = 32;
1896 		if (count < 32) subcnt = count;
1897 		count -= subcnt;
1898 
1899 		/* wait for room in fifo for bitmap cmd if needed. */
1900 		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
1901 
1902 		/* issue fill command once... */
1903 		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
1904 		/* ... and send multiple rects (engine cmd supports 32 max) */
1905 		while (subcnt--)
1906 		{
1907 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1908 				(((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
1909 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1910 				(((((list[i].right)+1) - (list[i].left)) << 16) |
1911 				(((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */
1912 
1913 			i++;
1914 		}
1915 
1916 		/* tell the engine to fetch the commands in the DMA buffer that where not
1917 		 * executed before. */
1918 		nv_start_dma();
1919 	}
1920 
1921 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1922 	si->engine.threeD.reload = 0xffffffff;
1923 }
1924 
1925 /* span fill - i.e. (selected) menuitem background color (Dano) */
1926 void FILL_SPAN_DMA(engine_token *et, uint32 colorIndex, uint16 *list, uint32 count)
1927 {
1928 	uint32 i = 0;
1929 	uint16 subcnt;
1930 
1931 	/*** init acc engine for fill function ***/
1932 	/* ROP registers (Raster OPeration):
1933 	 * wait for room in fifo for ROP and bitmap cmd if needed. */
1934 	if (nv_acc_fifofree_dma(4) != B_OK) return;
1935 	/* now setup ROP (writing 2 32bit words) for GXcopy */
1936 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
1937 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0xcc; /* SetRop5 */
1938 	/* now setup fill color (writing 2 32bit words) */
1939 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1940 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = colorIndex; /* Color1A */
1941 
1942 	/*** draw each span ***/
1943 	while (count)
1944 	{
1945 		/* break up the list in sublists to minimize calls, while making sure long
1946 		 * lists still get executed without trouble */
1947 		subcnt = 32;
1948 		if (count < 32) subcnt = count;
1949 		count -= subcnt;
1950 
1951 		/* wait for room in fifo for bitmap cmd if needed. */
1952 		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
1953 
1954 		/* issue fill command once... */
1955 		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
1956 		/* ... and send multiple rects (spans) (engine cmd supports 32 max) */
1957 		while (subcnt--)
1958 		{
1959 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1960 				(((list[i+1]) << 16) | ((list[i]) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
1961 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
1962 				((((list[i+2]+1) - (list[i+1])) << 16) | 0x00000001); /* Unclipped Rect 0 WidthHeight */
1963 
1964 			i+=3;
1965 		}
1966 
1967 		/* tell the engine to fetch the commands in the DMA buffer that where not
1968 		 * executed before. */
1969 		nv_start_dma();
1970 	}
1971 
1972 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
1973 	si->engine.threeD.reload = 0xffffffff;
1974 }
1975 
1976 /* rectangle invert - i.e. text cursor and text selection */
1977 void INVERT_RECTANGLE_DMA(engine_token *et, fill_rect_params *list, uint32 count)
1978 {
1979 	uint32 i = 0;
1980 	uint16 subcnt;
1981 
1982 	/*** init acc engine for invert function ***/
1983 	/* ROP registers (Raster OPeration):
1984 	 * wait for room in fifo for ROP and bitmap cmd if needed. */
1985 	if (nv_acc_fifofree_dma(4) != B_OK) return;
1986 	/* now setup ROP (writing 2 32bit words) for GXinvert */
1987 	nv_acc_cmd_dma(NV_ROP5_SOLID, NV_ROP5_SOLID_SETROP5, 1);
1988 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x55; /* SetRop5 */
1989 	/* now reset fill color (writing 2 32bit words) */
1990 	nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_COLOR1A, 1);
1991 	((uint32*)(si->dma_buffer))[si->engine.dma.current++] = 0x00000000; /* Color1A */
1992 
1993 	/*** invert each rectangle ***/
1994 	while (count)
1995 	{
1996 		/* break up the list in sublists to minimize calls, while making sure long
1997 		 * lists still get executed without trouble */
1998 		subcnt = 32;
1999 		if (count < 32) subcnt = count;
2000 		count -= subcnt;
2001 
2002 		/* wait for room in fifo for bitmap cmd if needed. */
2003 		if (nv_acc_fifofree_dma(1 + (2 * subcnt)) != B_OK) return;
2004 
2005 		/* issue fill command once... */
2006 		nv_acc_cmd_dma(NV4_GDI_RECTANGLE_TEXT, NV4_GDI_RECTANGLE_TEXT_UCR0_LEFTTOP, (2 * subcnt));
2007 		/* ... and send multiple rects (engine cmd supports 32 max) */
2008 		while (subcnt--)
2009 		{
2010 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2011 				(((list[i].left) << 16) | ((list[i].top) & 0x0000ffff)); /* Unclipped Rect 0 LeftTop */
2012 			((uint32*)(si->dma_buffer))[si->engine.dma.current++] =
2013 				(((((list[i].right)+1) - (list[i].left)) << 16) |
2014 				(((list[i].bottom-list[i].top)+1) & 0x0000ffff)); /* Unclipped Rect 0 WidthHeight */
2015 
2016 			i++;
2017 		}
2018 
2019 		/* tell the engine to fetch the commands in the DMA buffer that where not
2020 		 * executed before. */
2021 		nv_start_dma();
2022 	}
2023 
2024 	/* tell 3D add-ons that they should reload their rendering states and surfaces */
2025 	si->engine.threeD.reload = 0xffffffff;
2026 }
2027