/*
	Copyright (c) 2002, Thomas Kurschel


	Part of Radeon accelerant

	Command Processor handling


	Something about synchronization in general:

	The DDK says that only some register accesses are stored in the
	Command FIFO, i.e. in almost all cases you don't have to wait until
	there is enough space in this FIFO. Unfortunately, ATI doesn't speak
	clearly here and doesn't tell you which registers are buffered and
	which are not (the r300 DDK provides some examples only, other DDKs
	refer to some include file where no such info could be found).

	Looking at pre-Radeon specs, we have the following register ranges:
		0		configuration/display/multi-media registers
		0xf00	read-only PCI configuration space
		0x1000	CCE registers
		0x1400	FIFOed GUI-registers

	So, if the list is still correct, the affected registers are only
	those used for 2D/3D drawing.

	This is very important: if the register you want to write is
	buffered, you have to busy-wait until there is enough FIFO space.
	As concurrent threads may do the same, register access should only
	be done with a lock held. We never write GUI-registers directly,
	so we never have to wait for the FIFO and thus don't need this lock.

*/
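
/*
	Illustrative sketch only (not part of the accelerant): a direct write to a
	FIFOed GUI register would first have to busy-wait for free FIFO entries,
	roughly like below. The register and mask names (RADEON_RBBM_STATUS,
	RADEON_RBBM_FIFOCNT_MASK) are assumptions borrowed from other Radeon
	drivers and may not exist in the headers used here; this accelerant avoids
	the problem entirely by submitting all GUI commands through the CP ring
	buffer instead.

	static void waitForFifo( accelerator_info *ai, uint entries )
	{
		// poll the command FIFO counter until enough entries are free
		while( (INREG( ai->regs, RADEON_RBBM_STATUS )
				& RADEON_RBBM_FIFOCNT_MASK) < entries )
			Radeon_Spin( 1 );
	}
*/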

#include "radeon_accelerant.h"
#include "mmio.h"
#include "buscntrl_regs.h"
#include "utils.h"
#include <sys/ioctl.h>
#include "CP.h"

#include "log_coll.h"
#include "log_enum.h"

#include <string.h>


// get number of free entries in CP's ring buffer
static uint getAvailRingBuffer( accelerator_info *ai )
{
	CP_info *cp = &ai->si->cp;
	int space;

	space =
		*(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data + cp->feedback.head_mem_offset)
		//*cp->ring.head
		- cp->ring.tail;
	//space = INREG( ai->regs, RADEON_CP_RB_RPTR ) - cp->ring.tail;

	if( space <= 0 )
		space += cp->ring.size;

	// don't fill up the entire buffer as we cannot
	// distinguish between a full and an empty ring
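	// (e.g. with a ring of 256 dwords, head == tail could mean either 0 or
	//  256 pending dwords; keeping one entry unused makes head == tail
	//  unambiguously mean "empty")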
	--space;

	SHOW_FLOW( 3, "head=%ld, tail=%ld, space=%ld",
		*(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data + cp->feedback.head_mem_offset),
		//*cp->ring.head,
		cp->ring.tail, space );

	LOG1( ai->si->log, _GetAvailRingBufferQueue, space );

	cp->ring.space = space;

	return space;
}


// mark all indirect buffers that have been processed as being free;
// lock must be held
void Radeon_FreeIndirectBuffers( accelerator_info *ai )
{
	CP_info *cp = &ai->si->cp;
	int32 cur_processed_tag =
		((uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data + cp->feedback.scratch_mem_offset))[1];
		//ai->si->cp.scratch.ptr[1];
	//INREG( ai->regs, RADEON_SCRATCH_REG1 );

	SHOW_FLOW( 3, "processed_tag=%d", cur_processed_tag );

	// mark all sent indirect buffers as free
	while( cp->buffers.oldest != -1 ) {
		indirect_buffer *oldest_buffer =
			&cp->buffers.buffers[cp->buffers.oldest];
		int tmp_oldest_buffer;

		SHOW_FLOW( 3, "oldest buffer's tag: %d", oldest_buffer->send_tag );

		// this is a tricky calculation to handle wrap-arounds correctly,
		// so don't change it unless you really understand the signedness problem
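		// (example: after a wrap-around cur_processed_tag may be 2 while the
		//  oldest send_tag is 0xfffffffe; the signed difference
		//  (int32)(2 - 0xfffffffe) = 4 is >= 0, so the buffer is correctly
		//  treated as already processed)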
		if( (int32)(cur_processed_tag - oldest_buffer->send_tag) < 0 )
			break;

		SHOW_FLOW( 3, "mark %d as being free", oldest_buffer->send_tag );

		// remove buffer from "used" list
		tmp_oldest_buffer = oldest_buffer->next;

		if( tmp_oldest_buffer == -1 )
			cp->buffers.newest = -1;

		// put it on free list
		oldest_buffer->next = cp->buffers.free_list;
		cp->buffers.free_list = cp->buffers.oldest;

		cp->buffers.oldest = tmp_oldest_buffer;
	}
}


// wait until an indirect buffer becomes available;
// lock must be held
static void Radeon_WaitForFreeIndirectBuffers( accelerator_info *ai )
{
	bigtime_t start_time;
	CP_info *cp = &ai->si->cp;

	SHOW_FLOW0( 3, "" );

	start_time = system_time();

	while( 1 ) {
		bigtime_t sample_time;

		Radeon_FreeIndirectBuffers( ai );

		if( cp->buffers.free_list >= 0 )
			return;

		sample_time = system_time();

		if( sample_time - start_time > 100000 )
			break;

		RELEASE_BEN( cp->lock );

		// use exponential fall-off:
		// in the beginning do busy-waiting, later on we let the thread sleep;
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );
		else
			Radeon_Spin( 1 );

		ACQUIRE_BEN( cp->lock );
	}

	SHOW_ERROR0( 0, "All buffers are in use and the engine doesn't finish any of them" );

	// lock must be released during reset (reset acquires it automatically)
	RELEASE_BEN( cp->lock );
	Radeon_ResetEngine( ai );
	ACQUIRE_BEN( cp->lock );
}

// allocate an indirect buffer
int Radeon_AllocIndirectBuffer( accelerator_info *ai, bool keep_lock )
{
	CP_info *cp = &ai->si->cp;
	int buffer_idx;

	SHOW_FLOW0( 3, "" );

	ACQUIRE_BEN( cp->lock );

	if( cp->buffers.free_list == -1 )
		Radeon_WaitForFreeIndirectBuffers( ai );

	buffer_idx = cp->buffers.free_list;
	cp->buffers.free_list = cp->buffers.buffers[buffer_idx].next;

	//if( !keep_lock )
		RELEASE_BEN( cp->lock );
	(void)keep_lock;

	SHOW_FLOW( 3, "got %d", buffer_idx );

	return buffer_idx;
}
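
/*
	Typical usage (sketch only; assumes the caller fills the buffer with real
	CP packets - RADEON_CP_PACKET2 is just a nop used as a placeholder here):

		int buffer_idx = Radeon_AllocIndirectBuffer( ai, false );
		uint32 *buffer = Radeon_GetIndirectBufferPtr( ai, buffer_idx );
		int count = 0;

		buffer[count++] = RADEON_CP_PACKET2;	// replace with real packets

		// no special state needed, so state_buffer_size is zero;
		// the buffer is recycled automatically once the CP has processed it
		Radeon_SendIndirectBuffer( ai, buffer_idx, count, 0, 0, false );
*/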


// explicitly free an indirect buffer;
// this is not needed if the buffer was sent via SendIndirectBuffer()
// never_used	- 	set to true if the buffer was never sent, not even
//					indirectly as a state buffer
// !Warning!
// if never_used is false, execution may take very long as all buffers
// must be flushed!
void Radeon_FreeIndirectBuffer( accelerator_info *ai, int buffer_idx, bool never_used )
{
	CP_info *cp = &ai->si->cp;

	SHOW_FLOW( 3, "buffer_idx=%d, never_used=%d", buffer_idx, never_used );

	// if the buffer was used as a state buffer, we don't record its usage,
	// so we don't know if the buffer was/is/will be used;
	// the only way to be sure is to let the CP run dry
	if( !never_used )
		Radeon_WaitForIdle( ai, false );

	ACQUIRE_BEN( cp->lock );

	cp->buffers.buffers[buffer_idx].next = cp->buffers.free_list;
	cp->buffers.free_list = buffer_idx;

	RELEASE_BEN( cp->lock );

	SHOW_FLOW0( 3, "done" );
}

// forward declaration; the definition is kept at the end of this file so the
// compiler doesn't inline it at every WRITE_RB_START
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords );


// start writing to ring buffer
// num_dwords - number of dwords to write (must be precise!)
// !Warning!
// during wait, CP's benaphore is released
#define WRITE_RB_START( num_dwords ) \
	{ \
		uint32 *ring_start; \
		uint32 ring_tail, ring_tail_mask; \
		uint32 ring_tail_increment = (num_dwords); \
		if( cp->ring.space < ring_tail_increment ) \
			Radeon_WaitForRingBufferSpace( ai, ring_tail_increment ); \
		ring_start = \
		(uint32 *)(ai->mapped_memory[cp->ring.mem_type].data + cp->ring.mem_offset); \
			/*cp->ring.start;*/ \
		ring_tail = cp->ring.tail; \
		ring_tail_mask = cp->ring.tail_mask;

// write single dword to ring buffer
#define WRITE_RB( value ) \
	{ \
		uint32 val = (value); \
		SHOW_FLOW( 3, "@%d: %x", ring_tail, val ); \
		ring_start[ring_tail++] = val; \
		ring_tail &= ring_tail_mask; \
	}

// finish writing to ring buffer
#define WRITE_RB_FINISH \
		cp->ring.tail = ring_tail; \
		cp->ring.space -= ring_tail_increment; \
	}
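
/*
	Minimal sketch of how the three macros above compose (hypothetical helper,
	not part of this file): writing a single register through the ring buffer.
	The caller must hold cp->lock, and "cp" and "ai" must be in scope, as the
	macros expect.

	static void writeRegRB( accelerator_info *ai, uint32 reg, uint32 value )
	{
		CP_info *cp = &ai->si->cp;

		WRITE_RB_START( 2 );				// packet header + one data dword
		WRITE_RB( CP_PACKET0( reg, 1 ));	// write one register, starting at "reg"
		WRITE_RB( value );
		WRITE_RB_FINISH;
	}

	A real caller would still have to flush CPU writes and update
	RADEON_CP_RB_WPTR afterwards, as Radeon_SendIndirectBuffer below does.
*/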

// submit an indirect buffer for execution;
// the indirect buffer must not be used afterwards!
// buffer_idx			- index of indirect buffer to submit
// buffer_size  		- size of indirect buffer in 32-bit dwords
// state_buffer_idx		- index of indirect buffer to restore required state
// state_buffer_size	- size of indirect buffer to restore required state
// the buffer is tagged internally so it can be freed once the CP has
// processed it (see Radeon_FreeIndirectBuffers)
// if no special state is required, set state_buffer_size to zero
void Radeon_SendIndirectBuffer( accelerator_info *ai,
	int buffer_idx, int buffer_size,
	int state_buffer_idx, int state_buffer_size, bool has_lock )
{
	CP_info *cp = &ai->si->cp;
	bool need_stateupdate;

	SHOW_FLOW( 3, "buffer_idx=%d, buffer_size=%d, state_buffer_idx=%d, state_buffer_size=%d",
		buffer_idx, buffer_size, state_buffer_idx, state_buffer_size );

	if( (buffer_size & 1) != 0 ) {
		SHOW_FLOW( 3, "buffer has uneven size (%d)", buffer_size );
		// size of indirect buffers _must_ be a multiple of 64 bits, so
		// add a nop to fulfil alignment
		Radeon_GetIndirectBufferPtr( ai, buffer_idx )[buffer_size] = RADEON_CP_PACKET2;
		buffer_size += 1;
	}

	//if( !has_lock )
		ACQUIRE_BEN( cp->lock );
	(void)has_lock;

	need_stateupdate =
		state_buffer_size > 0 && state_buffer_idx != cp->buffers.active_state;

	WRITE_RB_START( 5 + (need_stateupdate ? 3 : 0) );

	// if the indirect buffer to submit requires a special state and the
	// hardware is in the wrong state, execute the state buffer first
	if( need_stateupdate ) {
		SHOW_FLOW0( 3, "update state" );

		WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
		WRITE_RB( cp->buffers.vm_start +
			state_buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
		WRITE_RB( state_buffer_size );

		cp->buffers.active_state = state_buffer_idx;
	}

	// execute indirect buffer
	WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
	WRITE_RB( cp->buffers.vm_start + buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
	WRITE_RB( buffer_size );

	// give buffer a tag so it can be freed after execution
	WRITE_RB( CP_PACKET0( RADEON_SCRATCH_REG1, 1 ));
	WRITE_RB( cp->buffers.buffers[buffer_idx].send_tag = (int32)++cp->buffers.cur_tag );

	SHOW_FLOW( 3, "Assigned tag %d", cp->buffers.buffers[buffer_idx].send_tag );

	WRITE_RB_FINISH;
	// append buffer to list of submitted buffers
	// (-1 marks an empty list; 0 is a valid buffer index)
	if( cp->buffers.newest >= 0 )
		cp->buffers.buffers[cp->buffers.newest].next = buffer_idx;
	else
		cp->buffers.oldest = buffer_idx;

	cp->buffers.newest = buffer_idx;
	cp->buffers.buffers[buffer_idx].next = -1;

	// flush writes to CP buffers
	// (this code is a bit of overkill - currently, only some WinChip/Cyrix
	//  CPUs support out-of-order writes, but we are prepared)
	// TODO : Other Architectures? PowerPC?
	#ifdef __i386__
	__asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
	#endif
	// make sure the motherboard chipset has flushed its write buffer by
	// reading some uncached memory
	//(void)*(volatile int *)si->framebuffer;
	INREG( ai->regs, RADEON_CP_RB_RPTR );

	//SHOW_FLOW( 3, "new tail: %d", cp->ring.tail );

	//snooze( 100 );

	// now, the command list should really be written to memory,
	// so it's safe to instruct the graphics card to read it
	OUTREG( ai->regs, RADEON_CP_RB_WPTR, cp->ring.tail );

	// read from PCI bus to ensure correct posting
	//INREG( ai->regs, RADEON_CP_RB_RPTR );

	RELEASE_BEN( cp->lock );

	SHOW_FLOW0( 3, "done" );
}


// mark state buffer as being invalid;
// this must be done _before_ modifying the state buffer as the
// state buffer may be in use
void Radeon_InvalidateStateBuffer( accelerator_info *ai, int state_buffer_idx )
{
	CP_info *cp = &ai->si->cp;

	// make sure state buffer is not used anymore
	Radeon_WaitForIdle( ai, false );

	ACQUIRE_BEN( cp->lock );

	// mark state as being invalid
	if( cp->buffers.active_state == state_buffer_idx )
		cp->buffers.active_state = -1;

	RELEASE_BEN( cp->lock );
}


// wait until there is enough space in ring buffer
// num_dwords - number of dwords needed in ring buffer
// must be called with benaphore held
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords )
{
	bigtime_t start_time;
	CP_info *cp = &ai->si->cp;

	start_time = system_time();

	while( getAvailRingBuffer( ai ) < num_dwords ) {
		bigtime_t sample_time;

		sample_time = system_time();

		if( sample_time - start_time > 100000 )
			break;

		RELEASE_BEN( cp->lock );

		// use exponential fall-off:
		// in the beginning do busy-waiting, later on we let the thread sleep;
		// the micro-spin is used to reduce PCI load
		if( sample_time - start_time > 5000 )
			snooze( (sample_time - start_time) / 10 );
		else
			Radeon_Spin( 1 );

		ACQUIRE_BEN( cp->lock );
	}
}
409