/*
    Copyright (c) 2002, Thomas Kurschel


    Part of Radeon accelerant

    Command Processor handling


    A note about synchronization in general:

    According to the DDK, only some register accesses are stored in the
    Command FIFO, i.e. in almost all cases you don't have to wait until
    there is enough space in this FIFO. Unfortunately, ATI isn't very
    clear here and doesn't tell you which registers are buffered and
    which are not (the r300 DDK provides some examples only; other DDKs
    refer to some include file where no such info could be found).

    Looking at pre-Radeon specs, we have the following register ranges:
        0       configuration/display/multi-media registers
        0xf00   read-only PCI configuration space
        0x1000  CCE registers
        0x1400  FIFOed GUI-registers

    So, if this list is still correct, the affected registers are only
    those used for 2D/3D drawing.

    This is very important: if the register you want to write is
    buffered, you have to busy-wait until there is enough FIFO
    space. As concurrent threads may do the same, register access must
    only be done with a lock held. We never write GUI-registers directly,
    so we never have to wait for the FIFO and thus don't need this lock.

*/

#include "radeon_accelerant.h"
#include "mmio.h"
#include "buscntrl_regs.h"
#include "utils.h"
#include <sys/ioctl.h>
#include "CP.h"

#include "log_coll.h"
#include "log_enum.h"

#include <string.h>
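
/*
    For illustration only: if we ever did write a FIFOed GUI-register
    directly, a wait like the minimal sketch below would be needed first,
    and it would have to run under the lock mentioned above. This is not
    code used by this accelerant; the register and mask names
    (RADEON_RBBM_STATUS, RADEON_RBBM_FIFOCNT_MASK) follow the common
    XFree86/DDK naming and are assumptions here.

        static void waitForFifo( accelerator_info *ai, uint entries )
        {
            // spin until the command FIFO reports enough free slots
            while( (INREG( ai->regs, RADEON_RBBM_STATUS )
                    & RADEON_RBBM_FIFOCNT_MASK) < entries )
                Radeon_Spin( 1 );
        }
*/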
// get number of free entries in CP's ring buffer
static uint getAvailRingBuffer( accelerator_info *ai )
{
    CP_info *cp = &ai->si->cp;
    int space;

    space =
        *(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data + cp->feedback.head_mem_offset)
        //*cp->ring.head
        - cp->ring.tail;
    //space = INREG( ai->regs, RADEON_CP_RB_RPTR ) - cp->ring.tail;

    if( space <= 0 )
        space += cp->ring.size;

    // don't fill up the entire buffer as we cannot
    // distinguish between a full and an empty ring
    --space;

    SHOW_FLOW( 3, "head=%ld, tail=%ld, space=%ld",
        *(uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data + cp->feedback.head_mem_offset),
        //*cp->ring.head,
        cp->ring.tail, space );

    LOG1( ai->si->log, _GetAvailRingBufferQueue, space );

    cp->ring.space = space;

    return space;
}


// mark all indirect buffers that have been processed as being free;
// lock must be held
void Radeon_FreeIndirectBuffers( accelerator_info *ai )
{
    CP_info *cp = &ai->si->cp;
    int32 cur_processed_tag =
        ((uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data + cp->feedback.scratch_mem_offset))[1];
        //ai->si->cp.scratch.ptr[1];
        //INREG( ai->regs, RADEON_SCRATCH_REG1 );

    SHOW_FLOW( 3, "processed_tag=%d", cur_processed_tag );

    // mark all sent indirect buffers as free
    while( cp->buffers.oldest != -1 ) {
        indirect_buffer *oldest_buffer =
            &cp->buffers.buffers[cp->buffers.oldest];
        int tmp_oldest_buffer;

        SHOW_FLOW( 3, "oldest buffer's tag: %d", oldest_buffer->send_tag );

        // this is a tricky calculation to handle wrap-arounds correctly,
        // so don't change it unless you really understand the signedness problem
        if( (int32)(cur_processed_tag - oldest_buffer->send_tag) < 0 )
            break;

        SHOW_FLOW( 3, "mark %d as being free", oldest_buffer->send_tag );

        // remove buffer from "used" list
        tmp_oldest_buffer = oldest_buffer->next;

        if( tmp_oldest_buffer == -1 )
            cp->buffers.newest = -1;

        // put it on free list
        oldest_buffer->next = cp->buffers.free_list;
        cp->buffers.free_list = cp->buffers.oldest;

        cp->buffers.oldest = tmp_oldest_buffer;
    }
}


// wait until an indirect buffer becomes available;
// lock must be held
static void Radeon_WaitForFreeIndirectBuffers( accelerator_info *ai )
{
    bigtime_t start_time;
    CP_info *cp = &ai->si->cp;

    SHOW_FLOW0( 3, "" );

    start_time = system_time();

    while( 1 ) {
        bigtime_t sample_time;

        Radeon_FreeIndirectBuffers( ai );

        if( cp->buffers.free_list >= 0 )
            return;

        sample_time = system_time();

        if( sample_time - start_time > 100000 )
            break;

        RELEASE_BEN( cp->lock );

        // use exponential fall-off:
        // in the beginning do busy-waiting, later on we let the thread sleep;
        // the micro-spin is used to reduce PCI load
        if( sample_time - start_time > 5000 )
            snooze( (sample_time - start_time) / 10 );
        else
            Radeon_Spin( 1 );

        ACQUIRE_BEN( cp->lock );
    }

    SHOW_ERROR0( 0, "All buffers are in use and the engine doesn't finish any of them" );

    // lock must be released during reset (reset acquires it automatically)
    RELEASE_BEN( cp->lock );
    Radeon_ResetEngine( ai );
    ACQUIRE_BEN( cp->lock );
}


// allocate an indirect buffer
int Radeon_AllocIndirectBuffer( accelerator_info *ai, bool keep_lock )
{
    CP_info *cp = &ai->si->cp;
    int buffer_idx;

    SHOW_FLOW0( 3, "" );

    ACQUIRE_BEN( cp->lock );

    if( cp->buffers.free_list == -1 )
        Radeon_WaitForFreeIndirectBuffers( ai );

    buffer_idx = cp->buffers.free_list;
    cp->buffers.free_list = cp->buffers.buffers[buffer_idx].next;

    //if( !keep_lock )
    RELEASE_BEN( cp->lock );
    (void)keep_lock;

    SHOW_FLOW( 3, "got %d", buffer_idx );

    return buffer_idx;
}


// explicitly free an indirect buffer;
// this is not needed if the buffer was sent via SendIndirectBuffer()
// never_used - set to true if the buffer was never sent, not even
//              indirectly as a state buffer
// !Warning!
// if never_used is false, execution may take very long as all buffers
// must be flushed!
void Radeon_FreeIndirectBuffer( accelerator_info *ai, int buffer_idx, bool never_used )
{
    CP_info *cp = &ai->si->cp;

    SHOW_FLOW( 3, "buffer_idx=%d, never_used=%d", buffer_idx, never_used );

    // if the buffer was used as a state buffer, we don't record its usage,
    // so we don't know if the buffer was/is/will be used;
    // the only way to be sure is to let the CP run dry
    if( !never_used )
        Radeon_WaitForIdle( ai, false );

    ACQUIRE_BEN( cp->lock );

    cp->buffers.buffers[buffer_idx].next = cp->buffers.free_list;
    cp->buffers.free_list = buffer_idx;

    RELEASE_BEN( cp->lock );

    SHOW_FLOW0( 3, "done" );
}
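
/*
    Typical indirect buffer life-cycle, as a minimal sketch (not code used
    by this accelerant): Radeon_GetIndirectBufferPtr() is assumed to return
    a uint32 pointer to the buffer's memory (it is used that way in
    Radeon_SendIndirectBuffer() below), and SOME_GUI_REG / some_value are
    placeholders; CP_PACKET0 is used the same way it is used in this file.

        int buffer_idx = Radeon_AllocIndirectBuffer( ai, false );
        uint32 *buffer = Radeon_GetIndirectBufferPtr( ai, buffer_idx );
        int size = 0;

        // program one GUI-register via the indirect buffer
        buffer[size++] = CP_PACKET0( SOME_GUI_REG, 1 );
        buffer[size++] = some_value;

        // submit without a special state buffer (state_buffer_size = 0)
        // and without holding the CP lock
        Radeon_SendIndirectBuffer( ai, buffer_idx, size, 0, 0, false );
*/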
// this function must be moved to end of file to avoid inlining
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords );


// start writing to ring buffer
// num_dwords - number of dwords to write (must be precise!)
// !Warning!
// during wait, CP's benaphore is released
#define WRITE_RB_START( num_dwords ) \
    { \
        uint32 *ring_start; \
        uint32 ring_tail, ring_tail_mask; \
        uint32 ring_tail_increment = (num_dwords); \
        if( cp->ring.space < ring_tail_increment ) \
            Radeon_WaitForRingBufferSpace( ai, ring_tail_increment ); \
        ring_start = \
            (uint32 *)(ai->mapped_memory[cp->ring.mem_type].data + cp->ring.mem_offset); \
            /*cp->ring.start;*/ \
        ring_tail = cp->ring.tail; \
        ring_tail_mask = cp->ring.tail_mask;

// write single dword to ring buffer
#define WRITE_RB( value ) \
    { \
        uint32 val = (value); \
        SHOW_FLOW( 3, "@%d: %x", ring_tail, val ); \
        ring_start[ring_tail++] = val; \
        ring_tail &= ring_tail_mask; \
    }

// finish writing to ring buffer
#define WRITE_RB_FINISH \
        cp->ring.tail = ring_tail; \
        cp->ring.space -= ring_tail_increment; \
    }
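
/*
    How the macros above fit together, as a minimal sketch (the real use is
    in Radeon_SendIndirectBuffer() below); cp and ai must be in scope and
    the CP benaphore must be held, exactly as in that function. some_tag is
    a placeholder value.

        WRITE_RB_START( 2 );            // reserve exactly 2 dwords

        WRITE_RB( CP_PACKET0( RADEON_SCRATCH_REG1, 1 ));
        WRITE_RB( some_tag );

        WRITE_RB_FINISH;                // publish the new tail locally

        // the CP only sees the new commands once the write pointer is
        // updated (see Radeon_SendIndirectBuffer() for the barriers
        // required before doing this)
        OUTREG( ai->regs, RADEON_CP_RB_WPTR, cp->ring.tail );
*/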
// submit indirect buffer for execution;
// the indirect buffer must not be used afterwards!
// buffer_idx - index of indirect buffer to submit
// buffer_size - size of indirect buffer in 32-bit words (dwords)
// state_buffer_idx - index of indirect buffer to restore required state
// state_buffer_size - size of indirect buffer to restore required state
// the assigned tag is stored in the buffer's send_tag (so you can wait for its execution);
// if no special state is required, set state_buffer_size to zero
void Radeon_SendIndirectBuffer( accelerator_info *ai,
    int buffer_idx, int buffer_size,
    int state_buffer_idx, int state_buffer_size, bool has_lock )
{
    CP_info *cp = &ai->si->cp;
    bool need_stateupdate;

    SHOW_FLOW( 3, "buffer_idx=%d, buffer_size=%d, state_buffer_idx=%d, state_buffer_size=%d",
        buffer_idx, buffer_size, state_buffer_idx, state_buffer_size );

    if( (buffer_size & 1) != 0 ) {
        SHOW_FLOW( 3, "buffer has odd size (%d)", buffer_size );
        // size of indirect buffers _must_ be a multiple of 64 bits, so
        // add a nop to fulfill the alignment
        Radeon_GetIndirectBufferPtr( ai, buffer_idx )[buffer_size] = RADEON_CP_PACKET2;
        buffer_size += 1;
    }

    //if( !has_lock )
    ACQUIRE_BEN( cp->lock );
    (void)has_lock;

    need_stateupdate =
        state_buffer_size > 0 && state_buffer_idx != cp->buffers.active_state;

    WRITE_RB_START( 5 + (need_stateupdate ? 3 : 0) );

    // if the indirect buffer to submit requires a special state and the
    // hardware is in the wrong state then execute the state buffer
    if( need_stateupdate ) {
        SHOW_FLOW0( 3, "update state" );

        WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
        WRITE_RB( cp->buffers.vm_start +
            state_buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
        WRITE_RB( state_buffer_size );

        cp->buffers.active_state = state_buffer_idx;
    }

    // execute indirect buffer
    WRITE_RB( CP_PACKET0( RADEON_CP_IB_BASE, 2 ));
    WRITE_RB( cp->buffers.vm_start + buffer_idx * INDIRECT_BUFFER_SIZE * sizeof( uint32 ));
    WRITE_RB( buffer_size );

    // give the buffer a tag so it can be freed after execution
    WRITE_RB( CP_PACKET0( RADEON_SCRATCH_REG1, 1 ));
    WRITE_RB( cp->buffers.buffers[buffer_idx].send_tag = (int32)++cp->buffers.cur_tag );

    SHOW_FLOW( 3, "Assigned tag %d", cp->buffers.buffers[buffer_idx].send_tag );

    WRITE_RB_FINISH;

    // append buffer to list of submitted buffers
    // (-1 means the list is empty)
    if( cp->buffers.newest >= 0 )
        cp->buffers.buffers[cp->buffers.newest].next = buffer_idx;
    else
        cp->buffers.oldest = buffer_idx;

    cp->buffers.newest = buffer_idx;
    cp->buffers.buffers[buffer_idx].next = -1;

    // flush writes to CP buffers
    // (this is a bit of overkill - currently, only some WinChip/Cyrix
    // CPUs support out-of-order writes, but we are prepared)
    __asm__ __volatile__ ("lock; addl $0,0(%%esp)": : :"memory");
    // make sure the motherboard chipset has flushed its write buffer by
    // reading some uncached memory
    //(void)*(volatile int *)si->framebuffer;
    INREG( ai->regs, RADEON_CP_RB_RPTR );

    //SHOW_FLOW( 3, "new tail: %d", cp->ring.tail );

    //snooze( 100 );

    // now, the command list should really be written to memory,
    // so it's safe to instruct the graphics card to read it
    OUTREG( ai->regs, RADEON_CP_RB_WPTR, cp->ring.tail );

    // read from PCI bus to ensure correct posting
    //INREG( ai->regs, RADEON_CP_RB_RPTR );

    RELEASE_BEN( cp->lock );

    SHOW_FLOW0( 3, "done" );
}
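
/*
    The tag written above ends up in scratch register 1, which is mirrored
    into the feedback memory read by Radeon_FreeIndirectBuffers(). A caller
    that wants to block until a specific buffer has been executed could
    poll it like the minimal sketch below (not code used by this
    accelerant); it reuses the wrap-around-safe signed comparison from
    Radeon_FreeIndirectBuffers().

        static void waitForTag( accelerator_info *ai, int32 tag )
        {
            CP_info *cp = &ai->si->cp;

            // spin until the CP has processed the buffer carrying 'tag'
            while( (int32)(((uint32 *)(ai->mapped_memory[cp->feedback.mem_type].data
                    + cp->feedback.scratch_mem_offset))[1] - tag) < 0 )
                Radeon_Spin( 1 );
        }
*/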
// mark state buffer as being invalid;
// this must be done _before_ modifying the state buffer as the
// state buffer may be in use
void Radeon_InvalidateStateBuffer( accelerator_info *ai, int state_buffer_idx )
{
    CP_info *cp = &ai->si->cp;

    // make sure state buffer is not used anymore
    Radeon_WaitForIdle( ai, false );

    ACQUIRE_BEN( cp->lock );

    // mark state as being invalid
    if( cp->buffers.active_state == state_buffer_idx )
        cp->buffers.active_state = -1;

    RELEASE_BEN( cp->lock );
}


// wait until there is enough space in the ring buffer
// num_dwords - number of dwords needed in ring buffer
// must be called with the benaphore held
void Radeon_WaitForRingBufferSpace( accelerator_info *ai, uint num_dwords )
{
    bigtime_t start_time;
    CP_info *cp = &ai->si->cp;

    start_time = system_time();

    while( getAvailRingBuffer( ai ) < num_dwords ) {
        bigtime_t sample_time;

        sample_time = system_time();

        if( sample_time - start_time > 100000 )
            break;

        RELEASE_BEN( cp->lock );

        // use exponential fall-off:
        // in the beginning do busy-waiting, later on we let the thread sleep;
        // the micro-spin is used to reduce PCI load
        if( sample_time - start_time > 5000 )
            snooze( (sample_time - start_time) / 10 );
        else
            Radeon_Spin( 1 );

        ACQUIRE_BEN( cp->lock );
    }
}