xref: /haiku/src/system/kernel/cache/block_cache.cpp (revision 138d92635d06584fde6e175814d8f30c9d94885d)
1 /*
2  * Copyright 2004-2020, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include <block_cache.h>
8 
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <errno.h>
13 #include <sys/uio.h>
14 
15 #include <KernelExport.h>
16 #include <fs_cache.h>
17 
18 #include <condition_variable.h>
19 #include <lock.h>
20 #include <low_resource_manager.h>
21 #include <slab/Slab.h>
22 #include <tracing.h>
23 #include <util/kernel_cpp.h>
24 #include <util/DoublyLinkedList.h>
25 #include <util/AutoLock.h>
26 #include <StackOrHeapArray.h>
27 #include <vm/vm_page.h>
28 
29 #ifndef BUILDING_USERLAND_FS_SERVER
30 #include "IORequest.h"
31 #endif // !BUILDING_USERLAND_FS_SERVER
32 #include "kernel_debug_config.h"
33 
34 
35 // TODO: this is a naive but growing implementation to test the API:
36 //	block reading/writing is not at all optimized for speed, it will
37 //	just read and write single blocks.
38 // TODO: the retrieval/copy of the original data could be delayed until the
39 //		new data must be written, ie. in low memory situations.
40 
41 #ifdef _KERNEL_MODE
42 #	define TRACE_ALWAYS(x...) dprintf(x)
43 #else
44 #	define TRACE_ALWAYS(x...) printf(x)
45 #endif
46 
47 //#define TRACE_BLOCK_CACHE
48 #ifdef TRACE_BLOCK_CACHE
49 #	define TRACE(x)	TRACE_ALWAYS(x)
50 #else
51 #	define TRACE(x) ;
52 #endif
53 
54 
55 // This macro is used for fatal situations that are acceptable in a running
56 // system, like out of memory situations - should only panic for debugging.
57 #define FATAL(x) panic x
58 
59 static const bigtime_t kTransactionIdleTime = 2000000LL;
60 	// a transaction is considered idle after 2 seconds of inactivity
61 
62 
63 namespace {
64 
65 struct cache_transaction;
66 struct cached_block;
67 struct block_cache;
68 typedef DoublyLinkedListLink<cached_block> block_link;
69 
70 struct cached_block {
71 	cached_block*	next;			// next in hash
72 	cached_block*	transaction_next;
73 	block_link		link;
74 	off_t			block_number;
75 	void*			current_data;
76 		// The data that is seen by everyone using the API; this one is always
77 		// present.
78 	void*			original_data;
79 		// When in a transaction, this contains the original data from before
80 		// the transaction.
81 	void*			parent_data;
82 		// This is a lazily alloced buffer that represents the contents of the
83 		// block in the parent transaction. It may point to current_data if the
84 		// contents have been changed only in the parent transaction, or, if the
85 		// block has been changed in the current sub transaction already, to a
86 		// new block containing the contents changed in the parent transaction.
87 		// If this is NULL, the block has not been changed in the parent
88 		// transaction at all.
89 #if BLOCK_CACHE_DEBUG_CHANGED
90 	void*			compare;
91 #endif
92 	int32			ref_count;
93 	int32			last_accessed;
94 	bool			busy_reading : 1;
95 	bool			busy_writing : 1;
96 	bool			is_writing : 1;
97 		// Block has been checked out for writing without transactions, and
98 		// cannot be written back if set
99 	bool			is_dirty : 1;
100 	bool			unused : 1;
101 	bool			discard : 1;
102 	bool			busy_reading_waiters : 1;
103 	bool			busy_writing_waiters : 1;
104 	cache_transaction* transaction;
105 		// This is the current active transaction, if any, the block is
106 		// currently in (meaning was changed as a part of it).
107 	cache_transaction* previous_transaction;
108 		// This is set to the last transaction that was ended containing this
109 		// block. In this case, the block has not yet written back yet, and
110 		// the changed data is either in current_data, or original_data -- the
111 		// latter if the block is already being part of another transaction.
112 		// There can only be one previous transaction, so when the active
113 		// transaction ends, the changes of the previous transaction have to
114 		// be written back before that transaction becomes the next previous
115 		// transaction.
116 
117 	bool CanBeWritten() const;
LastAccess__anon7a06ab4d0111::cached_block118 	int32 LastAccess() const
119 		{ return system_time() / 1000000L - last_accessed; }
120 };
121 
122 typedef DoublyLinkedList<cached_block,
123 	DoublyLinkedListMemberGetLink<cached_block,
124 		&cached_block::link> > block_list;
125 
126 struct cache_notification : DoublyLinkedListLinkImpl<cache_notification> {
127 	static inline void* operator new(size_t size);
128 	static inline void operator delete(void* block);
129 
130 	int32			transaction_id;
131 	int32			events_pending;
132 	int32			events;
133 	transaction_notification_hook hook;
134 	void*			data;
135 	bool			delete_after_event;
136 };
137 
138 typedef DoublyLinkedList<cache_notification> NotificationList;
139 
140 static object_cache* sCacheNotificationCache;
141 
142 struct cache_listener;
143 typedef DoublyLinkedListLink<cache_listener> listener_link;
144 
145 struct cache_listener : cache_notification {
146 	listener_link	link;
147 };
148 
149 typedef DoublyLinkedList<cache_listener,
150 	DoublyLinkedListMemberGetLink<cache_listener,
151 		&cache_listener::link> > ListenerList;
152 
153 void*
operator new(size_t size)154 cache_notification::operator new(size_t size)
155 {
156 	// We can't really know whether something is a cache_notification or a
157 	// cache_listener at runtime, so we just use one object_cache for both
158 	// with the size set to that of the (slightly larger) cache_listener.
159 	// In practice, the vast majority of cache_notifications are really
160 	// cache_listeners, so this is a more than acceptable trade-off.
161 	ASSERT(size <= sizeof(cache_listener));
162 	return object_cache_alloc(sCacheNotificationCache, 0);
163 }
164 
165 void
operator delete(void * block)166 cache_notification::operator delete(void* block)
167 {
168 	object_cache_free(sCacheNotificationCache, block, 0);
169 }
170 
171 
172 struct BlockHash {
173 	typedef off_t			KeyType;
174 	typedef	cached_block	ValueType;
175 
HashKey__anon7a06ab4d0111::BlockHash176 	size_t HashKey(KeyType key) const
177 	{
178 		return key;
179 	}
180 
Hash__anon7a06ab4d0111::BlockHash181 	size_t Hash(ValueType* block) const
182 	{
183 		return block->block_number;
184 	}
185 
Compare__anon7a06ab4d0111::BlockHash186 	bool Compare(KeyType key, ValueType* block) const
187 	{
188 		return block->block_number == key;
189 	}
190 
GetLink__anon7a06ab4d0111::BlockHash191 	ValueType*& GetLink(ValueType* value) const
192 	{
193 		return value->next;
194 	}
195 };
196 
197 typedef BOpenHashTable<BlockHash> BlockTable;
198 
199 
200 struct TransactionHash {
201 	typedef int32				KeyType;
202 	typedef	cache_transaction	ValueType;
203 
HashKey__anon7a06ab4d0111::TransactionHash204 	size_t HashKey(KeyType key) const
205 	{
206 		return key;
207 	}
208 
209 	size_t Hash(ValueType* transaction) const;
210 	bool Compare(KeyType key, ValueType* transaction) const;
211 	ValueType*& GetLink(ValueType* value) const;
212 };
213 
214 typedef BOpenHashTable<TransactionHash> TransactionTable;
215 
216 
217 struct block_cache : DoublyLinkedListLinkImpl<block_cache> {
218 	BlockTable*		hash;
219 	mutex			lock;
220 	const int		fd;
221 	off_t			max_blocks;
222 	const size_t	block_size;
223 	int32			next_transaction_id;
224 	cache_transaction* last_transaction;
225 	TransactionTable* transaction_hash;
226 
227 	object_cache*	buffer_cache;
228 	block_list		unused_blocks;
229 	uint32			unused_block_count;
230 
231 	ConditionVariable busy_reading_condition;
232 	uint32			busy_reading_count;
233 	bool			busy_reading_waiters;
234 
235 	ConditionVariable busy_writing_condition;
236 	uint32			busy_writing_count;
237 	bool			busy_writing_waiters;
238 
239 	bigtime_t		last_block_write;
240 	bigtime_t		last_block_write_duration;
241 
242 	uint32			num_dirty_blocks;
243 	const bool		read_only;
244 
245 	NotificationList pending_notifications;
246 	ConditionVariable condition_variable;
247 
248 					block_cache(int fd, off_t numBlocks, size_t blockSize,
249 						bool readOnly);
250 					~block_cache();
251 
252 	status_t		Init();
253 
254 	void			Free(void* buffer);
255 	void*			Allocate();
256 	void			FreeBlock(cached_block* block);
257 	cached_block*	NewBlock(off_t blockNumber);
258 	void			FreeBlockParentData(cached_block* block);
259 
260 	void			RemoveUnusedBlocks(int32 count, int32 minSecondsOld = 0);
261 	void			RemoveBlock(cached_block* block);
262 	void			DiscardBlock(cached_block* block);
263 
264 private:
265 	static void		_LowMemoryHandler(void* data, uint32 resources,
266 						int32 level);
267 	cached_block*	_GetUnusedBlock();
268 };
269 
270 struct cache_transaction {
271 	cache_transaction();
272 
273 	cache_transaction* next;
274 	int32			id;
275 	int32			num_blocks;
276 	int32			main_num_blocks;
277 	int32			sub_num_blocks;
278 	cached_block*	first_block;
279 	block_list		blocks;
280 	ListenerList	listeners;
281 	bool			open;
282 	bool			has_sub_transaction;
283 	bigtime_t		last_used;
284 	int32			busy_writing_count;
285 };
286 
287 
288 class BlockWriter {
289 public:
290 								BlockWriter(block_cache* cache,
291 									size_t max = SIZE_MAX);
292 								~BlockWriter();
293 
294 			bool				Add(cached_block* block,
295 									cache_transaction* transaction = NULL);
296 			bool				Add(cache_transaction* transaction,
297 									bool& hasLeftOvers);
298 
299 			status_t			Write(cache_transaction* transaction = NULL,
300 									bool canUnlock = true);
301 
DeletedTransaction() const302 			bool				DeletedTransaction() const
303 									{ return fDeletedTransaction; }
304 
305 	static	status_t			WriteBlock(block_cache* cache,
306 									cached_block* block);
307 
308 private:
309 			void*				_Data(cached_block* block) const;
310 			status_t			_WriteBlocks(cached_block** blocks, uint32 count);
311 			void				_BlockDone(cached_block* block,
312 									cache_transaction* transaction);
313 			void				_UnmarkWriting(cached_block* block);
314 
315 	static	int					_CompareBlocks(const void* _blockA,
316 									const void* _blockB);
317 
318 private:
319 	static	const size_t		kBufferSize = 64;
320 
321 			block_cache*		fCache;
322 			cached_block*		fBuffer[kBufferSize];
323 			cached_block**		fBlocks;
324 			size_t				fCount;
325 			size_t				fTotal;
326 			size_t				fCapacity;
327 			size_t				fMax;
328 			status_t			fStatus;
329 			bool				fDeletedTransaction;
330 };
331 
332 
333 #ifndef BUILDING_USERLAND_FS_SERVER
334 class BlockPrefetcher {
335 public:
336 								BlockPrefetcher(block_cache* cache, off_t fBlockNumber,
337 									size_t numBlocks);
338 								~BlockPrefetcher();
339 
340 			status_t			Allocate();
341 			status_t			ReadAsync(MutexLocker& cacheLocker);
342 
NumAllocated()343 			size_t				NumAllocated() { return fNumAllocated; }
344 
345 private:
346 	static	void			_IOFinishedCallback(void* cookie, io_request* request,
347 									status_t status, bool partialTransfer,
348 									generic_size_t bytesTransferred);
349 			void			_IOFinished(status_t status, generic_size_t bytesTransferred);
350 
351 			void				_RemoveAllocated(size_t unbusyCount, size_t removeCount);
352 
353 private:
354 			block_cache* 		fCache;
355 			off_t				fBlockNumber;
356 			size_t				fNumRequested;
357 			size_t				fNumAllocated;
358 			cached_block** 		fBlocks;
359 			generic_io_vec* 	fDestVecs;
360 };
361 #endif // !BUILDING_USERLAND_FS_SERVER
362 
363 
364 class TransactionLocking {
365 public:
Lock(block_cache * cache)366 	inline bool Lock(block_cache* cache)
367 	{
368 		mutex_lock(&cache->lock);
369 
370 		while (cache->busy_writing_count != 0) {
371 			// wait for all blocks to be written
372 			ConditionVariableEntry entry;
373 			cache->busy_writing_condition.Add(&entry);
374 			cache->busy_writing_waiters = true;
375 
376 			mutex_unlock(&cache->lock);
377 
378 			entry.Wait();
379 
380 			mutex_lock(&cache->lock);
381 		}
382 
383 		return true;
384 	}
385 
Unlock(block_cache * cache)386 	inline void Unlock(block_cache* cache)
387 	{
388 		mutex_unlock(&cache->lock);
389 	}
390 };
391 
392 typedef AutoLocker<block_cache, TransactionLocking> TransactionLocker;
393 
394 } // namespace
395 
396 
397 #if BLOCK_CACHE_BLOCK_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
398 namespace BlockTracing {
399 
400 class Action : public AbstractTraceEntry {
401 public:
Action(block_cache * cache,cached_block * block)402 	Action(block_cache* cache, cached_block* block)
403 		:
404 		fCache(cache),
405 		fBlockNumber(block->block_number),
406 		fIsDirty(block->is_dirty),
407 		fHasOriginal(block->original_data != NULL),
408 		fHasParent(block->parent_data != NULL),
409 		fTransactionID(-1),
410 		fPreviousID(-1)
411 	{
412 		if (block->transaction != NULL)
413 			fTransactionID = block->transaction->id;
414 		if (block->previous_transaction != NULL)
415 			fPreviousID = block->previous_transaction->id;
416 	}
417 
AddDump(TraceOutput & out)418 	virtual void AddDump(TraceOutput& out)
419 	{
420 		out.Print("block cache %p, %s %" B_PRIu64 ", %c%c%c transaction %" B_PRId32
421 			" (previous id %" B_PRId32 ")\n", fCache, _Action(), fBlockNumber,
422 			fIsDirty ? 'd' : '-', fHasOriginal ? 'o' : '-',
423 			fHasParent ? 'p' : '-', fTransactionID, fPreviousID);
424 	}
425 
426 	virtual const char* _Action() const = 0;
427 
428 private:
429 	block_cache*		fCache;
430 	uint64				fBlockNumber;
431 	bool				fIsDirty;
432 	bool				fHasOriginal;
433 	bool				fHasParent;
434 	int32				fTransactionID;
435 	int32				fPreviousID;
436 };
437 
438 class Get : public Action {
439 public:
Get(block_cache * cache,cached_block * block)440 	Get(block_cache* cache, cached_block* block)
441 		:
442 		Action(cache, block)
443 	{
444 		Initialized();
445 	}
446 
_Action() const447 	virtual const char* _Action() const { return "get"; }
448 };
449 
450 class Put : public Action {
451 public:
Put(block_cache * cache,cached_block * block)452 	Put(block_cache* cache, cached_block* block)
453 		:
454 		Action(cache, block)
455 	{
456 		Initialized();
457 	}
458 
_Action() const459 	virtual const char* _Action() const { return "put"; }
460 };
461 
462 class Read : public Action {
463 public:
Read(block_cache * cache,cached_block * block)464 	Read(block_cache* cache, cached_block* block)
465 		:
466 		Action(cache, block)
467 	{
468 		Initialized();
469 	}
470 
_Action() const471 	virtual const char* _Action() const { return "read"; }
472 };
473 
474 class Write : public Action {
475 public:
Write(block_cache * cache,cached_block * block)476 	Write(block_cache* cache, cached_block* block)
477 		:
478 		Action(cache, block)
479 	{
480 		Initialized();
481 	}
482 
_Action() const483 	virtual const char* _Action() const { return "write"; }
484 };
485 
486 class Flush : public Action {
487 public:
Flush(block_cache * cache,cached_block * block,bool getUnused=false)488 	Flush(block_cache* cache, cached_block* block, bool getUnused = false)
489 		:
490 		Action(cache, block),
491 		fGetUnused(getUnused)
492 	{
493 		Initialized();
494 	}
495 
_Action() const496 	virtual const char* _Action() const
497 		{ return fGetUnused ? "get-unused" : "flush"; }
498 
499 private:
500 	bool	fGetUnused;
501 };
502 
503 class Error : public AbstractTraceEntry {
504 public:
Error(block_cache * cache,uint64 blockNumber,const char * message,status_t status=B_OK)505 	Error(block_cache* cache, uint64 blockNumber, const char* message,
506 			status_t status = B_OK)
507 		:
508 		fCache(cache),
509 		fBlockNumber(blockNumber),
510 		fMessage(message),
511 		fStatus(status)
512 	{
513 		Initialized();
514 	}
515 
AddDump(TraceOutput & out)516 	virtual void AddDump(TraceOutput& out)
517 	{
518 		out.Print("block cache %p, error %" B_PRIu64 ", %s%s%s",
519 			fCache, fBlockNumber, fMessage, fStatus != B_OK ? ": " : "",
520 			fStatus != B_OK ? strerror(fStatus) : "");
521 	}
522 
523 private:
524 	block_cache*	fCache;
525 	uint64			fBlockNumber;
526 	const char*		fMessage;
527 	status_t		fStatus;
528 };
529 
530 #if BLOCK_CACHE_BLOCK_TRACING >= 2
531 class BlockData : public AbstractTraceEntry {
532 public:
533 	enum {
534 		kCurrent	= 0x01,
535 		kParent		= 0x02,
536 		kOriginal	= 0x04
537 	};
538 
BlockData(block_cache * cache,cached_block * block,const char * message)539 	BlockData(block_cache* cache, cached_block* block, const char* message)
540 		:
541 		fCache(cache),
542 		fSize(cache->block_size),
543 		fBlockNumber(block->block_number),
544 		fMessage(message)
545 	{
546 		_Allocate(fCurrent, block->current_data);
547 		_Allocate(fParent, block->parent_data);
548 		_Allocate(fOriginal, block->original_data);
549 
550 #if KTRACE_PRINTF_STACK_TRACE
551 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
552 			false);
553 #endif
554 
555 		Initialized();
556 	}
557 
AddDump(TraceOutput & out)558 	virtual void AddDump(TraceOutput& out)
559 	{
560 		out.Print("block cache %p, block %" B_PRIu64 ", data %c%c%c: %s",
561 			fCache, fBlockNumber, fCurrent != NULL ? 'c' : '-',
562 			fParent != NULL ? 'p' : '-', fOriginal != NULL ? 'o' : '-',
563 			fMessage);
564 	}
565 
566 #if KTRACE_PRINTF_STACK_TRACE
DumpStackTrace(TraceOutput & out)567 	virtual void DumpStackTrace(TraceOutput& out)
568 	{
569 		out.PrintStackTrace(fStackTrace);
570 	}
571 #endif
572 
DumpBlocks(uint32 which,uint32 offset,uint32 size)573 	void DumpBlocks(uint32 which, uint32 offset, uint32 size)
574 	{
575 		if ((which & kCurrent) != 0)
576 			DumpBlock(kCurrent, offset, size);
577 		if ((which & kParent) != 0)
578 			DumpBlock(kParent, offset, size);
579 		if ((which & kOriginal) != 0)
580 			DumpBlock(kOriginal, offset, size);
581 	}
582 
DumpBlock(uint32 which,uint32 offset,uint32 size)583 	void DumpBlock(uint32 which, uint32 offset, uint32 size)
584 	{
585 		if (offset > fSize) {
586 			kprintf("invalid offset (block size %" B_PRIu32 ")\n", fSize);
587 			return;
588 		}
589 		if (offset + size > fSize)
590 			size = fSize - offset;
591 
592 		const char* label;
593 		uint8* data;
594 
595 		if ((which & kCurrent) != 0) {
596 			label = "current";
597 			data = fCurrent;
598 		} else if ((which & kParent) != 0) {
599 			label = "parent";
600 			data = fParent;
601 		} else if ((which & kOriginal) != 0) {
602 			label = "original";
603 			data = fOriginal;
604 		} else
605 			return;
606 
607 		kprintf("%s: offset %" B_PRIu32 ", %" B_PRIu32 " bytes\n", label, offset, size);
608 
609 		static const uint32 kBlockSize = 16;
610 		data += offset;
611 
612 		for (uint32 i = 0; i < size;) {
613 			int start = i;
614 
615 			kprintf("  %04" B_PRIx32 " ", i);
616 			for (; i < start + kBlockSize; i++) {
617 				if (!(i % 4))
618 					kprintf(" ");
619 
620 				if (i >= size)
621 					kprintf("  ");
622 				else
623 					kprintf("%02x", data[i]);
624 			}
625 
626 			kprintf("\n");
627 		}
628 	}
629 
630 private:
_Allocate(uint8 * & target,void * source)631 	void _Allocate(uint8*& target, void* source)
632 	{
633 		if (source == NULL) {
634 			target = NULL;
635 			return;
636 		}
637 
638 		target = alloc_tracing_buffer_memcpy(source, fSize, false);
639 	}
640 
641 	block_cache*	fCache;
642 	uint32			fSize;
643 	uint64			fBlockNumber;
644 	const char*		fMessage;
645 	uint8*			fCurrent;
646 	uint8*			fParent;
647 	uint8*			fOriginal;
648 #if KTRACE_PRINTF_STACK_TRACE
649 	tracing_stack_trace* fStackTrace;
650 #endif
651 };
652 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
653 
654 }	// namespace BlockTracing
655 
656 #	define TB(x) new(std::nothrow) BlockTracing::x;
657 #else
658 #	define TB(x) ;
659 #endif
660 
661 #if BLOCK_CACHE_BLOCK_TRACING >= 2
662 #	define TB2(x) new(std::nothrow) BlockTracing::x;
663 #else
664 #	define TB2(x) ;
665 #endif
666 
667 
668 #if BLOCK_CACHE_TRANSACTION_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
669 namespace TransactionTracing {
670 
671 class Action : public AbstractTraceEntry {
672 public:
Action(const char * label,block_cache * cache,cache_transaction * transaction)673 	Action(const char* label, block_cache* cache,
674 			cache_transaction* transaction)
675 		:
676 		fCache(cache),
677 		fTransaction(transaction),
678 		fID(transaction->id),
679 		fSub(transaction->has_sub_transaction),
680 		fNumBlocks(transaction->num_blocks),
681 		fSubNumBlocks(transaction->sub_num_blocks)
682 	{
683 		strlcpy(fLabel, label, sizeof(fLabel));
684 		Initialized();
685 	}
686 
AddDump(TraceOutput & out)687 	virtual void AddDump(TraceOutput& out)
688 	{
689 		out.Print("block cache %p, %s transaction %p (id %" B_PRId32 ")%s"
690 			", %" B_PRId32 "/%" B_PRId32 " blocks", fCache, fLabel, fTransaction,
691 			fID, fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
692 	}
693 
694 private:
695 	char				fLabel[12];
696 	block_cache*		fCache;
697 	cache_transaction*	fTransaction;
698 	int32				fID;
699 	bool				fSub;
700 	int32				fNumBlocks;
701 	int32				fSubNumBlocks;
702 };
703 
704 class Detach : public AbstractTraceEntry {
705 public:
Detach(block_cache * cache,cache_transaction * transaction,cache_transaction * newTransaction)706 	Detach(block_cache* cache, cache_transaction* transaction,
707 			cache_transaction* newTransaction)
708 		:
709 		fCache(cache),
710 		fTransaction(transaction),
711 		fID(transaction->id),
712 		fSub(transaction->has_sub_transaction),
713 		fNewTransaction(newTransaction),
714 		fNewID(newTransaction->id)
715 	{
716 		Initialized();
717 	}
718 
AddDump(TraceOutput & out)719 	virtual void AddDump(TraceOutput& out)
720 	{
721 		out.Print("block cache %p, detach transaction %p (id %" B_PRId32 ")"
722 			"from transaction %p (id %" B_PRId32 ")%s",
723 			fCache, fNewTransaction, fNewID, fTransaction, fID,
724 			fSub ? " sub" : "");
725 	}
726 
727 private:
728 	block_cache*		fCache;
729 	cache_transaction*	fTransaction;
730 	int32				fID;
731 	bool				fSub;
732 	cache_transaction*	fNewTransaction;
733 	int32				fNewID;
734 };
735 
736 class Abort : public AbstractTraceEntry {
737 public:
Abort(block_cache * cache,cache_transaction * transaction)738 	Abort(block_cache* cache, cache_transaction* transaction)
739 		:
740 		fCache(cache),
741 		fTransaction(transaction),
742 		fID(transaction->id),
743 		fNumBlocks(0)
744 	{
745 		bool isSub = transaction->has_sub_transaction;
746 		fNumBlocks = isSub ? transaction->sub_num_blocks
747 			: transaction->num_blocks;
748 		fBlocks = (off_t*)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
749 		if (fBlocks != NULL) {
750 			cached_block* block = transaction->first_block;
751 			for (int32 i = 0; block != NULL && i < fNumBlocks;
752 					block = block->transaction_next) {
753 				fBlocks[i++] = block->block_number;
754 			}
755 		} else
756 			fNumBlocks = 0;
757 
758 #if KTRACE_PRINTF_STACK_TRACE
759 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
760 			false);
761 #endif
762 
763 		Initialized();
764 	}
765 
AddDump(TraceOutput & out)766 	virtual void AddDump(TraceOutput& out)
767 	{
768 		out.Print("block cache %p, abort transaction "
769 			"%p (id %" B_PRId32 "), blocks", fCache, fTransaction, fID);
770 		for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
771 			out.Print(" %" B_PRIdOFF, fBlocks[i]);
772 	}
773 
774 #if KTRACE_PRINTF_STACK_TRACE
DumpStackTrace(TraceOutput & out)775 	virtual void DumpStackTrace(TraceOutput& out)
776 	{
777 		out.PrintStackTrace(fStackTrace);
778 	}
779 #endif
780 
781 private:
782 	block_cache*		fCache;
783 	cache_transaction*	fTransaction;
784 	int32				fID;
785 	off_t*				fBlocks;
786 	int32				fNumBlocks;
787 #if KTRACE_PRINTF_STACK_TRACE
788 	tracing_stack_trace* fStackTrace;
789 #endif
790 };
791 
792 }	// namespace TransactionTracing
793 
794 #	define T(x) new(std::nothrow) TransactionTracing::x;
795 #else
796 #	define T(x) ;
797 #endif
798 
799 
800 static DoublyLinkedList<block_cache> sCaches;
801 static mutex sCachesLock = MUTEX_INITIALIZER("block caches");
802 static mutex sCachesMemoryUseLock
803 	= MUTEX_INITIALIZER("block caches memory use");
804 static size_t sUsedMemory;
805 static sem_id sEventSemaphore;
806 static mutex sNotificationsLock
807 	= MUTEX_INITIALIZER("block cache notifications");
808 static thread_id sNotifierWriterThread;
809 static DoublyLinkedListLink<block_cache> sMarkCache;
810 	// TODO: this only works if the link is the first entry of block_cache
811 static object_cache* sBlockCache;
812 
813 
814 static void mark_block_busy_reading(block_cache* cache, cached_block* block);
815 static void mark_block_unbusy_reading(block_cache* cache, cached_block* block);
816 
817 
818 //	#pragma mark - notifications/listener
819 
820 
821 /*!	Checks whether or not this is an event that closes a transaction. */
822 static inline bool
is_closing_event(int32 event)823 is_closing_event(int32 event)
824 {
825 	return (event & (TRANSACTION_ABORTED | TRANSACTION_ENDED)) != 0;
826 }
827 
828 
829 static inline bool
is_written_event(int32 event)830 is_written_event(int32 event)
831 {
832 	return (event & TRANSACTION_WRITTEN) != 0;
833 }
834 
835 
836 /*!	From the specified \a notification, it will remove the lowest pending
837 	event, and return that one in \a _event.
838 	If there is no pending event anymore, it will return \c false.
839 */
840 static bool
get_next_pending_event(cache_notification * notification,int32 * _event)841 get_next_pending_event(cache_notification* notification, int32* _event)
842 {
843 	for (int32 eventMask = 1; eventMask <= TRANSACTION_IDLE; eventMask <<= 1) {
844 		int32 pending = atomic_and(&notification->events_pending,
845 			~eventMask);
846 
847 		bool more = (pending & ~eventMask) != 0;
848 
849 		if ((pending & eventMask) != 0) {
850 			*_event = eventMask;
851 			return more;
852 		}
853 	}
854 
855 	return false;
856 }
857 
858 
859 static void
flush_pending_notifications(block_cache * cache)860 flush_pending_notifications(block_cache* cache)
861 {
862 	ASSERT_LOCKED_MUTEX(&sCachesLock);
863 
864 	while (true) {
865 		MutexLocker locker(sNotificationsLock);
866 
867 		cache_notification* notification = cache->pending_notifications.Head();
868 		if (notification == NULL)
869 			return;
870 
871 		bool deleteAfterEvent = false;
872 		int32 event = -1;
873 		if (!get_next_pending_event(notification, &event)) {
874 			// remove the notification if this was the last pending event
875 			cache->pending_notifications.Remove(notification);
876 			deleteAfterEvent = notification->delete_after_event;
877 		}
878 
879 		if (event >= 0) {
880 			// Notify listener, we need to copy the notification, as it might
881 			// be removed when we unlock the list.
882 			cache_notification copy = *notification;
883 			locker.Unlock();
884 
885 			copy.hook(copy.transaction_id, event, copy.data);
886 
887 			locker.Lock();
888 		}
889 
890 		if (deleteAfterEvent)
891 			delete notification;
892 	}
893 }
894 
895 
896 /*!	Flushes all pending notifications by calling the appropriate hook
897 	functions.
898 	Must not be called with a cache lock held.
899 */
900 static void
flush_pending_notifications()901 flush_pending_notifications()
902 {
903 	MutexLocker _(sCachesLock);
904 
905 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
906 	while (iterator.HasNext()) {
907 		block_cache* cache = iterator.Next();
908 
909 		flush_pending_notifications(cache);
910 	}
911 }
912 
913 
914 /*!	Initializes the \a notification as specified. */
915 static void
set_notification(cache_transaction * transaction,cache_notification & notification,int32 events,transaction_notification_hook hook,void * data)916 set_notification(cache_transaction* transaction,
917 	cache_notification &notification, int32 events,
918 	transaction_notification_hook hook, void* data)
919 {
920 	notification.transaction_id = transaction != NULL ? transaction->id : -1;
921 	notification.events_pending = 0;
922 	notification.events = events;
923 	notification.hook = hook;
924 	notification.data = data;
925 	notification.delete_after_event = false;
926 }
927 
928 
929 /*!	Makes sure the notification is deleted. It either deletes it directly,
930 	when possible, or marks it for deletion if the notification is pending.
931 */
932 static void
delete_notification(cache_notification * notification)933 delete_notification(cache_notification* notification)
934 {
935 	MutexLocker locker(sNotificationsLock);
936 
937 	if (notification->events_pending != 0)
938 		notification->delete_after_event = true;
939 	else
940 		delete notification;
941 }
942 
943 
944 /*!	Adds the notification to the pending notifications list, or, if it's
945 	already part of it, updates its events_pending field.
946 	Also marks the notification to be deleted if \a deleteNotification
947 	is \c true.
948 	Triggers the notifier thread to run.
949 */
950 static void
add_notification(block_cache * cache,cache_notification * notification,int32 event,bool deleteNotification)951 add_notification(block_cache* cache, cache_notification* notification,
952 	int32 event, bool deleteNotification)
953 {
954 	if (notification->hook == NULL)
955 		return;
956 
957 	int32 pending = atomic_or(&notification->events_pending, event);
958 	if (pending == 0) {
959 		// not yet part of the notification list
960 		MutexLocker locker(sNotificationsLock);
961 		if (deleteNotification)
962 			notification->delete_after_event = true;
963 		cache->pending_notifications.Add(notification);
964 	} else if (deleteNotification) {
965 		// we might need to delete it ourselves if we're late
966 		delete_notification(notification);
967 	}
968 
969 	release_sem_etc(sEventSemaphore, 1, B_DO_NOT_RESCHEDULE);
970 		// We're probably still holding some locks that makes rescheduling
971 		// not a good idea at this point.
972 }
973 
974 
975 /*!	Notifies all interested listeners of this transaction about the \a event.
976 	If \a event is a closing event (ie. TRANSACTION_ENDED, and
977 	TRANSACTION_ABORTED), all listeners except those listening to
978 	TRANSACTION_WRITTEN will be removed.
979 */
980 static void
notify_transaction_listeners(block_cache * cache,cache_transaction * transaction,int32 event)981 notify_transaction_listeners(block_cache* cache, cache_transaction* transaction,
982 	int32 event)
983 {
984 	T(Action("notify", cache, transaction));
985 
986 	bool isClosing = is_closing_event(event);
987 	bool isWritten = is_written_event(event);
988 
989 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
990 	while (iterator.HasNext()) {
991 		cache_listener* listener = iterator.Next();
992 
993 		bool remove = (isClosing && !is_written_event(listener->events))
994 			|| (isWritten && is_written_event(listener->events));
995 		if (remove)
996 			iterator.Remove();
997 
998 		if ((listener->events & event) != 0)
999 			add_notification(cache, listener, event, remove);
1000 		else if (remove)
1001 			delete_notification(listener);
1002 	}
1003 }
1004 
1005 
1006 /*!	Removes and deletes all listeners that are still monitoring this
1007 	transaction.
1008 */
1009 static void
remove_transaction_listeners(block_cache * cache,cache_transaction * transaction)1010 remove_transaction_listeners(block_cache* cache, cache_transaction* transaction)
1011 {
1012 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
1013 	while (iterator.HasNext()) {
1014 		cache_listener* listener = iterator.Next();
1015 		iterator.Remove();
1016 
1017 		delete_notification(listener);
1018 	}
1019 }
1020 
1021 
1022 static status_t
add_transaction_listener(block_cache * cache,cache_transaction * transaction,int32 events,transaction_notification_hook hookFunction,void * data)1023 add_transaction_listener(block_cache* cache, cache_transaction* transaction,
1024 	int32 events, transaction_notification_hook hookFunction, void* data)
1025 {
1026 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
1027 	while (iterator.HasNext()) {
1028 		cache_listener* listener = iterator.Next();
1029 
1030 		if (listener->data == data && listener->hook == hookFunction) {
1031 			// this listener already exists, just update it
1032 			listener->events |= events;
1033 			return B_OK;
1034 		}
1035 	}
1036 
1037 	cache_listener* listener = new cache_listener;
1038 	if (listener == NULL)
1039 		return B_NO_MEMORY;
1040 
1041 	set_notification(transaction, *listener, events, hookFunction, data);
1042 	transaction->listeners.Add(listener);
1043 	return B_OK;
1044 }
1045 
1046 
1047 //	#pragma mark - private transaction
1048 
1049 
cache_transaction()1050 cache_transaction::cache_transaction()
1051 {
1052 	num_blocks = 0;
1053 	main_num_blocks = 0;
1054 	sub_num_blocks = 0;
1055 	first_block = NULL;
1056 	open = true;
1057 	has_sub_transaction = false;
1058 	last_used = system_time();
1059 	busy_writing_count = 0;
1060 }
1061 
1062 
1063 static void
delete_transaction(block_cache * cache,cache_transaction * transaction)1064 delete_transaction(block_cache* cache, cache_transaction* transaction)
1065 {
1066 	if (cache->last_transaction == transaction)
1067 		cache->last_transaction = NULL;
1068 
1069 	remove_transaction_listeners(cache, transaction);
1070 	delete transaction;
1071 }
1072 
1073 
1074 static cache_transaction*
lookup_transaction(block_cache * cache,int32 id)1075 lookup_transaction(block_cache* cache, int32 id)
1076 {
1077 	return cache->transaction_hash->Lookup(id);
1078 }
1079 
1080 
Hash(cache_transaction * transaction) const1081 size_t TransactionHash::Hash(cache_transaction* transaction) const
1082 {
1083 	return transaction->id;
1084 }
1085 
1086 
Compare(int32 key,cache_transaction * transaction) const1087 bool TransactionHash::Compare(int32 key, cache_transaction* transaction) const
1088 {
1089 	return transaction->id == key;
1090 }
1091 
1092 
GetLink(cache_transaction * value) const1093 cache_transaction*& TransactionHash::GetLink(cache_transaction* value) const
1094 {
1095 	return value->next;
1096 }
1097 
1098 
1099 /*!	Writes back any changes made to blocks in \a transaction that are still
1100 	part of a previous transacton.
1101 */
1102 static status_t
write_blocks_in_previous_transaction(block_cache * cache,cache_transaction * transaction)1103 write_blocks_in_previous_transaction(block_cache* cache,
1104 	cache_transaction* transaction)
1105 {
1106 	BlockWriter writer(cache);
1107 
1108 	cached_block* block = transaction->first_block;
1109 	for (; block != NULL; block = block->transaction_next) {
1110 		if (block->previous_transaction != NULL) {
1111 			// need to write back pending changes
1112 			writer.Add(block);
1113 		}
1114 	}
1115 
1116 	return writer.Write();
1117 }
1118 
1119 
1120 //	#pragma mark - cached_block
1121 
1122 
1123 bool
CanBeWritten() const1124 cached_block::CanBeWritten() const
1125 {
1126 	return !busy_writing && !busy_reading
1127 		&& (previous_transaction != NULL
1128 			|| (transaction == NULL && is_dirty && !is_writing));
1129 }
1130 
1131 
1132 //	#pragma mark - BlockWriter
1133 
1134 
BlockWriter(block_cache * cache,size_t max)1135 BlockWriter::BlockWriter(block_cache* cache, size_t max)
1136 	:
1137 	fCache(cache),
1138 	fBlocks(fBuffer),
1139 	fCount(0),
1140 	fTotal(0),
1141 	fCapacity(kBufferSize),
1142 	fMax(max),
1143 	fStatus(B_OK),
1144 	fDeletedTransaction(false)
1145 {
1146 }
1147 
1148 
~BlockWriter()1149 BlockWriter::~BlockWriter()
1150 {
1151 	if (fBlocks != fBuffer)
1152 		free(fBlocks);
1153 }
1154 
1155 
1156 /*!	Adds the specified block to the to be written array. If no more blocks can
1157 	be added, false is returned, otherwise true.
1158 */
1159 bool
Add(cached_block * block,cache_transaction * transaction)1160 BlockWriter::Add(cached_block* block, cache_transaction* transaction)
1161 {
1162 	ASSERT(block->CanBeWritten());
1163 
1164 	if (fTotal == fMax)
1165 		return false;
1166 
1167 	if (fCount >= fCapacity) {
1168 		// Enlarge array if necessary
1169 		cached_block** newBlocks;
1170 		size_t newCapacity = max_c(256, fCapacity * 2);
1171 		if (fBlocks == fBuffer)
1172 			newBlocks = (cached_block**)malloc(newCapacity * sizeof(void*));
1173 		else {
1174 			newBlocks = (cached_block**)realloc(fBlocks,
1175 				newCapacity * sizeof(void*));
1176 		}
1177 
1178 		if (newBlocks == NULL) {
1179 			// Allocating a larger array failed - we need to write back what
1180 			// we have synchronously now (this will also clear the array)
1181 			Write(transaction, false);
1182 		} else {
1183 			if (fBlocks == fBuffer)
1184 				memcpy(newBlocks, fBuffer, kBufferSize * sizeof(void*));
1185 
1186 			fBlocks = newBlocks;
1187 			fCapacity = newCapacity;
1188 		}
1189 	}
1190 
1191 	fBlocks[fCount++] = block;
1192 	fTotal++;
1193 	block->busy_writing = true;
1194 	fCache->busy_writing_count++;
1195 	if (block->previous_transaction != NULL)
1196 		block->previous_transaction->busy_writing_count++;
1197 
1198 	return true;
1199 }
1200 
1201 
1202 /*!	Adds all blocks of the specified transaction to the to be written array.
1203 	If no more blocks can be added, false is returned, otherwise true.
1204 */
1205 bool
Add(cache_transaction * transaction,bool & hasLeftOvers)1206 BlockWriter::Add(cache_transaction* transaction, bool& hasLeftOvers)
1207 {
1208 	ASSERT(!transaction->open);
1209 
1210 	if (transaction->busy_writing_count != 0) {
1211 		hasLeftOvers = true;
1212 		return true;
1213 	}
1214 
1215 	hasLeftOvers = false;
1216 
1217 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
1218 	while (cached_block* block = blockIterator.Next()) {
1219 		if (!block->CanBeWritten()) {
1220 			// This block was already part of a previous transaction within this
1221 			// writer
1222 			hasLeftOvers = true;
1223 			continue;
1224 		}
1225 		if (!Add(block, transaction))
1226 			return false;
1227 
1228 		if (DeletedTransaction())
1229 			break;
1230 	}
1231 
1232 	return true;
1233 }
1234 
1235 
1236 /*! Cache must be locked when calling this method, but it will be unlocked
1237 	while the blocks are written back.
1238 */
1239 status_t
Write(cache_transaction * transaction,bool canUnlock)1240 BlockWriter::Write(cache_transaction* transaction, bool canUnlock)
1241 {
1242 	if (fCount == 0)
1243 		return B_OK;
1244 
1245 	if (canUnlock)
1246 		mutex_unlock(&fCache->lock);
1247 
1248 	// Sort blocks in their on-disk order, so we can merge consecutive writes.
1249 	qsort(fBlocks, fCount, sizeof(void*), &_CompareBlocks);
1250 	fDeletedTransaction = false;
1251 
1252 	bigtime_t start = system_time();
1253 
1254 	for (uint32 i = 0; i < fCount; i++) {
1255 		uint32 blocks = 1;
1256 		for (; (i + blocks) < fCount && blocks < IOV_MAX; blocks++) {
1257 			const uint32 j = i + blocks;
1258 			if (fBlocks[j]->block_number != (fBlocks[j - 1]->block_number + 1))
1259 				break;
1260 		}
1261 
1262 		status_t status = _WriteBlocks(fBlocks + i, blocks);
1263 		if (status != B_OK) {
1264 			// propagate to global error handling
1265 			if (fStatus == B_OK)
1266 				fStatus = status;
1267 
1268 			for (uint32 j = i; j < (i + blocks); j++) {
1269 				_UnmarkWriting(fBlocks[j]);
1270 				fBlocks[j] = NULL;
1271 					// This block will not be marked clean
1272 			}
1273 		}
1274 
1275 		i += (blocks - 1);
1276 	}
1277 
1278 	bigtime_t finish = system_time();
1279 
1280 	if (canUnlock)
1281 		mutex_lock(&fCache->lock);
1282 
1283 	if (fStatus == B_OK && fCount >= 8) {
1284 		fCache->last_block_write = finish;
1285 		fCache->last_block_write_duration = (fCache->last_block_write - start)
1286 			/ fCount;
1287 	}
1288 
1289 	for (uint32 i = 0; i < fCount; i++)
1290 		_BlockDone(fBlocks[i], transaction);
1291 
1292 	fCount = 0;
1293 	return fStatus;
1294 }
1295 
1296 
1297 /*!	Writes the specified \a block back to disk. It will always only write back
1298 	the oldest change of the block if it is part of more than one transaction.
1299 	It will automatically send out TRANSACTION_WRITTEN notices, as well as
1300 	delete transactions when they are no longer used, and \a deleteTransaction
1301 	is \c true.
1302 */
1303 /*static*/ status_t
WriteBlock(block_cache * cache,cached_block * block)1304 BlockWriter::WriteBlock(block_cache* cache, cached_block* block)
1305 {
1306 	BlockWriter writer(cache);
1307 
1308 	writer.Add(block);
1309 	return writer.Write();
1310 }
1311 
1312 
1313 void*
_Data(cached_block * block) const1314 BlockWriter::_Data(cached_block* block) const
1315 {
1316 	return block->previous_transaction != NULL && block->original_data != NULL
1317 		? block->original_data : block->current_data;
1318 		// We first need to write back changes from previous transactions
1319 }
1320 
1321 
1322 status_t
_WriteBlocks(cached_block ** blocks,uint32 count)1323 BlockWriter::_WriteBlocks(cached_block** blocks, uint32 count)
1324 {
1325 	const size_t blockSize = fCache->block_size;
1326 
1327 	BStackOrHeapArray<iovec, 8> vecs(count);
1328 	for (uint32 i = 0; i < count; i++) {
1329 		cached_block* block = blocks[i];
1330 		ASSERT(block->busy_writing);
1331 		ASSERT(i == 0 || block->block_number == (blocks[i - 1]->block_number + 1));
1332 
1333 		TRACE(("BlockWriter::_WriteBlocks(block %" B_PRIdOFF ", count %" B_PRIu32 ")\n",
1334 			block->block_number, count));
1335 		TB(Write(fCache, block));
1336 		TB2(BlockData(fCache, block, "before write"));
1337 
1338 		vecs[i].iov_base = _Data(block);
1339 		vecs[i].iov_len = blockSize;
1340 	}
1341 
1342 	ssize_t written = writev_pos(fCache->fd,
1343 		blocks[0]->block_number * blockSize, vecs, count);
1344 
1345 	if (written != (ssize_t)(blockSize * count)) {
1346 		TB(Error(fCache, block->block_number, "write failed", written));
1347 		TRACE_ALWAYS("could not write back %" B_PRIu32 " blocks (start block %" B_PRIdOFF "): %s\n",
1348 			count, blocks[0]->block_number, strerror(errno));
1349 		if (written < 0)
1350 			return errno;
1351 
1352 		return B_IO_ERROR;
1353 	}
1354 
1355 	return B_OK;
1356 }
1357 
1358 
1359 void
_BlockDone(cached_block * block,cache_transaction * transaction)1360 BlockWriter::_BlockDone(cached_block* block,
1361 	cache_transaction* transaction)
1362 {
1363 	if (block == NULL) {
1364 		// An error occured when trying to write this block
1365 		return;
1366 	}
1367 
1368 	if (fCache->num_dirty_blocks > 0)
1369 		fCache->num_dirty_blocks--;
1370 
1371 	if (_Data(block) == block->current_data)
1372 		block->is_dirty = false;
1373 
1374 	_UnmarkWriting(block);
1375 
1376 	cache_transaction* previous = block->previous_transaction;
1377 	if (previous != NULL) {
1378 		previous->blocks.Remove(block);
1379 		block->previous_transaction = NULL;
1380 
1381 		if (block->original_data != NULL && block->transaction == NULL) {
1382 			// This block is not part of a transaction, so it does not need
1383 			// its original pointer anymore.
1384 			fCache->Free(block->original_data);
1385 			block->original_data = NULL;
1386 		}
1387 
1388 		// Has the previous transaction been finished with that write?
1389 		if (--previous->num_blocks == 0) {
1390 			TRACE(("cache transaction %" B_PRId32 " finished!\n", previous->id));
1391 			T(Action("written", fCache, previous));
1392 
1393 			notify_transaction_listeners(fCache, previous,
1394 				TRANSACTION_WRITTEN);
1395 
1396 			if (transaction != NULL) {
1397 				// This function is called while iterating transaction_hash. We
1398 				// use RemoveUnchecked so the iterator is still valid. A regular
1399 				// Remove can trigger a resize of the hash table which would
1400 				// result in the linked items in the table changing order.
1401 				fCache->transaction_hash->RemoveUnchecked(transaction);
1402 			} else
1403 				fCache->transaction_hash->Remove(previous);
1404 
1405 			delete_transaction(fCache, previous);
1406 			fDeletedTransaction = true;
1407 		}
1408 	}
1409 	if (block->transaction == NULL && block->ref_count == 0 && !block->unused) {
1410 		// the block is no longer used
1411 		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1412 		block->unused = true;
1413 		fCache->unused_blocks.Add(block);
1414 		fCache->unused_block_count++;
1415 	}
1416 
1417 	TB2(BlockData(fCache, block, "after write"));
1418 }
1419 
1420 
1421 void
_UnmarkWriting(cached_block * block)1422 BlockWriter::_UnmarkWriting(cached_block* block)
1423 {
1424 	block->busy_writing = false;
1425 	if (block->previous_transaction != NULL)
1426 		block->previous_transaction->busy_writing_count--;
1427 	fCache->busy_writing_count--;
1428 
1429 	if ((fCache->busy_writing_waiters && fCache->busy_writing_count == 0)
1430 		|| block->busy_writing_waiters) {
1431 		fCache->busy_writing_waiters = false;
1432 		block->busy_writing_waiters = false;
1433 		fCache->busy_writing_condition.NotifyAll();
1434 	}
1435 }
1436 
1437 
1438 /*static*/ int
_CompareBlocks(const void * _blockA,const void * _blockB)1439 BlockWriter::_CompareBlocks(const void* _blockA, const void* _blockB)
1440 {
1441 	cached_block* blockA = *(cached_block**)_blockA;
1442 	cached_block* blockB = *(cached_block**)_blockB;
1443 
1444 	off_t diff = blockA->block_number - blockB->block_number;
1445 	if (diff > 0)
1446 		return 1;
1447 
1448 	return diff < 0 ? -1 : 0;
1449 }
1450 
1451 
1452 #ifndef BUILDING_USERLAND_FS_SERVER
1453 //	#pragma mark - BlockPrefetcher
1454 
1455 
BlockPrefetcher(block_cache * cache,off_t blockNumber,size_t numBlocks)1456 BlockPrefetcher::BlockPrefetcher(block_cache* cache, off_t blockNumber, size_t numBlocks)
1457 	:
1458 	fCache(cache),
1459 	fBlockNumber(blockNumber),
1460 	fNumRequested(numBlocks),
1461 	fNumAllocated(0)
1462 {
1463 	fBlocks = new cached_block*[numBlocks];
1464 	fDestVecs = new generic_io_vec[numBlocks];
1465 }
1466 
1467 
~BlockPrefetcher()1468 BlockPrefetcher::~BlockPrefetcher()
1469 {
1470 	delete[] fBlocks;
1471 	delete[] fDestVecs;
1472 }
1473 
1474 
1475 /*!	Allocates cached_block objects in preparation for prefetching.
1476 	@return If an error is returned, then no blocks have been allocated.
1477 	@post Blocks have been constructed (including allocating the current_data member)
1478 	but current_data is uninitialized.
1479 */
1480 status_t
Allocate()1481 BlockPrefetcher::Allocate()
1482 {
1483 	TRACE(("BlockPrefetcher::Allocate: looking up %" B_PRIuSIZE " blocks, starting with %"
1484 		B_PRIdOFF "\n", fNumBlocks, fBlockNumber));
1485 
1486 	ASSERT_LOCKED_MUTEX(&fCache->lock);
1487 
1488 	size_t finalNumBlocks = fNumRequested;
1489 
1490 	// determine whether any requested blocks are already cached
1491 	for (size_t i = 0; i < fNumRequested; ++i) {
1492 		off_t blockNumIter = fBlockNumber + i;
1493 		if (blockNumIter < 0 || blockNumIter >= fCache->max_blocks) {
1494 			panic("BlockPrefetcher::Allocate: invalid block number %" B_PRIdOFF " (max %"
1495 				B_PRIdOFF ")", blockNumIter, fCache->max_blocks - 1);
1496 			return B_BAD_VALUE;
1497 		}
1498 		cached_block* block = fCache->hash->Lookup(blockNumIter);
1499 		if (block != NULL) {
1500 			// truncate the request
1501 			TRACE(("BlockPrefetcher::Allocate: found an existing block (%" B_PRIdOFF ")\n",
1502 				blockNumIter));
1503 			fBlocks[i] = NULL;
1504 			finalNumBlocks = i;
1505 			break;
1506 		}
1507 	}
1508 
1509 	// allocate the blocks
1510 	for (size_t i = 0; i < finalNumBlocks; ++i) {
1511 		cached_block* block = fCache->NewBlock(fBlockNumber + i);
1512 		if (block == NULL) {
1513 			_RemoveAllocated(0, i);
1514 			return B_NO_MEMORY;
1515 		}
1516 		fCache->hash->Insert(block);
1517 
1518 		block->unused = true;
1519 		fCache->unused_blocks.Add(block);
1520 		fCache->unused_block_count++;
1521 
1522 		fBlocks[i] = block;
1523 	}
1524 
1525 	fNumAllocated = finalNumBlocks;
1526 
1527 	return B_OK;
1528 }
1529 
1530 
1531 /*!	Schedules reads from disk to cache.
1532 	\post The calling object will eventually be deleted by IOFinishedCallback.
1533 */
1534 status_t
ReadAsync(MutexLocker & cacheLocker)1535 BlockPrefetcher::ReadAsync(MutexLocker& cacheLocker)
1536 {
1537 	TRACE(("BlockPrefetcher::Read: reading %" B_PRIuSIZE " blocks\n", fNumAllocated));
1538 
1539 	size_t blockSize = fCache->block_size;
1540 	generic_io_vec* vecs = fDestVecs;
1541 	for (size_t i = 0; i < fNumAllocated; ++i) {
1542 		vecs[i].base = reinterpret_cast<generic_addr_t>(fBlocks[i]->current_data);
1543 		vecs[i].length = blockSize;
1544 		mark_block_busy_reading(fCache, fBlocks[i]);
1545 	}
1546 
1547 	IORequest* request = new IORequest;
1548 	status_t status = request->Init(fBlockNumber * blockSize, vecs, fNumAllocated,
1549 		fNumAllocated * blockSize, false, B_DELETE_IO_REQUEST);
1550 	if (status != B_OK) {
1551 		TB(Error(fCache, fBlockNumber, "IORequest::Init starting here failed", status));
1552 		TRACE_ALWAYS("BlockPrefetcher::Read: failed to initialize IO request for %" B_PRIuSIZE
1553 			" blocks starting with %" B_PRIdOFF ": %s\n",
1554 			fNumAllocated, fBlockNumber, strerror(status));
1555 
1556 		_RemoveAllocated(fNumAllocated, fNumAllocated);
1557 		delete request;
1558 		return status;
1559 	}
1560 
1561 	request->SetFinishedCallback(_IOFinishedCallback, this);
1562 
1563 	// do_fd_io() may invoke callbacks directly, so we need to have unlocked the cache.
1564 	cacheLocker.Unlock();
1565 
1566 	return do_fd_io(fCache->fd, request);
1567 }
1568 
1569 
1570 /*static*/ void
_IOFinishedCallback(void * cookie,io_request * request,status_t status,bool partialTransfer,generic_size_t bytesTransferred)1571 BlockPrefetcher::_IOFinishedCallback(void* cookie, io_request* request, status_t status,
1572 	bool partialTransfer, generic_size_t bytesTransferred)
1573 {
1574 	TRACE(("BlockPrefetcher::_IOFinishedCallback: status %s, partial %d\n",
1575 		strerror(status), partialTransfer));
1576 	((BlockPrefetcher*)cookie)->_IOFinished(status, bytesTransferred);
1577 }
1578 
1579 
1580 void
_IOFinished(status_t status,generic_size_t bytesTransferred)1581 BlockPrefetcher::_IOFinished(status_t status, generic_size_t bytesTransferred)
1582 {
1583 	MutexLocker locker(&fCache->lock);
1584 
1585 	if (bytesTransferred < (fNumAllocated * fCache->block_size)) {
1586 		_RemoveAllocated(fNumAllocated, fNumAllocated);
1587 
1588 		TB(Error(cache, fBlockNumber, "prefetch starting here failed", status));
1589 		TRACE_ALWAYS("BlockPrefetcher::_IOFinished: transferred only %" B_PRIuGENADDR
1590 			" bytes in attempt to read %" B_PRIuSIZE " blocks (start block %" B_PRIdOFF "): %s\n",
1591 			bytesTransferred, fNumAllocated, fBlockNumber, strerror(status));
1592 	} else {
1593 		for (size_t i = 0; i < fNumAllocated; i++) {
1594 			TB(Read(cache, fBlockNumber + i));
1595 			mark_block_unbusy_reading(fCache, fBlocks[i]);
1596 			fBlocks[i]->last_accessed = system_time() / 1000000L;
1597 		}
1598 	}
1599 
1600 	delete this;
1601 }
1602 
1603 
1604 /*!	Cleans up blocks that were allocated for prefetching when an in-progress prefetch
1605 	is cancelled.
1606 */
1607 void
_RemoveAllocated(size_t unbusyCount,size_t removeCount)1608 BlockPrefetcher::_RemoveAllocated(size_t unbusyCount, size_t removeCount)
1609 {
1610 	TRACE(("BlockPrefetcher::_RemoveAllocated:  unbusy %" B_PRIuSIZE " and remove %" B_PRIuSIZE
1611 		" starting with %" B_PRIdOFF "\n", unbusyCount, removeCount, (*fBlocks)->block_number));
1612 
1613 	ASSERT_LOCKED_MUTEX(&fCache->lock);
1614 
1615 	for (size_t i = 0; i < unbusyCount; ++i)
1616 		mark_block_unbusy_reading(fCache, fBlocks[i]);
1617 
1618 	for (size_t i = 0; i < removeCount; ++i) {
1619 		ASSERT(fBlocks[i]->is_dirty == false && fBlocks[i]->unused == true);
1620 
1621 		fCache->unused_blocks.Remove(fBlocks[i]);
1622 		fCache->unused_block_count--;
1623 
1624 		fCache->RemoveBlock(fBlocks[i]);
1625 		fBlocks[i] = NULL;
1626 	}
1627 
1628 	fNumAllocated = 0;
1629 
1630 	return;
1631 }
1632 #endif // !BUILDING_USERLAND_FS_SERVER
1633 
1634 
1635 //	#pragma mark - block_cache
1636 
1637 
block_cache(int _fd,off_t numBlocks,size_t blockSize,bool readOnly)1638 block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
1639 		bool readOnly)
1640 	:
1641 	hash(NULL),
1642 	fd(_fd),
1643 	max_blocks(numBlocks),
1644 	block_size(blockSize),
1645 	next_transaction_id(1),
1646 	last_transaction(NULL),
1647 	transaction_hash(NULL),
1648 	buffer_cache(NULL),
1649 	unused_block_count(0),
1650 	busy_reading_count(0),
1651 	busy_reading_waiters(false),
1652 	busy_writing_count(0),
1653 	busy_writing_waiters(0),
1654 	last_block_write(0),
1655 	last_block_write_duration(0),
1656 	num_dirty_blocks(0),
1657 	read_only(readOnly)
1658 {
1659 }
1660 
1661 
1662 /*! Should be called with the cache's lock held. */
~block_cache()1663 block_cache::~block_cache()
1664 {
1665 	unregister_low_resource_handler(&_LowMemoryHandler, this);
1666 
1667 	delete transaction_hash;
1668 	delete hash;
1669 
1670 	delete_object_cache(buffer_cache);
1671 
1672 	mutex_destroy(&lock);
1673 }
1674 
1675 
1676 status_t
Init()1677 block_cache::Init()
1678 {
1679 	busy_reading_condition.Init(this, "cache block busy_reading");
1680 	busy_writing_condition.Init(this, "cache block busy writing");
1681 	condition_variable.Init(this, "cache transaction sync");
1682 	mutex_init(&lock, "block cache");
1683 
1684 	buffer_cache = create_object_cache_etc("block cache buffers", block_size,
1685 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1686 	if (buffer_cache == NULL)
1687 		return B_NO_MEMORY;
1688 
1689 	hash = new BlockTable();
1690 	if (hash == NULL || hash->Init(1024) != B_OK)
1691 		return B_NO_MEMORY;
1692 
1693 	transaction_hash = new(std::nothrow) TransactionTable();
1694 	if (transaction_hash == NULL || transaction_hash->Init(16) != B_OK)
1695 		return B_NO_MEMORY;
1696 
1697 	return register_low_resource_handler(&_LowMemoryHandler, this,
1698 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1699 			| B_KERNEL_RESOURCE_ADDRESS_SPACE, 0);
1700 }
1701 
1702 
1703 void
Free(void * buffer)1704 block_cache::Free(void* buffer)
1705 {
1706 	if (buffer != NULL)
1707 		object_cache_free(buffer_cache, buffer, 0);
1708 }
1709 
1710 
1711 void*
Allocate()1712 block_cache::Allocate()
1713 {
1714 	void* block = object_cache_alloc(buffer_cache, 0);
1715 	if (block != NULL)
1716 		return block;
1717 
1718 	// recycle existing before allocating a new one
1719 	RemoveUnusedBlocks(100);
1720 
1721 	return object_cache_alloc(buffer_cache, 0);
1722 }
1723 
1724 
1725 void
FreeBlock(cached_block * block)1726 block_cache::FreeBlock(cached_block* block)
1727 {
1728 	Free(block->current_data);
1729 
1730 	if (block->original_data != NULL || block->parent_data != NULL) {
1731 		panic("block_cache::FreeBlock(): %" B_PRIdOFF ", original %p, parent %p\n",
1732 			block->block_number, block->original_data, block->parent_data);
1733 	}
1734 
1735 #if BLOCK_CACHE_DEBUG_CHANGED
1736 	Free(block->compare);
1737 #endif
1738 
1739 	object_cache_free(sBlockCache, block, 0);
1740 }
1741 
1742 
1743 /*! Allocates a new block for \a blockNumber, ready for use */
1744 cached_block*
NewBlock(off_t blockNumber)1745 block_cache::NewBlock(off_t blockNumber)
1746 {
1747 	cached_block* block = NULL;
1748 
1749 	if (low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1750 			| B_KERNEL_RESOURCE_ADDRESS_SPACE) != B_NO_LOW_RESOURCE) {
1751 		// recycle existing instead of allocating a new one
1752 		block = _GetUnusedBlock();
1753 	}
1754 	if (block == NULL) {
1755 		block = (cached_block*)object_cache_alloc(sBlockCache, 0);
1756 		if (block != NULL) {
1757 			block->current_data = Allocate();
1758 			if (block->current_data == NULL) {
1759 				object_cache_free(sBlockCache, block, 0);
1760 				return NULL;
1761 			}
1762 		} else {
1763 			TB(Error(this, blockNumber, "allocation failed"));
1764 			TRACE_ALWAYS("block allocation failed, unused list is %sempty.\n",
1765 				unused_blocks.IsEmpty() ? "" : "not ");
1766 
1767 			// allocation failed, try to reuse an unused block
1768 			block = _GetUnusedBlock();
1769 			if (block == NULL) {
1770 				TB(Error(this, blockNumber, "get unused failed"));
1771 				FATAL(("could not allocate block!\n"));
1772 				return NULL;
1773 			}
1774 		}
1775 	}
1776 
1777 	block->block_number = blockNumber;
1778 	block->ref_count = 0;
1779 	block->last_accessed = 0;
1780 	block->transaction_next = NULL;
1781 	block->transaction = block->previous_transaction = NULL;
1782 	block->original_data = NULL;
1783 	block->parent_data = NULL;
1784 	block->busy_reading = false;
1785 	block->busy_writing = false;
1786 	block->is_writing = false;
1787 	block->is_dirty = false;
1788 	block->unused = false;
1789 	block->discard = false;
1790 	block->busy_reading_waiters = false;
1791 	block->busy_writing_waiters = false;
1792 #if BLOCK_CACHE_DEBUG_CHANGED
1793 	block->compare = NULL;
1794 #endif
1795 
1796 	return block;
1797 }
1798 
1799 
1800 void
FreeBlockParentData(cached_block * block)1801 block_cache::FreeBlockParentData(cached_block* block)
1802 {
1803 	ASSERT(block->parent_data != NULL);
1804 	if (block->parent_data != block->current_data)
1805 		Free(block->parent_data);
1806 	block->parent_data = NULL;
1807 }
1808 
1809 
1810 void
RemoveUnusedBlocks(int32 count,int32 minSecondsOld)1811 block_cache::RemoveUnusedBlocks(int32 count, int32 minSecondsOld)
1812 {
1813 	TRACE(("block_cache: remove up to %" B_PRId32 " unused blocks\n", count));
1814 
1815 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1816 			cached_block* block = iterator.Next();) {
1817 		if (minSecondsOld >= block->LastAccess()) {
1818 			// The list is sorted by last access
1819 			break;
1820 		}
1821 		if (block->busy_reading || block->busy_writing)
1822 			continue;
1823 
1824 		TB(Flush(this, block));
1825 		TRACE(("  remove block %" B_PRIdOFF ", last accessed %" B_PRId32 "\n",
1826 			block->block_number, block->last_accessed));
1827 
1828 		// this can only happen if no transactions are used
1829 		if (block->is_dirty && !block->discard) {
1830 			if (block->busy_writing)
1831 				continue;
1832 
1833 			BlockWriter::WriteBlock(this, block);
1834 		}
1835 
1836 		// remove block from lists
1837 		iterator.Remove();
1838 		unused_block_count--;
1839 		RemoveBlock(block);
1840 
1841 		if (--count <= 0)
1842 			break;
1843 	}
1844 }
1845 
1846 
1847 void
RemoveBlock(cached_block * block)1848 block_cache::RemoveBlock(cached_block* block)
1849 {
1850 	hash->Remove(block);
1851 	FreeBlock(block);
1852 }
1853 
1854 
1855 /*!	Discards the block from a transaction (this method must not be called
1856 	for blocks not part of a transaction).
1857 */
1858 void
DiscardBlock(cached_block * block)1859 block_cache::DiscardBlock(cached_block* block)
1860 {
1861 	ASSERT(block->discard);
1862 	ASSERT(block->previous_transaction == NULL);
1863 
1864 	if (block->parent_data != NULL)
1865 		FreeBlockParentData(block);
1866 
1867 	if (block->original_data != NULL) {
1868 		Free(block->original_data);
1869 		block->original_data = NULL;
1870 	}
1871 
1872 	RemoveBlock(block);
1873 }
1874 
1875 
1876 void
_LowMemoryHandler(void * data,uint32 resources,int32 level)1877 block_cache::_LowMemoryHandler(void* data, uint32 resources, int32 level)
1878 {
1879 	TRACE(("block_cache: low memory handler called with level %" B_PRId32 "\n", level));
1880 
1881 	// free some blocks according to the low memory state
1882 	// (if there is enough memory left, we don't free any)
1883 
1884 	block_cache* cache = (block_cache*)data;
1885 	if (cache->unused_block_count <= 1)
1886 		return;
1887 
1888 	int32 free = 0;
1889 	int32 secondsOld = 0;
1890 	switch (level) {
1891 		case B_NO_LOW_RESOURCE:
1892 			return;
1893 		case B_LOW_RESOURCE_NOTE:
1894 			free = cache->unused_block_count / 4;
1895 			secondsOld = 120;
1896 			break;
1897 		case B_LOW_RESOURCE_WARNING:
1898 			free = cache->unused_block_count / 2;
1899 			secondsOld = 10;
1900 			break;
1901 		case B_LOW_RESOURCE_CRITICAL:
1902 			free = cache->unused_block_count - 1;
1903 			secondsOld = 0;
1904 			break;
1905 	}
1906 
1907 	MutexLocker locker(&cache->lock);
1908 
1909 	if (!locker.IsLocked()) {
1910 		// If our block_cache were deleted, it could be that we had
1911 		// been called before that deletion went through, therefore,
1912 		// acquiring its lock might fail.
1913 		return;
1914 	}
1915 
1916 #ifdef TRACE_BLOCK_CACHE
1917 	uint32 oldUnused = cache->unused_block_count;
1918 #endif
1919 
1920 	cache->RemoveUnusedBlocks(free, secondsOld);
1921 
1922 	TRACE(("block_cache::_LowMemoryHandler(): %p: unused: %" B_PRIu32 " -> %" B_PRIu32 "\n",
1923 		cache, oldUnused, cache->unused_block_count));
1924 }
1925 
1926 
1927 cached_block*
_GetUnusedBlock()1928 block_cache::_GetUnusedBlock()
1929 {
1930 	TRACE(("block_cache: get unused block\n"));
1931 
1932 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1933 			cached_block* block = iterator.Next();) {
1934 		TB(Flush(this, block, true));
1935 		// this can only happen if no transactions are used
1936 		if (block->is_dirty && !block->busy_writing && !block->discard)
1937 			BlockWriter::WriteBlock(this, block);
1938 
1939 		// remove block from lists
1940 		iterator.Remove();
1941 		unused_block_count--;
1942 		hash->Remove(block);
1943 
1944 		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1945 		block->unused = false;
1946 
1947 		// TODO: see if compare data is handled correctly here!
1948 #if BLOCK_CACHE_DEBUG_CHANGED
1949 		if (block->compare != NULL)
1950 			Free(block->compare);
1951 #endif
1952 		return block;
1953 	}
1954 
1955 	return NULL;
1956 }
1957 
1958 
1959 //	#pragma mark - private block functions
1960 
1961 
1962 /*!	Cache must be locked.
1963 */
1964 static void
mark_block_busy_reading(block_cache * cache,cached_block * block)1965 mark_block_busy_reading(block_cache* cache, cached_block* block)
1966 {
1967 	block->busy_reading = true;
1968 	cache->busy_reading_count++;
1969 }
1970 
1971 
1972 /*!	Cache must be locked.
1973 */
1974 static void
mark_block_unbusy_reading(block_cache * cache,cached_block * block)1975 mark_block_unbusy_reading(block_cache* cache, cached_block* block)
1976 {
1977 	block->busy_reading = false;
1978 	cache->busy_reading_count--;
1979 
1980 	if ((cache->busy_reading_waiters && cache->busy_reading_count == 0)
1981 			|| block->busy_reading_waiters) {
1982 		cache->busy_reading_waiters = false;
1983 		block->busy_reading_waiters = false;
1984 		cache->busy_reading_condition.NotifyAll();
1985 	}
1986 }
1987 
1988 
1989 /*!	Cache must be locked.
1990 */
1991 static void
wait_for_busy_reading_block(block_cache * cache,cached_block * block)1992 wait_for_busy_reading_block(block_cache* cache, cached_block* block)
1993 {
1994 	while (block->busy_reading) {
1995 		// wait for at least the specified block to be read in
1996 		ConditionVariableEntry entry;
1997 		cache->busy_reading_condition.Add(&entry);
1998 		block->busy_reading_waiters = true;
1999 
2000 		mutex_unlock(&cache->lock);
2001 		entry.Wait();
2002 		mutex_lock(&cache->lock);
2003 	}
2004 }
2005 
2006 
2007 /*!	Cache must be locked.
2008 */
2009 static void
wait_for_busy_reading_blocks(block_cache * cache)2010 wait_for_busy_reading_blocks(block_cache* cache)
2011 {
2012 	while (cache->busy_reading_count != 0) {
2013 		// wait for all blocks to be read in
2014 		ConditionVariableEntry entry;
2015 		cache->busy_reading_condition.Add(&entry);
2016 		cache->busy_reading_waiters = true;
2017 
2018 		mutex_unlock(&cache->lock);
2019 		entry.Wait();
2020 		mutex_lock(&cache->lock);
2021 	}
2022 }
2023 
2024 
2025 /*!	Cache must be locked.
2026 */
2027 static void
wait_for_busy_writing_block(block_cache * cache,cached_block * block)2028 wait_for_busy_writing_block(block_cache* cache, cached_block* block)
2029 {
2030 	while (block->busy_writing) {
2031 		// wait for all blocks to be written back
2032 		ConditionVariableEntry entry;
2033 		cache->busy_writing_condition.Add(&entry);
2034 		block->busy_writing_waiters = true;
2035 
2036 		mutex_unlock(&cache->lock);
2037 		entry.Wait();
2038 		mutex_lock(&cache->lock);
2039 	}
2040 }
2041 
2042 
2043 /*!	Cache must be locked.
2044 */
2045 static void
wait_for_busy_writing_blocks(block_cache * cache)2046 wait_for_busy_writing_blocks(block_cache* cache)
2047 {
2048 	while (cache->busy_writing_count != 0) {
2049 		// wait for all blocks to be written back
2050 		ConditionVariableEntry entry;
2051 		cache->busy_writing_condition.Add(&entry);
2052 		cache->busy_writing_waiters = true;
2053 
2054 		mutex_unlock(&cache->lock);
2055 		entry.Wait();
2056 		mutex_lock(&cache->lock);
2057 	}
2058 }
2059 
2060 
2061 /*!	Removes a reference from the specified \a block. If this was the last
2062 	reference, the block is moved into the unused list.
2063 	In low memory situations, it will also free some blocks from that list,
2064 	but not necessarily the \a block it just released.
2065 */
2066 static void
put_cached_block(block_cache * cache,cached_block * block)2067 put_cached_block(block_cache* cache, cached_block* block)
2068 {
2069 #if BLOCK_CACHE_DEBUG_CHANGED
2070 	if (!block->is_dirty && block->compare != NULL
2071 		&& memcmp(block->current_data, block->compare, cache->block_size)) {
2072 		TRACE_ALWAYS("new block:\n");
2073 		dump_block((const char*)block->current_data, 256, "  ");
2074 		TRACE_ALWAYS("unchanged block:\n");
2075 		dump_block((const char*)block->compare, 256, "  ");
2076 		BlockWriter::WriteBlock(cache, block);
2077 		panic("block_cache: supposed to be clean block was changed!\n");
2078 
2079 		cache->Free(block->compare);
2080 		block->compare = NULL;
2081 	}
2082 #endif
2083 	TB(Put(cache, block));
2084 
2085 	if (block->ref_count < 1) {
2086 		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
2087 		return;
2088 	}
2089 
2090 	if (--block->ref_count == 0
2091 		&& block->transaction == NULL && block->previous_transaction == NULL) {
2092 		// This block is not used anymore, and not part of any transaction
2093 		block->is_writing = false;
2094 
2095 		if (block->discard) {
2096 			cache->RemoveBlock(block);
2097 		} else {
2098 			// put this block in the list of unused blocks
2099 			ASSERT(!block->unused);
2100 			block->unused = true;
2101 
2102 			ASSERT(block->original_data == NULL && block->parent_data == NULL);
2103 			cache->unused_blocks.Add(block);
2104 			cache->unused_block_count++;
2105 		}
2106 	}
2107 }
2108 
2109 
2110 static void
put_cached_block(block_cache * cache,off_t blockNumber)2111 put_cached_block(block_cache* cache, off_t blockNumber)
2112 {
2113 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
2114 		panic("put_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
2115 			blockNumber, cache->max_blocks - 1);
2116 	}
2117 
2118 	cached_block* block = cache->hash->Lookup(blockNumber);
2119 	if (block != NULL)
2120 		put_cached_block(cache, block);
2121 	else {
2122 		TB(Error(cache, blockNumber, "put unknown"));
2123 	}
2124 }
2125 
2126 
2127 /*!	Retrieves the block \a blockNumber from the hash table, if it's already
2128 	there, or reads it from the disk.
2129 	You need to have the cache locked when calling this function.
2130 
2131 	\param _allocated tells you whether or not a new block has been allocated
2132 		to satisfy your request.
2133 	\param readBlock if \c false, the block will not be read in case it was
2134 		not already in the cache. The block you retrieve may contain random
2135 		data. If \c true, the cache will be temporarily unlocked while the
2136 		block is read in.
2137 */
2138 static status_t
get_cached_block(block_cache * cache,off_t blockNumber,bool * _allocated,bool readBlock,cached_block ** _block)2139 get_cached_block(block_cache* cache, off_t blockNumber, bool* _allocated,
2140 	bool readBlock, cached_block** _block)
2141 {
2142 	ASSERT_LOCKED_MUTEX(&cache->lock);
2143 
2144 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
2145 		panic("get_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
2146 			blockNumber, cache->max_blocks - 1);
2147 		return B_BAD_VALUE;
2148 	}
2149 
2150 retry:
2151 	cached_block* block = cache->hash->Lookup(blockNumber);
2152 	*_allocated = false;
2153 
2154 	if (block == NULL) {
2155 		// put block into cache
2156 		block = cache->NewBlock(blockNumber);
2157 		if (block == NULL)
2158 			return B_NO_MEMORY;
2159 
2160 		cache->hash->Insert(block);
2161 		*_allocated = true;
2162 	} else if (block->busy_reading) {
2163 		// The block is currently busy_reading - wait and try again later
2164 		wait_for_busy_reading_block(cache, block);
2165 
2166 		// The block may have been deleted or replaced in the meantime,
2167 		// so we must look it up in the hash again after waiting.
2168 		goto retry;
2169 	}
2170 
2171 	if (block->unused) {
2172 		//TRACE(("remove block %" B_PRIdOFF " from unused\n", blockNumber));
2173 		block->unused = false;
2174 		cache->unused_blocks.Remove(block);
2175 		cache->unused_block_count--;
2176 	}
2177 
2178 	if (*_allocated && readBlock) {
2179 		// read block into cache
2180 		int32 blockSize = cache->block_size;
2181 
2182 		mark_block_busy_reading(cache, block);
2183 		mutex_unlock(&cache->lock);
2184 
2185 		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
2186 			block->current_data, blockSize);
2187 
2188 		mutex_lock(&cache->lock);
2189 		if (bytesRead < blockSize) {
2190 			cache->RemoveBlock(block);
2191 			TB(Error(cache, blockNumber, "read failed", bytesRead));
2192 
2193 			TRACE_ALWAYS("could not read block %" B_PRIdOFF ": bytesRead: %zd,"
2194 				" error: %s\n", blockNumber, bytesRead, strerror(errno));
2195 			return errno;
2196 		}
2197 		TB(Read(cache, block));
2198 
2199 		mark_block_unbusy_reading(cache, block);
2200 	}
2201 
2202 	block->ref_count++;
2203 	block->last_accessed = system_time() / 1000000L;
2204 
2205 	*_block = block;
2206 	return B_OK;
2207 }
2208 
2209 
2210 /*!	Returns the writable block data for the requested blockNumber.
2211 	If \a cleared is true, the block is not read from disk; an empty block
2212 	is returned.
2213 
2214 	This is the only method to insert a block into a transaction. It makes
2215 	sure that the previous block contents are preserved in that case.
2216 */
2217 static status_t
get_writable_cached_block(block_cache * cache,off_t blockNumber,int32 transactionID,bool cleared,void ** _block)2218 get_writable_cached_block(block_cache* cache, off_t blockNumber,
2219 	int32 transactionID, bool cleared, void** _block)
2220 {
2221 	TRACE(("get_writable_cached_block(blockNumber = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
2222 		blockNumber, transactionID));
2223 
2224 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
2225 		panic("get_writable_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
2226 			blockNumber, cache->max_blocks - 1);
2227 		return B_BAD_VALUE;
2228 	}
2229 
2230 	bool allocated;
2231 	cached_block* block;
2232 	status_t status = get_cached_block(cache, blockNumber, &allocated,
2233 		!cleared, &block);
2234 	if (status != B_OK)
2235 		return status;
2236 
2237 	if (block->busy_writing)
2238 		wait_for_busy_writing_block(cache, block);
2239 
2240 	block->discard = false;
2241 
2242 	// if there is no transaction support, we just return the current block
2243 	if (transactionID == -1) {
2244 		if (cleared) {
2245 			mark_block_busy_reading(cache, block);
2246 			mutex_unlock(&cache->lock);
2247 
2248 			memset(block->current_data, 0, cache->block_size);
2249 
2250 			mutex_lock(&cache->lock);
2251 			mark_block_unbusy_reading(cache, block);
2252 		}
2253 
2254 		block->is_writing = true;
2255 
2256 		if (!block->is_dirty) {
2257 			cache->num_dirty_blocks++;
2258 			block->is_dirty = true;
2259 				// mark the block as dirty
2260 		}
2261 
2262 		TB(Get(cache, block));
2263 		*_block = block->current_data;
2264 		return B_OK;
2265 	}
2266 
2267 	cache_transaction* transaction = block->transaction;
2268 
2269 	if (transaction != NULL && transaction->id != transactionID) {
2270 		// TODO: we have to wait here until the other transaction is done.
2271 		//	Maybe we should even panic, since we can't prevent any deadlocks.
2272 		panic("get_writable_cached_block(): asked to get busy writable block "
2273 			"(transaction %" B_PRId32 ")\n", block->transaction->id);
2274 		put_cached_block(cache, block);
2275 		return B_BAD_VALUE;
2276 	}
2277 	if (transaction == NULL && transactionID != -1) {
2278 		// get new transaction
2279 		transaction = lookup_transaction(cache, transactionID);
2280 		if (transaction == NULL) {
2281 			panic("get_writable_cached_block(): invalid transaction %" B_PRId32 "!\n",
2282 				transactionID);
2283 			put_cached_block(cache, block);
2284 			return B_BAD_VALUE;
2285 		}
2286 		if (!transaction->open) {
2287 			panic("get_writable_cached_block(): transaction already done!\n");
2288 			put_cached_block(cache, block);
2289 			return B_BAD_VALUE;
2290 		}
2291 
2292 		block->transaction = transaction;
2293 
2294 		// attach the block to the transaction block list
2295 		block->transaction_next = transaction->first_block;
2296 		transaction->first_block = block;
2297 		transaction->num_blocks++;
2298 	}
2299 	if (transaction != NULL)
2300 		transaction->last_used = system_time();
2301 
2302 	bool wasUnchanged = block->original_data == NULL
2303 		|| block->previous_transaction != NULL;
2304 
2305 	if (!(allocated && cleared) && block->original_data == NULL) {
2306 		// we already have data, so we need to preserve it
2307 		block->original_data = cache->Allocate();
2308 		if (block->original_data == NULL) {
2309 			TB(Error(cache, blockNumber, "allocate original failed"));
2310 			FATAL(("could not allocate original_data\n"));
2311 			put_cached_block(cache, block);
2312 			return B_NO_MEMORY;
2313 		}
2314 
2315 		mark_block_busy_reading(cache, block);
2316 		mutex_unlock(&cache->lock);
2317 
2318 		memcpy(block->original_data, block->current_data, cache->block_size);
2319 
2320 		mutex_lock(&cache->lock);
2321 		mark_block_unbusy_reading(cache, block);
2322 	}
2323 	if (block->parent_data == block->current_data) {
2324 		// remember any previous contents for the parent transaction
2325 		block->parent_data = cache->Allocate();
2326 		if (block->parent_data == NULL) {
2327 			// TODO: maybe we should just continue the current transaction in
2328 			// this case...
2329 			TB(Error(cache, blockNumber, "allocate parent failed"));
2330 			FATAL(("could not allocate parent\n"));
2331 			put_cached_block(cache, block);
2332 			return B_NO_MEMORY;
2333 		}
2334 
2335 		mark_block_busy_reading(cache, block);
2336 		mutex_unlock(&cache->lock);
2337 
2338 		memcpy(block->parent_data, block->current_data, cache->block_size);
2339 
2340 		mutex_lock(&cache->lock);
2341 		mark_block_unbusy_reading(cache, block);
2342 
2343 		transaction->sub_num_blocks++;
2344 	} else if (transaction != NULL && transaction->has_sub_transaction
2345 		&& block->parent_data == NULL && wasUnchanged)
2346 		transaction->sub_num_blocks++;
2347 
2348 	if (cleared) {
2349 		mark_block_busy_reading(cache, block);
2350 		mutex_unlock(&cache->lock);
2351 
2352 		memset(block->current_data, 0, cache->block_size);
2353 
2354 		mutex_lock(&cache->lock);
2355 		mark_block_unbusy_reading(cache, block);
2356 	}
2357 
2358 	block->is_dirty = true;
2359 	TB(Get(cache, block));
2360 	TB2(BlockData(cache, block, "get writable"));
2361 
2362 	*_block = block->current_data;
2363 	return B_OK;
2364 }
2365 
2366 
2367 #if DEBUG_BLOCK_CACHE
2368 
2369 
2370 static void
dump_block(cached_block * block)2371 dump_block(cached_block* block)
2372 {
2373 	kprintf("%08lx %9" B_PRIdOFF " %08lx %08lx %08lx %5" B_PRId32 " %6" B_PRId32
2374 		" %c%c%c%c%c%c %08lx %08lx\n",
2375 		(addr_t)block, block->block_number,
2376 		(addr_t)block->current_data, (addr_t)block->original_data,
2377 		(addr_t)block->parent_data, block->ref_count, block->LastAccess(),
2378 		block->busy_reading ? 'r' : '-', block->busy_writing ? 'w' : '-',
2379 		block->is_writing ? 'W' : '-', block->is_dirty ? 'D' : '-',
2380 		block->unused ? 'U' : '-', block->discard ? 'D' : '-',
2381 		(addr_t)block->transaction,
2382 		(addr_t)block->previous_transaction);
2383 }
2384 
2385 
2386 static void
dump_block_long(cached_block * block)2387 dump_block_long(cached_block* block)
2388 {
2389 	kprintf("BLOCK %p\n", block);
2390 	kprintf(" current data:  %p\n", block->current_data);
2391 	kprintf(" original data: %p\n", block->original_data);
2392 	kprintf(" parent data:   %p\n", block->parent_data);
2393 #if BLOCK_CACHE_DEBUG_CHANGED
2394 	kprintf(" compare data:  %p\n", block->compare);
2395 #endif
2396 	kprintf(" ref_count:     %" B_PRId32 "\n", block->ref_count);
2397 	kprintf(" accessed:      %" B_PRId32 "\n", block->LastAccess());
2398 	kprintf(" flags:        ");
2399 	if (block->busy_reading)
2400 		kprintf(" busy_reading");
2401 	if (block->busy_writing)
2402 		kprintf(" busy_writing");
2403 	if (block->is_writing)
2404 		kprintf(" is-writing");
2405 	if (block->is_dirty)
2406 		kprintf(" is-dirty");
2407 	if (block->unused)
2408 		kprintf(" unused");
2409 	if (block->discard)
2410 		kprintf(" discard");
2411 	kprintf("\n");
2412 	if (block->transaction != NULL) {
2413 		kprintf(" transaction:   %p (%" B_PRId32 ")\n", block->transaction,
2414 			block->transaction->id);
2415 		if (block->transaction_next != NULL) {
2416 			kprintf(" next in transaction: %" B_PRIdOFF "\n",
2417 				block->transaction_next->block_number);
2418 		}
2419 	}
2420 	if (block->previous_transaction != NULL) {
2421 		kprintf(" previous transaction: %p (%" B_PRId32 ")\n",
2422 			block->previous_transaction,
2423 			block->previous_transaction->id);
2424 	}
2425 
2426 	set_debug_variable("_current", (addr_t)block->current_data);
2427 	set_debug_variable("_original", (addr_t)block->original_data);
2428 	set_debug_variable("_parent", (addr_t)block->parent_data);
2429 }
2430 
2431 
2432 static int
dump_cached_block(int argc,char ** argv)2433 dump_cached_block(int argc, char** argv)
2434 {
2435 	if (argc != 2) {
2436 		kprintf("usage: %s <block-address>\n", argv[0]);
2437 		return 0;
2438 	}
2439 
2440 	dump_block_long((struct cached_block*)(addr_t)parse_expression(argv[1]));
2441 	return 0;
2442 }
2443 
2444 
2445 static int
dump_cache(int argc,char ** argv)2446 dump_cache(int argc, char** argv)
2447 {
2448 	bool showTransactions = false;
2449 	bool showBlocks = false;
2450 	int32 i = 1;
2451 	while (argv[i] != NULL && argv[i][0] == '-') {
2452 		for (char* arg = &argv[i][1]; arg[0]; arg++) {
2453 			switch (arg[0]) {
2454 				case 'b':
2455 					showBlocks = true;
2456 					break;
2457 				case 't':
2458 					showTransactions = true;
2459 					break;
2460 				default:
2461 					print_debugger_command_usage(argv[0]);
2462 					return 0;
2463 			}
2464 		}
2465 		i++;
2466 	}
2467 
2468 	if (i >= argc) {
2469 		print_debugger_command_usage(argv[0]);
2470 		return 0;
2471 	}
2472 
2473 	block_cache* cache = (struct block_cache*)(addr_t)parse_expression(argv[i]);
2474 	if (cache == NULL) {
2475 		kprintf("invalid cache address\n");
2476 		return 0;
2477 	}
2478 
2479 	off_t blockNumber = -1;
2480 	if (i + 1 < argc) {
2481 		blockNumber = parse_expression(argv[i + 1]);
2482 		cached_block* block = cache->hash->Lookup(blockNumber);
2483 		if (block != NULL)
2484 			dump_block_long(block);
2485 		else
2486 			kprintf("block %" B_PRIdOFF " not found\n", blockNumber);
2487 		return 0;
2488 	}
2489 
2490 	kprintf("BLOCK CACHE: %p\n", cache);
2491 
2492 	kprintf(" fd:           %d\n", cache->fd);
2493 	kprintf(" max_blocks:   %" B_PRIdOFF "\n", cache->max_blocks);
2494 	kprintf(" block_size:   %zu\n", cache->block_size);
2495 	kprintf(" next_transaction_id: %" B_PRId32 "\n", cache->next_transaction_id);
2496 	kprintf(" buffer_cache: %p\n", cache->buffer_cache);
2497 	kprintf(" busy_reading: %" B_PRIu32 ", %s waiters\n", cache->busy_reading_count,
2498 		cache->busy_reading_waiters ? "has" : "no");
2499 	kprintf(" busy_writing: %" B_PRIu32 ", %s waiters\n", cache->busy_writing_count,
2500 		cache->busy_writing_waiters ? "has" : "no");
2501 
2502 	if (!cache->pending_notifications.IsEmpty()) {
2503 		kprintf(" pending notifications:\n");
2504 
2505 		NotificationList::Iterator iterator
2506 			= cache->pending_notifications.GetIterator();
2507 		while (iterator.HasNext()) {
2508 			cache_notification* notification = iterator.Next();
2509 
2510 			kprintf("  %p %5" B_PRIx32 " %p - %p\n", notification,
2511 				notification->events_pending, notification->hook,
2512 				notification->data);
2513 		}
2514 	}
2515 
2516 	if (showTransactions) {
2517 		kprintf(" transactions:\n");
2518 		kprintf("address       id state  blocks  main   sub\n");
2519 
2520 		TransactionTable::Iterator iterator(cache->transaction_hash);
2521 
2522 		while (iterator.HasNext()) {
2523 			cache_transaction* transaction = iterator.Next();
2524 			kprintf("%p %5" B_PRId32 " %-7s %5" B_PRId32 " %5" B_PRId32 " %5"
2525 				B_PRId32 "\n", transaction, transaction->id,
2526 				transaction->open ? "open" : "closed",
2527 				transaction->num_blocks, transaction->main_num_blocks,
2528 				transaction->sub_num_blocks);
2529 		}
2530 	}
2531 
2532 	if (showBlocks) {
2533 		kprintf(" blocks:\n");
2534 		kprintf("address  block no. current  original parent    refs access "
2535 			"flags transact prev. trans\n");
2536 	}
2537 
2538 	uint32 referenced = 0;
2539 	uint32 count = 0;
2540 	uint32 dirty = 0;
2541 	uint32 discarded = 0;
2542 	BlockTable::Iterator iterator(cache->hash);
2543 	while (iterator.HasNext()) {
2544 		cached_block* block = iterator.Next();
2545 		if (showBlocks)
2546 			dump_block(block);
2547 
2548 		if (block->is_dirty)
2549 			dirty++;
2550 		if (block->discard)
2551 			discarded++;
2552 		if (block->ref_count)
2553 			referenced++;
2554 		count++;
2555 	}
2556 
2557 	kprintf(" %" B_PRIu32 " blocks total, %" B_PRIu32 " dirty, %" B_PRIu32
2558 		" discarded, %" B_PRIu32 " referenced, %" B_PRIu32 " busy, %" B_PRIu32
2559 		" in unused.\n",
2560 		count, dirty, discarded, referenced, cache->busy_reading_count,
2561 		cache->unused_block_count);
2562 	return 0;
2563 }
2564 
2565 
2566 static int
dump_transaction(int argc,char ** argv)2567 dump_transaction(int argc, char** argv)
2568 {
2569 	bool showBlocks = false;
2570 	int i = 1;
2571 	if (argc > 1 && !strcmp(argv[1], "-b")) {
2572 		showBlocks = true;
2573 		i++;
2574 	}
2575 
2576 	if (argc - i < 1 || argc - i > 2) {
2577 		print_debugger_command_usage(argv[0]);
2578 		return 0;
2579 	}
2580 
2581 	cache_transaction* transaction = NULL;
2582 
2583 	if (argc - i == 1) {
2584 		transaction = (cache_transaction*)(addr_t)parse_expression(argv[i]);
2585 	} else {
2586 		block_cache* cache = (block_cache*)(addr_t)parse_expression(argv[i]);
2587 		int32 id = parse_expression(argv[i + 1]);
2588 		transaction = lookup_transaction(cache, id);
2589 		if (transaction == NULL) {
2590 			kprintf("No transaction with ID %" B_PRId32 " found.\n", id);
2591 			return 0;
2592 		}
2593 	}
2594 
2595 	kprintf("TRANSACTION %p\n", transaction);
2596 
2597 	kprintf(" id:             %" B_PRId32 "\n", transaction->id);
2598 	kprintf(" num block:      %" B_PRId32 "\n", transaction->num_blocks);
2599 	kprintf(" main num block: %" B_PRId32 "\n", transaction->main_num_blocks);
2600 	kprintf(" sub num block:  %" B_PRId32 "\n", transaction->sub_num_blocks);
2601 	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
2602 	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
2603 	kprintf(" idle:           %" B_PRId64 " secs\n",
2604 		(system_time() - transaction->last_used) / 1000000);
2605 
2606 	kprintf(" listeners:\n");
2607 
2608 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
2609 	while (iterator.HasNext()) {
2610 		cache_listener* listener = iterator.Next();
2611 
2612 		kprintf("  %p %5" B_PRIx32 " %p - %p\n", listener, listener->events_pending,
2613 			listener->hook, listener->data);
2614 	}
2615 
2616 	if (!showBlocks)
2617 		return 0;
2618 
2619 	kprintf(" blocks:\n");
2620 	kprintf("address  block no. current  original parent    refs access "
2621 		"flags transact prev. trans\n");
2622 
2623 	cached_block* block = transaction->first_block;
2624 	while (block != NULL) {
2625 		dump_block(block);
2626 		block = block->transaction_next;
2627 	}
2628 
2629 	kprintf("--\n");
2630 
2631 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
2632 	while (blockIterator.HasNext()) {
2633 		block = blockIterator.Next();
2634 		dump_block(block);
2635 	}
2636 
2637 	return 0;
2638 }
2639 
2640 
2641 static int
dump_caches(int argc,char ** argv)2642 dump_caches(int argc, char** argv)
2643 {
2644 	kprintf("Block caches:\n");
2645 	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
2646 	while (i.HasNext()) {
2647 		block_cache* cache = i.Next();
2648 		if (cache == (block_cache*)&sMarkCache)
2649 			continue;
2650 
2651 		kprintf("  %p\n", cache);
2652 	}
2653 
2654 	return 0;
2655 }
2656 
2657 
2658 #if BLOCK_CACHE_BLOCK_TRACING >= 2
2659 static int
dump_block_data(int argc,char ** argv)2660 dump_block_data(int argc, char** argv)
2661 {
2662 	using namespace BlockTracing;
2663 
2664 	// Determine which blocks to show
2665 
2666 	bool printStackTrace = true;
2667 	uint32 which = 0;
2668 	int32 i = 1;
2669 	while (i < argc && argv[i][0] == '-') {
2670 		char* arg = &argv[i][1];
2671 		while (arg[0]) {
2672 			switch (arg[0]) {
2673 				case 'c':
2674 					which |= BlockData::kCurrent;
2675 					break;
2676 				case 'p':
2677 					which |= BlockData::kParent;
2678 					break;
2679 				case 'o':
2680 					which |= BlockData::kOriginal;
2681 					break;
2682 
2683 				default:
2684 					kprintf("invalid block specifier (only o/c/p are "
2685 						"allowed).\n");
2686 					return 0;
2687 			}
2688 			arg++;
2689 		}
2690 
2691 		i++;
2692 	}
2693 	if (which == 0)
2694 		which = BlockData::kCurrent | BlockData::kParent | BlockData::kOriginal;
2695 
2696 	if (i == argc) {
2697 		print_debugger_command_usage(argv[0]);
2698 		return 0;
2699 	}
2700 
2701 	// Get the range of blocks to print
2702 
2703 	int64 from = parse_expression(argv[i]);
2704 	int64 to = from;
2705 	if (argc > i + 1)
2706 		to = parse_expression(argv[i + 1]);
2707 	if (to < from)
2708 		to = from;
2709 
2710 	uint32 offset = 0;
2711 	uint32 size = LONG_MAX;
2712 	if (argc > i + 2)
2713 		offset = parse_expression(argv[i + 2]);
2714 	if (argc > i + 3)
2715 		size = parse_expression(argv[i + 3]);
2716 
2717 	TraceEntryIterator iterator;
2718 	iterator.MoveTo(from - 1);
2719 
2720 	static char sBuffer[1024];
2721 	LazyTraceOutput out(sBuffer, sizeof(sBuffer), TRACE_OUTPUT_TEAM_ID);
2722 
2723 	while (TraceEntry* entry = iterator.Next()) {
2724 		int32 index = iterator.Index();
2725 		if (index > to)
2726 			break;
2727 
2728 		Action* action = dynamic_cast<Action*>(entry);
2729 		if (action != NULL) {
2730 			out.Clear();
2731 			out.DumpEntry(action);
2732 			continue;
2733 		}
2734 
2735 		BlockData* blockData = dynamic_cast<BlockData*>(entry);
2736 		if (blockData == NULL)
2737 			continue;
2738 
2739 		out.Clear();
2740 
2741 		const char* dump = out.DumpEntry(entry);
2742 		int length = strlen(dump);
2743 		if (length > 0 && dump[length - 1] == '\n')
2744 			length--;
2745 
2746 		kprintf("%5" B_PRId32 ". %.*s\n", index, length, dump);
2747 
2748 		if (printStackTrace) {
2749 			out.Clear();
2750 			entry->DumpStackTrace(out);
2751 			if (out.Size() > 0)
2752 				kputs(out.Buffer());
2753 		}
2754 
2755 		blockData->DumpBlocks(which, offset, size);
2756 	}
2757 
2758 	return 0;
2759 }
2760 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
2761 
2762 
2763 #endif	// DEBUG_BLOCK_CACHE
2764 
2765 
2766 /*!	Traverses through the block_cache list, and returns one cache after the
2767 	other. The cache returned is automatically locked when you get it, and
2768 	unlocked with the next call to this function. Ignores caches that are in
2769 	deletion state.
2770 	Returns \c NULL when the end of the list is reached.
2771 */
2772 static block_cache*
get_next_locked_block_cache(block_cache * last)2773 get_next_locked_block_cache(block_cache* last)
2774 {
2775 	MutexLocker _(sCachesLock);
2776 
2777 	block_cache* cache;
2778 	if (last != NULL) {
2779 		mutex_unlock(&last->lock);
2780 
2781 		cache = sCaches.GetNext((block_cache*)&sMarkCache);
2782 		sCaches.Remove((block_cache*)&sMarkCache);
2783 	} else
2784 		cache = sCaches.Head();
2785 
2786 	if (cache != NULL) {
2787 		mutex_lock(&cache->lock);
2788 		sCaches.InsertBefore(sCaches.GetNext(cache), (block_cache*)&sMarkCache);
2789 	}
2790 
2791 	return cache;
2792 }
2793 
2794 
2795 /*!	Background thread that continuously checks for pending notifications of
2796 	all caches.
2797 	Every two seconds, it will also write back up to 64 blocks per cache.
2798 */
2799 static status_t
block_notifier_and_writer(void *)2800 block_notifier_and_writer(void* /*data*/)
2801 {
2802 	const bigtime_t kDefaultTimeout = 2000000LL;
2803 	bigtime_t timeout = kDefaultTimeout;
2804 
2805 	while (true) {
2806 		bigtime_t start = system_time();
2807 
2808 		status_t status = acquire_sem_etc(sEventSemaphore, 1,
2809 			B_RELATIVE_TIMEOUT, timeout);
2810 		if (status == B_OK) {
2811 			flush_pending_notifications();
2812 			timeout -= system_time() - start;
2813 			continue;
2814 		}
2815 
2816 		// Write 64 blocks of each block_cache roughly every 2 seconds,
2817 		// potentially more or less depending on congestion and drive speeds
2818 		// (usually much less.) We do not want to queue everything at once
2819 		// because a future transaction might then get held up waiting for
2820 		// a specific block to be written.
2821 		timeout = kDefaultTimeout;
2822 		size_t usedMemory;
2823 		object_cache_get_usage(sBlockCache, &usedMemory);
2824 
2825 		block_cache* cache = NULL;
2826 		while ((cache = get_next_locked_block_cache(cache)) != NULL) {
2827 			// Give some breathing room: wait 2x the length of the potential
2828 			// maximum block count-sized write between writes, and also skip
2829 			// if there are more than 16 blocks currently being written.
2830 			const bigtime_t next = cache->last_block_write
2831 					+ cache->last_block_write_duration * 2 * 64;
2832 			if (cache->busy_writing_count > 16 || system_time() < next) {
2833 				if (cache->last_block_write_duration > 0) {
2834 					timeout = min_c(timeout,
2835 						cache->last_block_write_duration * 2 * 64);
2836 				}
2837 				continue;
2838 			}
2839 
2840 			BlockWriter writer(cache, 64);
2841 			bool hasMoreBlocks = false;
2842 
2843 			size_t cacheUsedMemory;
2844 			object_cache_get_usage(cache->buffer_cache, &cacheUsedMemory);
2845 			usedMemory += cacheUsedMemory;
2846 
2847 			if (cache->num_dirty_blocks) {
2848 				// This cache is not using transactions, we'll scan the blocks
2849 				// directly
2850 				BlockTable::Iterator iterator(cache->hash);
2851 
2852 				while (iterator.HasNext()) {
2853 					cached_block* block = iterator.Next();
2854 					if (block->CanBeWritten() && !writer.Add(block)) {
2855 						hasMoreBlocks = true;
2856 						break;
2857 					}
2858 				}
2859 			} else {
2860 				TransactionTable::Iterator iterator(cache->transaction_hash);
2861 
2862 				while (iterator.HasNext()) {
2863 					cache_transaction* transaction = iterator.Next();
2864 					if (transaction->open) {
2865 						if (system_time() > transaction->last_used
2866 								+ kTransactionIdleTime) {
2867 							// Transaction is open but idle
2868 							notify_transaction_listeners(cache, transaction,
2869 								TRANSACTION_IDLE);
2870 						}
2871 						continue;
2872 					}
2873 
2874 					bool hasLeftOvers;
2875 						// we ignore this one
2876 					if (!writer.Add(transaction, hasLeftOvers)) {
2877 						hasMoreBlocks = true;
2878 						break;
2879 					}
2880 				}
2881 			}
2882 
2883 			writer.Write();
2884 
2885 			if (hasMoreBlocks && cache->last_block_write_duration > 0) {
2886 				// There are probably still more blocks that we could write, so
2887 				// see if we can decrease the timeout.
2888 				timeout = min_c(timeout,
2889 					cache->last_block_write_duration * 2 * 64);
2890 			}
2891 
2892 			if ((block_cache_used_memory() / B_PAGE_SIZE)
2893 					> vm_page_num_pages() / 2) {
2894 				// Try to reduce memory usage to half of the available
2895 				// RAM at maximum
2896 				cache->RemoveUnusedBlocks(1000, 10);
2897 			}
2898 		}
2899 
2900 		MutexLocker _(sCachesMemoryUseLock);
2901 		sUsedMemory = usedMemory;
2902 	}
2903 
2904 	// never can get here
2905 	return B_OK;
2906 }
2907 
2908 
2909 /*!	Notify function for wait_for_notifications(). */
2910 static void
notify_sync(int32 transactionID,int32 event,void * _cache)2911 notify_sync(int32 transactionID, int32 event, void* _cache)
2912 {
2913 	block_cache* cache = (block_cache*)_cache;
2914 
2915 	cache->condition_variable.NotifyOne();
2916 }
2917 
2918 
2919 /*!	Must be called with the sCachesLock held. */
2920 static bool
is_valid_cache(block_cache * cache)2921 is_valid_cache(block_cache* cache)
2922 {
2923 	ASSERT_LOCKED_MUTEX(&sCachesLock);
2924 
2925 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
2926 	while (iterator.HasNext()) {
2927 		if (cache == iterator.Next())
2928 			return true;
2929 	}
2930 
2931 	return false;
2932 }
2933 
2934 
2935 /*!	Waits until all pending notifications are carried out.
2936 	Safe to be called from the block writer/notifier thread.
2937 	You must not hold the \a cache lock when calling this function.
2938 */
2939 static void
wait_for_notifications(block_cache * cache)2940 wait_for_notifications(block_cache* cache)
2941 {
2942 	MutexLocker locker(sCachesLock);
2943 
2944 	if (find_thread(NULL) == sNotifierWriterThread) {
2945 		// We're the notifier thread, don't wait, but flush all pending
2946 		// notifications directly.
2947 		if (is_valid_cache(cache))
2948 			flush_pending_notifications(cache);
2949 		return;
2950 	}
2951 
2952 	// add sync notification
2953 	cache_notification notification;
2954 	set_notification(NULL, notification, TRANSACTION_WRITTEN, notify_sync,
2955 		cache);
2956 
2957 	ConditionVariableEntry entry;
2958 	cache->condition_variable.Add(&entry);
2959 
2960 	add_notification(cache, &notification, TRANSACTION_WRITTEN, false);
2961 	locker.Unlock();
2962 
2963 	// wait for notification hook to be called
2964 	entry.Wait();
2965 }
2966 
2967 
2968 status_t
block_cache_init(void)2969 block_cache_init(void)
2970 {
2971 	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
2972 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
2973 	if (sBlockCache == NULL)
2974 		return B_NO_MEMORY;
2975 
2976 	sCacheNotificationCache = create_object_cache("cache notifications",
2977 		sizeof(cache_listener), 8, NULL, NULL, NULL);
2978 	if (sCacheNotificationCache == NULL)
2979 		return B_NO_MEMORY;
2980 
2981 	new (&sCaches) DoublyLinkedList<block_cache>;
2982 		// manually call constructor
2983 
2984 	sEventSemaphore = create_sem(0, "block cache event");
2985 	if (sEventSemaphore < B_OK)
2986 		return sEventSemaphore;
2987 
2988 	sNotifierWriterThread = spawn_kernel_thread(&block_notifier_and_writer,
2989 		"block notifier/writer", B_LOW_PRIORITY, NULL);
2990 	if (sNotifierWriterThread >= B_OK)
2991 		resume_thread(sNotifierWriterThread);
2992 
2993 #if DEBUG_BLOCK_CACHE
2994 	add_debugger_command_etc("block_caches", &dump_caches,
2995 		"dumps all block caches", "\n", 0);
2996 	add_debugger_command_etc("block_cache", &dump_cache,
2997 		"dumps a specific block cache",
2998 		"[-bt] <cache-address> [block-number]\n"
2999 		"  -t lists the transactions\n"
3000 		"  -b lists all blocks\n", 0);
3001 	add_debugger_command("cached_block", &dump_cached_block,
3002 		"dumps the specified cached block");
3003 	add_debugger_command_etc("transaction", &dump_transaction,
3004 		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
3005 		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
3006 		"  -b lists all blocks that are part of this transaction\n", 0);
3007 #	if BLOCK_CACHE_BLOCK_TRACING >= 2
3008 	add_debugger_command_etc("block_cache_data", &dump_block_data,
3009 		"dumps the data blocks logged for the actions",
3010 		"[-cpo] <from> [<to> [<offset> [<size>]]]\n"
3011 		"If no data specifier is used, all blocks are shown by default.\n"
3012 		" -c       the current data is shown, if available.\n"
3013 		" -p       the parent data is shown, if available.\n"
3014 		" -o       the original data is shown, if available.\n"
3015 		" <from>   first index of tracing entries to show.\n"
3016 		" <to>     if given, the last entry. If not, only <from> is shown.\n"
3017 		" <offset> the offset of the block data.\n"
3018 		" <from>   the size of the block data that is dumped\n", 0);
3019 #	endif
3020 #endif	// DEBUG_BLOCK_CACHE
3021 
3022 	return B_OK;
3023 }
3024 
3025 
3026 size_t
block_cache_used_memory(void)3027 block_cache_used_memory(void)
3028 {
3029 	MutexLocker _(sCachesMemoryUseLock);
3030 	return sUsedMemory;
3031 }
3032 
3033 
3034 //	#pragma mark - public transaction API
3035 
3036 
3037 int32
cache_start_transaction(void * _cache)3038 cache_start_transaction(void* _cache)
3039 {
3040 	block_cache* cache = (block_cache*)_cache;
3041 	TransactionLocker locker(cache);
3042 
3043 	if (cache->last_transaction && cache->last_transaction->open) {
3044 		panic("last transaction (%" B_PRId32 ") still open!\n",
3045 			cache->last_transaction->id);
3046 	}
3047 
3048 	cache_transaction* transaction = new(std::nothrow) cache_transaction;
3049 	if (transaction == NULL)
3050 		return B_NO_MEMORY;
3051 
3052 	transaction->id = atomic_add(&cache->next_transaction_id, 1);
3053 	cache->last_transaction = transaction;
3054 
3055 	TRACE(("cache_start_transaction(): id %" B_PRId32 " started\n", transaction->id));
3056 	T(Action("start", cache, transaction));
3057 
3058 	cache->transaction_hash->Insert(transaction);
3059 
3060 	return transaction->id;
3061 }
3062 
3063 
3064 status_t
cache_sync_transaction(void * _cache,int32 id)3065 cache_sync_transaction(void* _cache, int32 id)
3066 {
3067 	block_cache* cache = (block_cache*)_cache;
3068 	bool hadBusy;
3069 
3070 	TRACE(("cache_sync_transaction(id %" B_PRId32 ")\n", id));
3071 
3072 	do {
3073 		TransactionLocker locker(cache);
3074 		hadBusy = false;
3075 
3076 		BlockWriter writer(cache);
3077 		TransactionTable::Iterator iterator(cache->transaction_hash);
3078 
3079 		while (iterator.HasNext()) {
3080 			// close all earlier transactions which haven't been closed yet
3081 			cache_transaction* transaction = iterator.Next();
3082 
3083 			if (transaction->busy_writing_count != 0) {
3084 				hadBusy = true;
3085 				continue;
3086 			}
3087 			if (transaction->id <= id && !transaction->open) {
3088 				// write back all of their remaining dirty blocks
3089 				T(Action("sync", cache, transaction));
3090 
3091 				bool hasLeftOvers;
3092 				writer.Add(transaction, hasLeftOvers);
3093 
3094 				if (hasLeftOvers) {
3095 					// This transaction contains blocks that a previous
3096 					// transaction is trying to write back in this write run
3097 					hadBusy = true;
3098 				}
3099 			}
3100 		}
3101 
3102 		status_t status = writer.Write();
3103 		if (status != B_OK)
3104 			return status;
3105 	} while (hadBusy);
3106 
3107 	wait_for_notifications(cache);
3108 		// make sure that all pending TRANSACTION_WRITTEN notifications
3109 		// are handled after we return
3110 	return B_OK;
3111 }
3112 
3113 
3114 status_t
cache_end_transaction(void * _cache,int32 id,transaction_notification_hook hook,void * data)3115 cache_end_transaction(void* _cache, int32 id,
3116 	transaction_notification_hook hook, void* data)
3117 {
3118 	block_cache* cache = (block_cache*)_cache;
3119 	TransactionLocker locker(cache);
3120 
3121 	TRACE(("cache_end_transaction(id = %" B_PRId32 ")\n", id));
3122 
3123 	cache_transaction* transaction = lookup_transaction(cache, id);
3124 	if (transaction == NULL) {
3125 		panic("cache_end_transaction(): invalid transaction ID\n");
3126 		return B_BAD_VALUE;
3127 	}
3128 
3129 	// Write back all pending transaction blocks
3130 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
3131 	if (status != B_OK)
3132 		return status;
3133 
3134 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3135 
3136 	if (hook != NULL
3137 		&& add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN,
3138 			hook, data) != B_OK) {
3139 		return B_NO_MEMORY;
3140 	}
3141 
3142 	T(Action("end", cache, transaction));
3143 
3144 	// iterate through all blocks and free the unchanged original contents
3145 
3146 	cached_block* next;
3147 	for (cached_block* block = transaction->first_block; block != NULL;
3148 			block = next) {
3149 		next = block->transaction_next;
3150 		ASSERT(block->previous_transaction == NULL);
3151 
3152 		if (block->discard) {
3153 			// This block has been discarded in the transaction
3154 			cache->DiscardBlock(block);
3155 			transaction->num_blocks--;
3156 			continue;
3157 		}
3158 
3159 		if (block->original_data != NULL) {
3160 			cache->Free(block->original_data);
3161 			block->original_data = NULL;
3162 		}
3163 		if (block->parent_data != NULL) {
3164 			ASSERT(transaction->has_sub_transaction);
3165 			cache->FreeBlockParentData(block);
3166 		}
3167 
3168 		// move the block to the previous transaction list
3169 		transaction->blocks.Add(block);
3170 
3171 		block->previous_transaction = transaction;
3172 		block->transaction_next = NULL;
3173 		block->transaction = NULL;
3174 	}
3175 
3176 	transaction->open = false;
3177 	return B_OK;
3178 }
3179 
3180 
3181 status_t
cache_abort_transaction(void * _cache,int32 id)3182 cache_abort_transaction(void* _cache, int32 id)
3183 {
3184 	block_cache* cache = (block_cache*)_cache;
3185 	TransactionLocker locker(cache);
3186 
3187 	TRACE(("cache_abort_transaction(id = %" B_PRId32 ")\n", id));
3188 
3189 	cache_transaction* transaction = lookup_transaction(cache, id);
3190 	if (transaction == NULL) {
3191 		panic("cache_abort_transaction(): invalid transaction ID\n");
3192 		return B_BAD_VALUE;
3193 	}
3194 
3195 	T(Abort(cache, transaction));
3196 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3197 
3198 	// iterate through all blocks and restore their original contents
3199 
3200 	cached_block* block = transaction->first_block;
3201 	cached_block* next;
3202 	for (; block != NULL; block = next) {
3203 		next = block->transaction_next;
3204 
3205 		if (block->original_data != NULL) {
3206 			TRACE(("cache_abort_transaction(id = %" B_PRId32 "): restored contents of "
3207 				"block %" B_PRIdOFF "\n", transaction->id, block->block_number));
3208 			memcpy(block->current_data, block->original_data,
3209 				cache->block_size);
3210 			cache->Free(block->original_data);
3211 			block->original_data = NULL;
3212 		}
3213 		if (transaction->has_sub_transaction && block->parent_data != NULL)
3214 			cache->FreeBlockParentData(block);
3215 
3216 		block->transaction_next = NULL;
3217 		block->transaction = NULL;
3218 		block->discard = false;
3219 		if (block->previous_transaction == NULL)
3220 			block->is_dirty = false;
3221 	}
3222 
3223 	cache->transaction_hash->Remove(transaction);
3224 	delete_transaction(cache, transaction);
3225 	return B_OK;
3226 }
3227 
3228 
3229 /*!	Acknowledges the current parent transaction, and starts a new transaction
3230 	from its sub transaction.
3231 	The new transaction also gets a new transaction ID.
3232 */
3233 int32
cache_detach_sub_transaction(void * _cache,int32 id,transaction_notification_hook hook,void * data)3234 cache_detach_sub_transaction(void* _cache, int32 id,
3235 	transaction_notification_hook hook, void* data)
3236 {
3237 	block_cache* cache = (block_cache*)_cache;
3238 	TransactionLocker locker(cache);
3239 
3240 	TRACE(("cache_detach_sub_transaction(id = %" B_PRId32 ")\n", id));
3241 
3242 	cache_transaction* transaction = lookup_transaction(cache, id);
3243 	if (transaction == NULL) {
3244 		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
3245 		return B_BAD_VALUE;
3246 	}
3247 	if (!transaction->has_sub_transaction)
3248 		return B_BAD_VALUE;
3249 
3250 	// iterate through all blocks and free the unchanged original contents
3251 
3252 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
3253 	if (status != B_OK)
3254 		return status;
3255 
3256 	// create a new transaction for the sub transaction
3257 	cache_transaction* newTransaction = new(std::nothrow) cache_transaction;
3258 	if (newTransaction == NULL)
3259 		return B_NO_MEMORY;
3260 
3261 	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
3262 	T(Detach(cache, transaction, newTransaction));
3263 
3264 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3265 
3266 	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
3267 			data) != B_OK) {
3268 		delete newTransaction;
3269 		return B_NO_MEMORY;
3270 	}
3271 
3272 	cached_block* last = NULL;
3273 	cached_block* next;
3274 	for (cached_block* block = transaction->first_block; block != NULL;
3275 			block = next) {
3276 		next = block->transaction_next;
3277 		ASSERT(block->previous_transaction == NULL);
3278 
3279 		if (block->discard) {
3280 			cache->DiscardBlock(block);
3281 			transaction->main_num_blocks--;
3282 			continue;
3283 		}
3284 
3285 		if (block->parent_data != NULL) {
3286 			// The block changed in the parent - free the original data, since
3287 			// they will be replaced by what is in current.
3288 			ASSERT(block->original_data != NULL);
3289 			cache->Free(block->original_data);
3290 
3291 			if (block->parent_data != block->current_data) {
3292 				// The block had been changed in both transactions
3293 				block->original_data = block->parent_data;
3294 			} else {
3295 				// The block has only been changed in the parent
3296 				block->original_data = NULL;
3297 			}
3298 			block->parent_data = NULL;
3299 
3300 			// move the block to the previous transaction list
3301 			transaction->blocks.Add(block);
3302 			block->previous_transaction = transaction;
3303 		}
3304 
3305 		if (block->original_data != NULL) {
3306 			// This block had been changed in the current sub transaction,
3307 			// we need to move this block over to the new transaction.
3308 			ASSERT(block->parent_data == NULL);
3309 
3310 			if (last == NULL)
3311 				newTransaction->first_block = block;
3312 			else
3313 				last->transaction_next = block;
3314 
3315 			block->transaction = newTransaction;
3316 			last = block;
3317 		} else
3318 			block->transaction = NULL;
3319 
3320 		block->transaction_next = NULL;
3321 	}
3322 
3323 	newTransaction->num_blocks = transaction->sub_num_blocks;
3324 
3325 	transaction->open = false;
3326 	transaction->has_sub_transaction = false;
3327 	transaction->num_blocks = transaction->main_num_blocks;
3328 	transaction->sub_num_blocks = 0;
3329 
3330 	cache->transaction_hash->Insert(newTransaction);
3331 	cache->last_transaction = newTransaction;
3332 
3333 	return newTransaction->id;
3334 }
3335 
3336 
3337 status_t
cache_abort_sub_transaction(void * _cache,int32 id)3338 cache_abort_sub_transaction(void* _cache, int32 id)
3339 {
3340 	block_cache* cache = (block_cache*)_cache;
3341 	TransactionLocker locker(cache);
3342 
3343 	TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 ")\n", id));
3344 
3345 	cache_transaction* transaction = lookup_transaction(cache, id);
3346 	if (transaction == NULL) {
3347 		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
3348 		return B_BAD_VALUE;
3349 	}
3350 	if (!transaction->has_sub_transaction)
3351 		return B_BAD_VALUE;
3352 
3353 	T(Abort(cache, transaction));
3354 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3355 
3356 	// revert all changes back to the version of the parent
3357 
3358 	cached_block* block = transaction->first_block;
3359 	cached_block* last = NULL;
3360 	cached_block* next;
3361 	for (; block != NULL; block = next) {
3362 		next = block->transaction_next;
3363 
3364 		if (block->parent_data == NULL) {
3365 			// The parent transaction didn't change the block, but the sub
3366 			// transaction did - we need to revert to the original data.
3367 			// The block is no longer part of the transaction
3368 			if (block->original_data != NULL) {
3369 				// The block might not have original data if was empty
3370 				memcpy(block->current_data, block->original_data,
3371 					cache->block_size);
3372 			}
3373 
3374 			if (last != NULL)
3375 				last->transaction_next = next;
3376 			else
3377 				transaction->first_block = next;
3378 
3379 			block->transaction_next = NULL;
3380 			block->transaction = NULL;
3381 			transaction->num_blocks--;
3382 
3383 			if (block->previous_transaction == NULL) {
3384 				cache->Free(block->original_data);
3385 				block->original_data = NULL;
3386 				block->is_dirty = false;
3387 
3388 				if (block->ref_count == 0) {
3389 					// Move the block into the unused list if possible
3390 					block->unused = true;
3391 					cache->unused_blocks.Add(block);
3392 					cache->unused_block_count++;
3393 				}
3394 			}
3395 		} else {
3396 			if (block->parent_data != block->current_data) {
3397 				// The block has been changed and must be restored - the block
3398 				// is still dirty and part of the transaction
3399 				TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 "): "
3400 					"restored contents of block %" B_PRIdOFF "\n",
3401 					transaction->id, block->block_number));
3402 				memcpy(block->current_data, block->parent_data,
3403 					cache->block_size);
3404 				cache->Free(block->parent_data);
3405 				// The block stays dirty
3406 			}
3407 			block->parent_data = NULL;
3408 			last = block;
3409 		}
3410 
3411 		block->discard = false;
3412 	}
3413 
3414 	// all subsequent changes will go into the main transaction
3415 	transaction->has_sub_transaction = false;
3416 	transaction->sub_num_blocks = 0;
3417 
3418 	return B_OK;
3419 }
3420 
3421 
3422 status_t
cache_start_sub_transaction(void * _cache,int32 id)3423 cache_start_sub_transaction(void* _cache, int32 id)
3424 {
3425 	block_cache* cache = (block_cache*)_cache;
3426 	TransactionLocker locker(cache);
3427 
3428 	TRACE(("cache_start_sub_transaction(id = %" B_PRId32 ")\n", id));
3429 
3430 	cache_transaction* transaction = lookup_transaction(cache, id);
3431 	if (transaction == NULL) {
3432 		panic("cache_start_sub_transaction(): invalid transaction ID %" B_PRId32 "\n",
3433 			id);
3434 		return B_BAD_VALUE;
3435 	}
3436 
3437 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3438 
3439 	// move all changed blocks up to the parent
3440 
3441 	cached_block* block = transaction->first_block;
3442 	cached_block* next;
3443 	for (; block != NULL; block = next) {
3444 		next = block->transaction_next;
3445 
3446 		if (block->parent_data != NULL) {
3447 			// There already is an older sub transaction - we acknowledge
3448 			// its changes and move its blocks up to the parent
3449 			ASSERT(transaction->has_sub_transaction);
3450 			cache->FreeBlockParentData(block);
3451 		}
3452 		if (block->discard) {
3453 			// This block has been discarded in the parent transaction.
3454 			// Just throw away any changes made in this transaction, so that
3455 			// it can still be reverted to its original contents if needed
3456 			ASSERT(block->previous_transaction == NULL);
3457 			if (block->original_data != NULL) {
3458 				memcpy(block->current_data, block->original_data,
3459 					cache->block_size);
3460 
3461 				cache->Free(block->original_data);
3462 				block->original_data = NULL;
3463 			}
3464 			continue;
3465 		}
3466 
3467 		// we "allocate" the parent data lazily, that means, we don't copy
3468 		// the data (and allocate memory for it) until we need to
3469 		block->parent_data = block->current_data;
3470 	}
3471 
3472 	// all subsequent changes will go into the sub transaction
3473 	transaction->has_sub_transaction = true;
3474 	transaction->main_num_blocks = transaction->num_blocks;
3475 	transaction->sub_num_blocks = 0;
3476 	T(Action("start-sub", cache, transaction));
3477 
3478 	return B_OK;
3479 }
3480 
3481 
3482 /*!	Adds a transaction listener that gets notified when the transaction
3483 	is ended, aborted, written, or idle as specified by \a events.
3484 	The listener gets automatically removed when the transaction ends.
3485 */
3486 status_t
cache_add_transaction_listener(void * _cache,int32 id,int32 events,transaction_notification_hook hook,void * data)3487 cache_add_transaction_listener(void* _cache, int32 id, int32 events,
3488 	transaction_notification_hook hook, void* data)
3489 {
3490 	block_cache* cache = (block_cache*)_cache;
3491 	TransactionLocker locker(cache);
3492 
3493 	cache_transaction* transaction = lookup_transaction(cache, id);
3494 	if (transaction == NULL)
3495 		return B_BAD_VALUE;
3496 
3497 	return add_transaction_listener(cache, transaction, events, hook, data);
3498 }
3499 
3500 
3501 status_t
cache_remove_transaction_listener(void * _cache,int32 id,transaction_notification_hook hookFunction,void * data)3502 cache_remove_transaction_listener(void* _cache, int32 id,
3503 	transaction_notification_hook hookFunction, void* data)
3504 {
3505 	block_cache* cache = (block_cache*)_cache;
3506 	TransactionLocker locker(cache);
3507 
3508 	cache_transaction* transaction = lookup_transaction(cache, id);
3509 	if (transaction == NULL)
3510 		return B_BAD_VALUE;
3511 
3512 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
3513 	while (iterator.HasNext()) {
3514 		cache_listener* listener = iterator.Next();
3515 		if (listener->data == data && listener->hook == hookFunction) {
3516 			iterator.Remove();
3517 
3518 			if (listener->events_pending != 0) {
3519 				MutexLocker _(sNotificationsLock);
3520 				if (listener->events_pending != 0)
3521 					cache->pending_notifications.Remove(listener);
3522 			}
3523 			delete listener;
3524 			return B_OK;
3525 		}
3526 	}
3527 
3528 	return B_ENTRY_NOT_FOUND;
3529 }
3530 
3531 
3532 status_t
cache_next_block_in_transaction(void * _cache,int32 id,bool mainOnly,long * _cookie,off_t * _blockNumber,void ** _data,void ** _unchangedData)3533 cache_next_block_in_transaction(void* _cache, int32 id, bool mainOnly,
3534 	long* _cookie, off_t* _blockNumber, void** _data, void** _unchangedData)
3535 {
3536 	cached_block* block = (cached_block*)*_cookie;
3537 	block_cache* cache = (block_cache*)_cache;
3538 	TransactionLocker locker(cache);
3539 
3540 	cache_transaction* transaction = lookup_transaction(cache, id);
3541 	if (transaction == NULL || !transaction->open)
3542 		return B_BAD_VALUE;
3543 
3544 	if (block == NULL)
3545 		block = transaction->first_block;
3546 	else
3547 		block = block->transaction_next;
3548 
3549 	if (transaction->has_sub_transaction) {
3550 		if (mainOnly) {
3551 			// find next block that the parent changed
3552 			while (block != NULL && block->parent_data == NULL)
3553 				block = block->transaction_next;
3554 		} else {
3555 			// find next non-discarded block
3556 			while (block != NULL && block->discard)
3557 				block = block->transaction_next;
3558 		}
3559 	}
3560 
3561 	if (block == NULL)
3562 		return B_ENTRY_NOT_FOUND;
3563 
3564 	if (_blockNumber)
3565 		*_blockNumber = block->block_number;
3566 	if (_data)
3567 		*_data = mainOnly ? block->parent_data : block->current_data;
3568 	if (_unchangedData)
3569 		*_unchangedData = block->original_data;
3570 
3571 	*_cookie = (addr_t)block;
3572 	return B_OK;
3573 }
3574 
3575 
3576 int32
cache_blocks_in_transaction(void * _cache,int32 id)3577 cache_blocks_in_transaction(void* _cache, int32 id)
3578 {
3579 	block_cache* cache = (block_cache*)_cache;
3580 	TransactionLocker locker(cache);
3581 
3582 	cache_transaction* transaction = lookup_transaction(cache, id);
3583 	if (transaction == NULL)
3584 		return B_BAD_VALUE;
3585 
3586 	return transaction->num_blocks;
3587 }
3588 
3589 
3590 /*!	Returns the number of blocks that are part of the main transaction. If this
3591 	transaction does not have a sub transaction yet, this is the same value as
3592 	cache_blocks_in_transaction() would return.
3593 */
3594 int32
cache_blocks_in_main_transaction(void * _cache,int32 id)3595 cache_blocks_in_main_transaction(void* _cache, int32 id)
3596 {
3597 	block_cache* cache = (block_cache*)_cache;
3598 	TransactionLocker locker(cache);
3599 
3600 	cache_transaction* transaction = lookup_transaction(cache, id);
3601 	if (transaction == NULL)
3602 		return B_BAD_VALUE;
3603 
3604 	if (transaction->has_sub_transaction)
3605 		return transaction->main_num_blocks;
3606 
3607 	return transaction->num_blocks;
3608 }
3609 
3610 
3611 int32
cache_blocks_in_sub_transaction(void * _cache,int32 id)3612 cache_blocks_in_sub_transaction(void* _cache, int32 id)
3613 {
3614 	block_cache* cache = (block_cache*)_cache;
3615 	TransactionLocker locker(cache);
3616 
3617 	cache_transaction* transaction = lookup_transaction(cache, id);
3618 	if (transaction == NULL)
3619 		return B_BAD_VALUE;
3620 
3621 	return transaction->sub_num_blocks;
3622 }
3623 
3624 
3625 /*!	Check if block is in transaction
3626 */
3627 bool
cache_has_block_in_transaction(void * _cache,int32 id,off_t blockNumber)3628 cache_has_block_in_transaction(void* _cache, int32 id, off_t blockNumber)
3629 {
3630 	block_cache* cache = (block_cache*)_cache;
3631 	TransactionLocker locker(cache);
3632 
3633 	cached_block* block = cache->hash->Lookup(blockNumber);
3634 
3635 	return (block != NULL && block->transaction != NULL
3636 		&& block->transaction->id == id);
3637 }
3638 
3639 
3640 //	#pragma mark - public block cache API
3641 
3642 
3643 void
block_cache_delete(void * _cache,bool allowWrites)3644 block_cache_delete(void* _cache, bool allowWrites)
3645 {
3646 	block_cache* cache = (block_cache*)_cache;
3647 
3648 	if (allowWrites)
3649 		block_cache_sync(cache);
3650 
3651 	mutex_lock(&sCachesLock);
3652 	sCaches.Remove(cache);
3653 	mutex_unlock(&sCachesLock);
3654 
3655 	mutex_lock(&cache->lock);
3656 
3657 	// wait for all blocks to become unbusy
3658 	wait_for_busy_reading_blocks(cache);
3659 	wait_for_busy_writing_blocks(cache);
3660 
3661 	// free all blocks
3662 
3663 	cached_block* block = cache->hash->Clear(true);
3664 	while (block != NULL) {
3665 		cached_block* next = block->next;
3666 		cache->FreeBlock(block);
3667 		block = next;
3668 	}
3669 
3670 	// free all transactions (they will all be aborted)
3671 
3672 	cache_transaction* transaction = cache->transaction_hash->Clear(true);
3673 	while (transaction != NULL) {
3674 		cache_transaction* next = transaction->next;
3675 		delete transaction;
3676 		transaction = next;
3677 	}
3678 
3679 	delete cache;
3680 }
3681 
3682 
3683 void*
block_cache_create(int fd,off_t numBlocks,size_t blockSize,bool readOnly)3684 block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
3685 {
3686 	block_cache* cache = new(std::nothrow) block_cache(fd, numBlocks, blockSize,
3687 		readOnly);
3688 	if (cache == NULL)
3689 		return NULL;
3690 
3691 	if (cache->Init() != B_OK) {
3692 		delete cache;
3693 		return NULL;
3694 	}
3695 
3696 	MutexLocker _(sCachesLock);
3697 	sCaches.Add(cache);
3698 
3699 	return cache;
3700 }
3701 
3702 
3703 status_t
block_cache_sync(void * _cache)3704 block_cache_sync(void* _cache)
3705 {
3706 	block_cache* cache = (block_cache*)_cache;
3707 
3708 	// We will sync all dirty blocks to disk that have a completed
3709 	// transaction or no transaction only
3710 
3711 	MutexLocker locker(&cache->lock);
3712 
3713 	BlockWriter writer(cache);
3714 	BlockTable::Iterator iterator(cache->hash);
3715 
3716 	while (iterator.HasNext()) {
3717 		cached_block* block = iterator.Next();
3718 		if (block->CanBeWritten())
3719 			writer.Add(block);
3720 	}
3721 
3722 	status_t status = writer.Write();
3723 
3724 	locker.Unlock();
3725 
3726 	wait_for_notifications(cache);
3727 		// make sure that all pending TRANSACTION_WRITTEN notifications
3728 		// are handled after we return
3729 	return status;
3730 }
3731 
3732 
3733 status_t
block_cache_sync_etc(void * _cache,off_t blockNumber,size_t numBlocks)3734 block_cache_sync_etc(void* _cache, off_t blockNumber, size_t numBlocks)
3735 {
3736 	block_cache* cache = (block_cache*)_cache;
3737 
3738 	// We will sync all dirty blocks to disk that have a completed
3739 	// transaction or no transaction only
3740 
3741 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
3742 		panic("block_cache_sync_etc: invalid block number %" B_PRIdOFF
3743 			" (max %" B_PRIdOFF ")",
3744 			blockNumber, cache->max_blocks - 1);
3745 		return B_BAD_VALUE;
3746 	}
3747 
3748 	MutexLocker locker(&cache->lock);
3749 	BlockWriter writer(cache);
3750 
3751 	for (; numBlocks > 0; numBlocks--, blockNumber++) {
3752 		cached_block* block = cache->hash->Lookup(blockNumber);
3753 		if (block == NULL)
3754 			continue;
3755 
3756 		if (block->CanBeWritten())
3757 			writer.Add(block);
3758 	}
3759 
3760 	status_t status = writer.Write();
3761 
3762 	locker.Unlock();
3763 
3764 	wait_for_notifications(cache);
3765 		// make sure that all pending TRANSACTION_WRITTEN notifications
3766 		// are handled after we return
3767 	return status;
3768 }
3769 
3770 
3771 /*!	Discards a block from the current transaction or from the cache.
3772 	You have to call this function when you no longer use a block, ie. when it
3773 	might be reclaimed by the file cache in order to make sure they won't
3774 	interfere.
3775 */
3776 void
block_cache_discard(void * _cache,off_t blockNumber,size_t numBlocks)3777 block_cache_discard(void* _cache, off_t blockNumber, size_t numBlocks)
3778 {
3779 	// TODO: this could be a nice place to issue the ATA trim command
3780 	block_cache* cache = (block_cache*)_cache;
3781 	TransactionLocker locker(cache);
3782 
3783 	BlockWriter writer(cache);
3784 
3785 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3786 		cached_block* block = cache->hash->Lookup(blockNumber);
3787 		if (block != NULL && block->previous_transaction != NULL)
3788 			writer.Add(block);
3789 	}
3790 
3791 	writer.Write();
3792 		// TODO: this can fail, too!
3793 
3794 	blockNumber -= numBlocks;
3795 		// reset blockNumber to its original value
3796 
3797 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3798 		cached_block* block = cache->hash->Lookup(blockNumber);
3799 		if (block == NULL)
3800 			continue;
3801 
3802 		ASSERT(block->previous_transaction == NULL);
3803 
3804 		if (block->unused) {
3805 			cache->unused_blocks.Remove(block);
3806 			cache->unused_block_count--;
3807 			cache->RemoveBlock(block);
3808 		} else {
3809 			if (block->transaction != NULL && block->parent_data != NULL
3810 				&& block->parent_data != block->current_data) {
3811 				panic("Discarded block %" B_PRIdOFF " has already been changed in this "
3812 					"transaction!", blockNumber);
3813 			}
3814 
3815 			// mark it as discarded (in the current transaction only, if any)
3816 			block->discard = true;
3817 		}
3818 	}
3819 }
3820 
3821 
3822 status_t
block_cache_make_writable(void * _cache,off_t blockNumber,int32 transaction)3823 block_cache_make_writable(void* _cache, off_t blockNumber, int32 transaction)
3824 {
3825 	block_cache* cache = (block_cache*)_cache;
3826 	MutexLocker locker(&cache->lock);
3827 
3828 	if (cache->read_only) {
3829 		panic("tried to make block writable on a read-only cache!");
3830 		return B_ERROR;
3831 	}
3832 
3833 	// TODO: this can be done better!
3834 	void* block;
3835 	status_t status = get_writable_cached_block(cache, blockNumber,
3836 		transaction, false, &block);
3837 	if (status == B_OK) {
3838 		put_cached_block((block_cache*)_cache, blockNumber);
3839 		return B_OK;
3840 	}
3841 
3842 	return status;
3843 }
3844 
3845 
3846 status_t
block_cache_get_writable_etc(void * _cache,off_t blockNumber,int32 transaction,void ** _block)3847 block_cache_get_writable_etc(void* _cache, off_t blockNumber,
3848 	int32 transaction, void** _block)
3849 {
3850 	block_cache* cache = (block_cache*)_cache;
3851 	MutexLocker locker(&cache->lock);
3852 
3853 	TRACE(("block_cache_get_writable_etc(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3854 		blockNumber, transaction));
3855 	if (cache->read_only)
3856 		panic("tried to get writable block on a read-only cache!");
3857 
3858 	return get_writable_cached_block(cache, blockNumber,
3859 		transaction, false, _block);
3860 }
3861 
3862 
3863 void*
block_cache_get_writable(void * _cache,off_t blockNumber,int32 transaction)3864 block_cache_get_writable(void* _cache, off_t blockNumber, int32 transaction)
3865 {
3866 	void* block;
3867 	if (block_cache_get_writable_etc(_cache, blockNumber,
3868 			transaction, &block) == B_OK)
3869 		return block;
3870 
3871 	return NULL;
3872 }
3873 
3874 
3875 void*
block_cache_get_empty(void * _cache,off_t blockNumber,int32 transaction)3876 block_cache_get_empty(void* _cache, off_t blockNumber, int32 transaction)
3877 {
3878 	block_cache* cache = (block_cache*)_cache;
3879 	MutexLocker locker(&cache->lock);
3880 
3881 	TRACE(("block_cache_get_empty(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3882 		blockNumber, transaction));
3883 	if (cache->read_only)
3884 		panic("tried to get empty writable block on a read-only cache!");
3885 
3886 	void* block;
3887 	if (get_writable_cached_block((block_cache*)_cache, blockNumber,
3888 			transaction, true, &block) == B_OK)
3889 		return block;
3890 
3891 	return NULL;
3892 }
3893 
3894 
3895 status_t
block_cache_get_etc(void * _cache,off_t blockNumber,const void ** _block)3896 block_cache_get_etc(void* _cache, off_t blockNumber, const void** _block)
3897 {
3898 	block_cache* cache = (block_cache*)_cache;
3899 	MutexLocker locker(&cache->lock);
3900 	bool allocated;
3901 
3902 	cached_block* block;
3903 	status_t status = get_cached_block(cache, blockNumber, &allocated, true,
3904 		&block);
3905 	if (status != B_OK)
3906 		return status;
3907 
3908 #if BLOCK_CACHE_DEBUG_CHANGED
3909 	if (block->compare == NULL)
3910 		block->compare = cache->Allocate();
3911 	if (block->compare != NULL)
3912 		memcpy(block->compare, block->current_data, cache->block_size);
3913 #endif
3914 	TB(Get(cache, block));
3915 
3916 	*_block = block->current_data;
3917 	return B_OK;
3918 }
3919 
3920 
3921 const void*
block_cache_get(void * _cache,off_t blockNumber)3922 block_cache_get(void* _cache, off_t blockNumber)
3923 {
3924 	const void* block;
3925 	if (block_cache_get_etc(_cache, blockNumber, &block) == B_OK)
3926 		return block;
3927 
3928 	return NULL;
3929 }
3930 
3931 
3932 /*!	Changes the internal status of a writable block to \a dirty. This can be
3933 	helpful in case you realize you don't need to change that block anymore
3934 	for whatever reason.
3935 
3936 	Note, you must only use this function on blocks that were acquired
3937 	writable!
3938 */
3939 status_t
block_cache_set_dirty(void * _cache,off_t blockNumber,bool dirty,int32 transaction)3940 block_cache_set_dirty(void* _cache, off_t blockNumber, bool dirty,
3941 	int32 transaction)
3942 {
3943 	block_cache* cache = (block_cache*)_cache;
3944 	MutexLocker locker(&cache->lock);
3945 
3946 	cached_block* block = cache->hash->Lookup(blockNumber);
3947 	if (block == NULL)
3948 		return B_BAD_VALUE;
3949 	if (block->is_dirty == dirty) {
3950 		// there is nothing to do for us
3951 		return B_OK;
3952 	}
3953 
3954 	// TODO: not yet implemented
3955 	if (dirty)
3956 		panic("block_cache_set_dirty(): not yet implemented that way!\n");
3957 
3958 	return B_OK;
3959 }
3960 
3961 
3962 void
block_cache_put(void * _cache,off_t blockNumber)3963 block_cache_put(void* _cache, off_t blockNumber)
3964 {
3965 	block_cache* cache = (block_cache*)_cache;
3966 	MutexLocker locker(&cache->lock);
3967 
3968 	put_cached_block(cache, blockNumber);
3969 }
3970 
3971 
3972 /*! Allocates blocks and schedules them to be read from disk, but does not get references to the
3973 	blocks.
3974 	@param blockNumber The index of the first requested block.
3975 	@param _numBlocks As input, the number of blocks requested. As output, the number of
3976 	blocks actually scheduled.  Prefetching will stop short if the requested range includes a
3977 	block that is already cached.
3978 */
3979 status_t
block_cache_prefetch(void * _cache,off_t blockNumber,size_t * _numBlocks)3980 block_cache_prefetch(void* _cache, off_t blockNumber, size_t* _numBlocks)
3981 {
3982 #ifndef BUILDING_USERLAND_FS_SERVER
3983 	TRACE(("block_cache_prefetch: fetching %" B_PRIuSIZE " blocks starting with %" B_PRIdOFF "\n",
3984 		*_numBlocks, blockNumber));
3985 
3986 	block_cache* cache = reinterpret_cast<block_cache*>(_cache);
3987 	MutexLocker locker(&cache->lock);
3988 
3989 	size_t numBlocks = *_numBlocks;
3990 	*_numBlocks = 0;
3991 
3992 	BlockPrefetcher* blockPrefetcher = new BlockPrefetcher(cache, blockNumber, numBlocks);
3993 
3994 	status_t status = blockPrefetcher->Allocate();
3995 	if (status != B_OK || blockPrefetcher->NumAllocated() == 0) {
3996 		TRACE(("block_cache_prefetch returning early (%s): allocated %" B_PRIuSIZE "\n",
3997 			strerror(status), blockPrefetcher->NumAllocated()));
3998 		delete blockPrefetcher;
3999 		return status;
4000 	}
4001 
4002 	numBlocks = blockPrefetcher->NumAllocated();
4003 
4004 	status = blockPrefetcher->ReadAsync(locker);
4005 
4006 	if (status == B_OK)
4007 		*_numBlocks = numBlocks;
4008 
4009 	return status;
4010 #else // BUILDING_USERLAND_FS_SERVER
4011 	*_numBlocks = 0;
4012 	return B_UNSUPPORTED;
4013 #endif // !BUILDING_USERLAND_FS_SERVER
4014 }
4015