xref: /haiku/src/system/kernel/cache/block_cache.cpp (revision 90c3b9bf9fe633e4c6a59d42c11c3f523183c553)
1 /*
2  * Copyright 2004-2012, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include <block_cache.h>
8 
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <errno.h>
13 
14 #include <KernelExport.h>
15 #include <fs_cache.h>
16 
17 #include <condition_variable.h>
18 #include <lock.h>
19 #include <low_resource_manager.h>
20 #include <slab/Slab.h>
21 #include <tracing.h>
22 #include <util/kernel_cpp.h>
23 #include <util/DoublyLinkedList.h>
24 #include <util/AutoLock.h>
25 #include <vm/vm_page.h>
26 
27 #include "kernel_debug_config.h"
28 
29 
30 // TODO: this is a naive but growing implementation to test the API:
31 //	block reading/writing is not at all optimized for speed, it will
32 //	just read and write single blocks.
33 // TODO: the retrieval/copy of the original data could be delayed until the
34 //		new data must be written, ie. in low memory situations.
35 
36 //#define TRACE_BLOCK_CACHE
37 #ifdef TRACE_BLOCK_CACHE
38 #	define TRACE(x)	dprintf x
39 #else
40 #	define TRACE(x) ;
41 #endif
42 
43 // This macro is used for fatal situations that are acceptable in a running
44 // system, like out of memory situations - should only panic for debugging.
45 #define FATAL(x) panic x
46 
47 static const bigtime_t kTransactionIdleTime = 2000000LL;
48 	// a transaction is considered idle after 2 seconds of inactivity
49 
50 
51 struct cache_transaction;
52 struct cached_block;
53 struct block_cache;
54 typedef DoublyLinkedListLink<cached_block> block_link;
55 
56 struct cached_block {
57 	cached_block*	next;			// next in hash
58 	cached_block*	transaction_next;
59 	block_link		link;
60 	off_t			block_number;
61 	void*			current_data;
62 		// The data that is seen by everyone using the API; this one is always
63 		// present.
64 	void*			original_data;
65 		// When in a transaction, this contains the original data from before
66 		// the transaction.
67 	void*			parent_data;
68 		// This is a lazily alloced buffer that represents the contents of the
69 		// block in the parent transaction. It may point to current_data if the
70 		// contents have been changed only in the parent transaction, or, if the
71 		// block has been changed in the current sub transaction already, to a
72 		// new block containing the contents changed in the parent transaction.
73 		// If this is NULL, the block has not been changed in the parent
74 		// transaction at all.
75 #if BLOCK_CACHE_DEBUG_CHANGED
76 	void*			compare;
77 #endif
78 	int32			ref_count;
79 	int32			last_accessed;
80 	bool			busy_reading : 1;
81 	bool			busy_writing : 1;
82 	bool			is_writing : 1;
83 		// Block has been checked out for writing without transactions, and
84 		// cannot be written back if set
85 	bool			is_dirty : 1;
86 	bool			unused : 1;
87 	bool			discard : 1;
88 	bool			busy_reading_waiters : 1;
89 	bool			busy_writing_waiters : 1;
90 	cache_transaction* transaction;
91 		// This is the current active transaction, if any, the block is
92 		// currently in (meaning was changed as a part of it).
93 	cache_transaction* previous_transaction;
94 		// This is set to the last transaction that was ended containing this
95 		// block. In this case, the block has not yet written back yet, and
96 		// the changed data is either in current_data, or original_data -- the
97 		// latter if the block is already being part of another transaction.
98 		// There can only be one previous transaction, so when the active
99 		// transaction ends, the changes of the previous transaction have to
100 		// be written back before that transaction becomes the next previous
101 		// transaction.
102 
103 	bool CanBeWritten() const;
104 	int32 LastAccess() const
105 		{ return system_time() / 1000000L - last_accessed; }
106 };
107 
108 typedef DoublyLinkedList<cached_block,
109 	DoublyLinkedListMemberGetLink<cached_block,
110 		&cached_block::link> > block_list;
111 
112 struct cache_notification : DoublyLinkedListLinkImpl<cache_notification> {
113 	int32			transaction_id;
114 	int32			events_pending;
115 	int32			events;
116 	transaction_notification_hook hook;
117 	void*			data;
118 	bool			delete_after_event;
119 };
120 
121 typedef DoublyLinkedList<cache_notification> NotificationList;
122 
123 struct BlockHash {
124 	typedef off_t			KeyType;
125 	typedef	cached_block	ValueType;
126 
127 	size_t HashKey(KeyType key) const
128 	{
129 		return key;
130 	}
131 
132 	size_t Hash(ValueType* block) const
133 	{
134 		return block->block_number;
135 	}
136 
137 	bool Compare(KeyType key, ValueType* block) const
138 	{
139 		return block->block_number == key;
140 	}
141 
142 	ValueType*& GetLink(ValueType* value) const
143 	{
144 		return value->next;
145 	}
146 };
147 
148 typedef BOpenHashTable<BlockHash> BlockTable;
149 
150 
151 struct TransactionHash {
152 	typedef int32				KeyType;
153 	typedef	cache_transaction	ValueType;
154 
155 	size_t HashKey(KeyType key) const
156 	{
157 		return key;
158 	}
159 
160 	size_t Hash(ValueType* transaction) const;
161 	bool Compare(KeyType key, ValueType* transaction) const;
162 	ValueType*& GetLink(ValueType* value) const;
163 };
164 
165 typedef BOpenHashTable<TransactionHash> TransactionTable;
166 
167 
168 struct block_cache : DoublyLinkedListLinkImpl<block_cache> {
169 	BlockTable*		hash;
170 	mutex			lock;
171 	int				fd;
172 	off_t			max_blocks;
173 	size_t			block_size;
174 	int32			next_transaction_id;
175 	cache_transaction* last_transaction;
176 	TransactionTable* transaction_hash;
177 
178 	object_cache*	buffer_cache;
179 	block_list		unused_blocks;
180 	uint32			unused_block_count;
181 
182 	ConditionVariable busy_reading_condition;
183 	uint32			busy_reading_count;
184 	bool			busy_reading_waiters;
185 
186 	ConditionVariable busy_writing_condition;
187 	uint32			busy_writing_count;
188 	bool			busy_writing_waiters;
189 
190 	uint32			num_dirty_blocks;
191 	bool			read_only;
192 
193 	NotificationList pending_notifications;
194 	ConditionVariable condition_variable;
195 
196 					block_cache(int fd, off_t numBlocks, size_t blockSize,
197 						bool readOnly);
198 					~block_cache();
199 
200 	status_t		Init();
201 
202 	void			Free(void* buffer);
203 	void*			Allocate();
204 	void			FreeBlock(cached_block* block);
205 	cached_block*	NewBlock(off_t blockNumber);
206 	void			FreeBlockParentData(cached_block* block);
207 
208 	void			RemoveUnusedBlocks(int32 count, int32 minSecondsOld = 0);
209 	void			RemoveBlock(cached_block* block);
210 	void			DiscardBlock(cached_block* block);
211 
212 private:
213 	static void		_LowMemoryHandler(void* data, uint32 resources,
214 						int32 level);
215 	cached_block*	_GetUnusedBlock();
216 };
217 
218 struct cache_listener;
219 typedef DoublyLinkedListLink<cache_listener> listener_link;
220 
221 struct cache_listener : cache_notification {
222 	listener_link	link;
223 };
224 
225 typedef DoublyLinkedList<cache_listener,
226 	DoublyLinkedListMemberGetLink<cache_listener,
227 		&cache_listener::link> > ListenerList;
228 
229 
230 struct cache_transaction {
231 	cache_transaction();
232 
233 	cache_transaction* next;
234 	int32			id;
235 	int32			num_blocks;
236 	int32			main_num_blocks;
237 	int32			sub_num_blocks;
238 	cached_block*	first_block;
239 	block_list		blocks;
240 	ListenerList	listeners;
241 	bool			open;
242 	bool			has_sub_transaction;
243 	bigtime_t		last_used;
244 	int32			busy_writing_count;
245 };
246 
247 
248 class BlockWriter {
249 public:
250 								BlockWriter(block_cache* cache,
251 									size_t max = SIZE_MAX);
252 								~BlockWriter();
253 
254 			bool				Add(cached_block* block,
255 									cache_transaction* transaction = NULL);
256 			bool				Add(cache_transaction* transaction,
257 									bool& hasLeftOvers);
258 
259 			status_t			Write(cache_transaction* transaction = NULL,
260 									bool canUnlock = true);
261 
262 			bool				DeletedTransaction() const
263 									{ return fDeletedTransaction; }
264 
265 	static	status_t			WriteBlock(block_cache* cache,
266 									cached_block* block);
267 
268 private:
269 			void*				_Data(cached_block* block) const;
270 			status_t			_WriteBlock(cached_block* block);
271 			void				_BlockDone(cached_block* block,
272 									cache_transaction* transaction);
273 			void				_UnmarkWriting(cached_block* block);
274 
275 	static	int					_CompareBlocks(const void* _blockA,
276 									const void* _blockB);
277 
278 private:
279 	static	const size_t		kBufferSize = 64;
280 
281 			block_cache*		fCache;
282 			cached_block*		fBuffer[kBufferSize];
283 			cached_block**		fBlocks;
284 			size_t				fCount;
285 			size_t				fTotal;
286 			size_t				fCapacity;
287 			size_t				fMax;
288 			status_t			fStatus;
289 			bool				fDeletedTransaction;
290 };
291 
292 
293 class TransactionLocking {
294 public:
295 	inline bool Lock(block_cache* cache)
296 	{
297 		mutex_lock(&cache->lock);
298 
299 		while (cache->busy_writing_count != 0) {
300 			// wait for all blocks to be written
301 			ConditionVariableEntry entry;
302 			cache->busy_writing_condition.Add(&entry);
303 			cache->busy_writing_waiters = true;
304 
305 			mutex_unlock(&cache->lock);
306 
307 			entry.Wait();
308 
309 			mutex_lock(&cache->lock);
310 		}
311 
312 		return true;
313 	}
314 
315 	inline void Unlock(block_cache* cache)
316 	{
317 		mutex_unlock(&cache->lock);
318 	}
319 };
320 
321 typedef AutoLocker<block_cache, TransactionLocking> TransactionLocker;
322 
323 
324 #if BLOCK_CACHE_BLOCK_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
325 namespace BlockTracing {
326 
327 class Action : public AbstractTraceEntry {
328 public:
329 	Action(block_cache* cache, cached_block* block)
330 		:
331 		fCache(cache),
332 		fBlockNumber(block->block_number),
333 		fIsDirty(block->is_dirty),
334 		fHasOriginal(block->original_data != NULL),
335 		fHasParent(block->parent_data != NULL),
336 		fTransactionID(-1),
337 		fPreviousID(-1)
338 	{
339 		if (block->transaction != NULL)
340 			fTransactionID = block->transaction->id;
341 		if (block->previous_transaction != NULL)
342 			fPreviousID = block->previous_transaction->id;
343 	}
344 
345 	virtual void AddDump(TraceOutput& out)
346 	{
347 		out.Print("block cache %p, %s %" B_PRIu64 ", %c%c%c transaction %" B_PRId32
348 			" (previous id %" B_PRId32 ")\n", fCache, _Action(), fBlockNumber,
349 			fIsDirty ? 'd' : '-', fHasOriginal ? 'o' : '-',
350 			fHasParent ? 'p' : '-', fTransactionID, fPreviousID);
351 	}
352 
353 	virtual const char* _Action() const = 0;
354 
355 private:
356 	block_cache*		fCache;
357 	uint64				fBlockNumber;
358 	bool				fIsDirty;
359 	bool				fHasOriginal;
360 	bool				fHasParent;
361 	int32				fTransactionID;
362 	int32				fPreviousID;
363 };
364 
365 class Get : public Action {
366 public:
367 	Get(block_cache* cache, cached_block* block)
368 		:
369 		Action(cache, block)
370 	{
371 		Initialized();
372 	}
373 
374 	virtual const char* _Action() const { return "get"; }
375 };
376 
377 class Put : public Action {
378 public:
379 	Put(block_cache* cache, cached_block* block)
380 		:
381 		Action(cache, block)
382 	{
383 		Initialized();
384 	}
385 
386 	virtual const char* _Action() const { return "put"; }
387 };
388 
389 class Read : public Action {
390 public:
391 	Read(block_cache* cache, cached_block* block)
392 		:
393 		Action(cache, block)
394 	{
395 		Initialized();
396 	}
397 
398 	virtual const char* _Action() const { return "read"; }
399 };
400 
401 class Write : public Action {
402 public:
403 	Write(block_cache* cache, cached_block* block)
404 		:
405 		Action(cache, block)
406 	{
407 		Initialized();
408 	}
409 
410 	virtual const char* _Action() const { return "write"; }
411 };
412 
413 class Flush : public Action {
414 public:
415 	Flush(block_cache* cache, cached_block* block, bool getUnused = false)
416 		:
417 		Action(cache, block),
418 		fGetUnused(getUnused)
419 	{
420 		Initialized();
421 	}
422 
423 	virtual const char* _Action() const
424 		{ return fGetUnused ? "get-unused" : "flush"; }
425 
426 private:
427 	bool	fGetUnused;
428 };
429 
430 class Error : public AbstractTraceEntry {
431 public:
432 	Error(block_cache* cache, uint64 blockNumber, const char* message,
433 			status_t status = B_OK)
434 		:
435 		fCache(cache),
436 		fBlockNumber(blockNumber),
437 		fMessage(message),
438 		fStatus(status)
439 	{
440 		Initialized();
441 	}
442 
443 	virtual void AddDump(TraceOutput& out)
444 	{
445 		out.Print("block cache %p, error %" B_PRIu64 ", %s%s%s",
446 			fCache, fBlockNumber, fMessage, fStatus != B_OK ? ": " : "",
447 			fStatus != B_OK ? strerror(fStatus) : "");
448 	}
449 
450 private:
451 	block_cache*	fCache;
452 	uint64			fBlockNumber;
453 	const char*		fMessage;
454 	status_t		fStatus;
455 };
456 
457 #if BLOCK_CACHE_BLOCK_TRACING >= 2
458 class BlockData : public AbstractTraceEntry {
459 public:
460 	enum {
461 		kCurrent	= 0x01,
462 		kParent		= 0x02,
463 		kOriginal	= 0x04
464 	};
465 
466 	BlockData(block_cache* cache, cached_block* block, const char* message)
467 		:
468 		fCache(cache),
469 		fSize(cache->block_size),
470 		fBlockNumber(block->block_number),
471 		fMessage(message)
472 	{
473 		_Allocate(fCurrent, block->current_data);
474 		_Allocate(fParent, block->parent_data);
475 		_Allocate(fOriginal, block->original_data);
476 
477 #if KTRACE_PRINTF_STACK_TRACE
478 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
479 			false);
480 #endif
481 
482 		Initialized();
483 	}
484 
485 	virtual void AddDump(TraceOutput& out)
486 	{
487 		out.Print("block cache %p, block %" B_PRIu64 ", data %c%c%c: %s",
488 			fCache, fBlockNumber, fCurrent != NULL ? 'c' : '-',
489 			fParent != NULL ? 'p' : '-', fOriginal != NULL ? 'o' : '-',
490 			fMessage);
491 	}
492 
493 #if KTRACE_PRINTF_STACK_TRACE
494 	virtual void DumpStackTrace(TraceOutput& out)
495 	{
496 		out.PrintStackTrace(fStackTrace);
497 	}
498 #endif
499 
500 	void DumpBlocks(uint32 which, uint32 offset, uint32 size)
501 	{
502 		if ((which & kCurrent) != 0)
503 			DumpBlock(kCurrent, offset, size);
504 		if ((which & kParent) != 0)
505 			DumpBlock(kParent, offset, size);
506 		if ((which & kOriginal) != 0)
507 			DumpBlock(kOriginal, offset, size);
508 	}
509 
510 	void DumpBlock(uint32 which, uint32 offset, uint32 size)
511 	{
512 		if (offset > fSize) {
513 			kprintf("invalid offset (block size %" B_PRIu32 ")\n", fSize);
514 			return;
515 		}
516 		if (offset + size > fSize)
517 			size = fSize - offset;
518 
519 		const char* label;
520 		uint8* data;
521 
522 		if ((which & kCurrent) != 0) {
523 			label = "current";
524 			data = fCurrent;
525 		} else if ((which & kParent) != 0) {
526 			label = "parent";
527 			data = fParent;
528 		} else if ((which & kOriginal) != 0) {
529 			label = "original";
530 			data = fOriginal;
531 		} else
532 			return;
533 
534 		kprintf("%s: offset %" B_PRIu32 ", %" B_PRIu32 " bytes\n", label, offset, size);
535 
536 		static const uint32 kBlockSize = 16;
537 		data += offset;
538 
539 		for (uint32 i = 0; i < size;) {
540 			int start = i;
541 
542 			kprintf("  %04" B_PRIx32 " ", i);
543 			for (; i < start + kBlockSize; i++) {
544 				if (!(i % 4))
545 					kprintf(" ");
546 
547 				if (i >= size)
548 					kprintf("  ");
549 				else
550 					kprintf("%02x", data[i]);
551 			}
552 
553 			kprintf("\n");
554 		}
555 	}
556 
557 private:
558 	void _Allocate(uint8*& target, void* source)
559 	{
560 		if (source == NULL) {
561 			target = NULL;
562 			return;
563 		}
564 
565 		target = alloc_tracing_buffer_memcpy(source, fSize, false);
566 	}
567 
568 	block_cache*	fCache;
569 	uint32			fSize;
570 	uint64			fBlockNumber;
571 	const char*		fMessage;
572 	uint8*			fCurrent;
573 	uint8*			fParent;
574 	uint8*			fOriginal;
575 #if KTRACE_PRINTF_STACK_TRACE
576 	tracing_stack_trace* fStackTrace;
577 #endif
578 };
579 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
580 
581 }	// namespace BlockTracing
582 
583 #	define TB(x) new(std::nothrow) BlockTracing::x;
584 #else
585 #	define TB(x) ;
586 #endif
587 
588 #if BLOCK_CACHE_BLOCK_TRACING >= 2
589 #	define TB2(x) new(std::nothrow) BlockTracing::x;
590 #else
591 #	define TB2(x) ;
592 #endif
593 
594 
595 #if BLOCK_CACHE_TRANSACTION_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
596 namespace TransactionTracing {
597 
598 class Action : public AbstractTraceEntry {
599 public:
600 	Action(const char* label, block_cache* cache,
601 			cache_transaction* transaction)
602 		:
603 		fCache(cache),
604 		fTransaction(transaction),
605 		fID(transaction->id),
606 		fSub(transaction->has_sub_transaction),
607 		fNumBlocks(transaction->num_blocks),
608 		fSubNumBlocks(transaction->sub_num_blocks)
609 	{
610 		strlcpy(fLabel, label, sizeof(fLabel));
611 		Initialized();
612 	}
613 
614 	virtual void AddDump(TraceOutput& out)
615 	{
616 		out.Print("block cache %p, %s transaction %p (id %" B_PRId32 ")%s"
617 			", %" B_PRId32 "/%" B_PRId32 " blocks", fCache, fLabel, fTransaction,
618 			fID, fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
619 	}
620 
621 private:
622 	char				fLabel[12];
623 	block_cache*		fCache;
624 	cache_transaction*	fTransaction;
625 	int32				fID;
626 	bool				fSub;
627 	int32				fNumBlocks;
628 	int32				fSubNumBlocks;
629 };
630 
631 class Detach : public AbstractTraceEntry {
632 public:
633 	Detach(block_cache* cache, cache_transaction* transaction,
634 			cache_transaction* newTransaction)
635 		:
636 		fCache(cache),
637 		fTransaction(transaction),
638 		fID(transaction->id),
639 		fSub(transaction->has_sub_transaction),
640 		fNewTransaction(newTransaction),
641 		fNewID(newTransaction->id)
642 	{
643 		Initialized();
644 	}
645 
646 	virtual void AddDump(TraceOutput& out)
647 	{
648 		out.Print("block cache %p, detach transaction %p (id %" B_PRId32 ")"
649 			"from transaction %p (id %" B_PRId32 ")%s",
650 			fCache, fNewTransaction, fNewID, fTransaction, fID,
651 			fSub ? " sub" : "");
652 	}
653 
654 private:
655 	block_cache*		fCache;
656 	cache_transaction*	fTransaction;
657 	int32				fID;
658 	bool				fSub;
659 	cache_transaction*	fNewTransaction;
660 	int32				fNewID;
661 };
662 
663 class Abort : public AbstractTraceEntry {
664 public:
665 	Abort(block_cache* cache, cache_transaction* transaction)
666 		:
667 		fCache(cache),
668 		fTransaction(transaction),
669 		fID(transaction->id),
670 		fNumBlocks(0)
671 	{
672 		bool isSub = transaction->has_sub_transaction;
673 		fNumBlocks = isSub ? transaction->sub_num_blocks
674 			: transaction->num_blocks;
675 		fBlocks = (off_t*)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
676 		if (fBlocks != NULL) {
677 			cached_block* block = transaction->first_block;
678 			for (int32 i = 0; block != NULL && i < fNumBlocks;
679 					block = block->transaction_next) {
680 				fBlocks[i++] = block->block_number;
681 			}
682 		} else
683 			fNumBlocks = 0;
684 
685 #if KTRACE_PRINTF_STACK_TRACE
686 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
687 			false);
688 #endif
689 
690 		Initialized();
691 	}
692 
693 	virtual void AddDump(TraceOutput& out)
694 	{
695 		out.Print("block cache %p, abort transaction "
696 			"%p (id %" B_PRId32 "), blocks", fCache, fTransaction, fID);
697 		for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
698 			out.Print(" %" B_PRIdOFF, fBlocks[i]);
699 	}
700 
701 #if KTRACE_PRINTF_STACK_TRACE
702 	virtual void DumpStackTrace(TraceOutput& out)
703 	{
704 		out.PrintStackTrace(fStackTrace);
705 	}
706 #endif
707 
708 private:
709 	block_cache*		fCache;
710 	cache_transaction*	fTransaction;
711 	int32				fID;
712 	off_t*				fBlocks;
713 	int32				fNumBlocks;
714 #if KTRACE_PRINTF_STACK_TRACE
715 	tracing_stack_trace* fStackTrace;
716 #endif
717 };
718 
719 }	// namespace TransactionTracing
720 
721 #	define T(x) new(std::nothrow) TransactionTracing::x;
722 #else
723 #	define T(x) ;
724 #endif
725 
726 
727 static DoublyLinkedList<block_cache> sCaches;
728 static mutex sCachesLock = MUTEX_INITIALIZER("block caches");
729 static mutex sCachesMemoryUseLock
730 	= MUTEX_INITIALIZER("block caches memory use");
731 static size_t sUsedMemory;
732 static sem_id sEventSemaphore;
733 static mutex sNotificationsLock
734 	= MUTEX_INITIALIZER("block cache notifications");
735 static thread_id sNotifierWriterThread;
736 static DoublyLinkedListLink<block_cache> sMarkCache;
737 	// TODO: this only works if the link is the first entry of block_cache
738 static object_cache* sBlockCache;
739 
740 
741 //	#pragma mark - notifications/listener
742 
743 
744 /*!	Checks whether or not this is an event that closes a transaction. */
745 static inline bool
746 is_closing_event(int32 event)
747 {
748 	return (event & (TRANSACTION_ABORTED | TRANSACTION_ENDED)) != 0;
749 }
750 
751 
752 static inline bool
753 is_written_event(int32 event)
754 {
755 	return (event & TRANSACTION_WRITTEN) != 0;
756 }
757 
758 
759 /*!	From the specified \a notification, it will remove the lowest pending
760 	event, and return that one in \a _event.
761 	If there is no pending event anymore, it will return \c false.
762 */
763 static bool
764 get_next_pending_event(cache_notification* notification, int32* _event)
765 {
766 	for (int32 eventMask = 1; eventMask <= TRANSACTION_IDLE; eventMask <<= 1) {
767 		int32 pending = atomic_and(&notification->events_pending,
768 			~eventMask);
769 
770 		bool more = (pending & ~eventMask) != 0;
771 
772 		if ((pending & eventMask) != 0) {
773 			*_event = eventMask;
774 			return more;
775 		}
776 	}
777 
778 	return false;
779 }
780 
781 
782 static void
783 flush_pending_notifications(block_cache* cache)
784 {
785 	ASSERT_LOCKED_MUTEX(&sCachesLock);
786 
787 	while (true) {
788 		MutexLocker locker(sNotificationsLock);
789 
790 		cache_notification* notification = cache->pending_notifications.Head();
791 		if (notification == NULL)
792 			return;
793 
794 		bool deleteAfterEvent = false;
795 		int32 event = -1;
796 		if (!get_next_pending_event(notification, &event)) {
797 			// remove the notification if this was the last pending event
798 			cache->pending_notifications.Remove(notification);
799 			deleteAfterEvent = notification->delete_after_event;
800 		}
801 
802 		if (event >= 0) {
803 			// Notify listener, we need to copy the notification, as it might
804 			// be removed when we unlock the list.
805 			cache_notification copy = *notification;
806 			locker.Unlock();
807 
808 			copy.hook(copy.transaction_id, event, copy.data);
809 
810 			locker.Lock();
811 		}
812 
813 		if (deleteAfterEvent)
814 			delete notification;
815 	}
816 }
817 
818 
819 /*!	Flushes all pending notifications by calling the appropriate hook
820 	functions.
821 	Must not be called with a cache lock held.
822 */
823 static void
824 flush_pending_notifications()
825 {
826 	MutexLocker _(sCachesLock);
827 
828 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
829 	while (iterator.HasNext()) {
830 		block_cache* cache = iterator.Next();
831 
832 		flush_pending_notifications(cache);
833 	}
834 }
835 
836 
837 /*!	Initializes the \a notification as specified. */
838 static void
839 set_notification(cache_transaction* transaction,
840 	cache_notification &notification, int32 events,
841 	transaction_notification_hook hook, void* data)
842 {
843 	notification.transaction_id = transaction != NULL ? transaction->id : -1;
844 	notification.events_pending = 0;
845 	notification.events = events;
846 	notification.hook = hook;
847 	notification.data = data;
848 	notification.delete_after_event = false;
849 }
850 
851 
852 /*!	Makes sure the notification is deleted. It either deletes it directly,
853 	when possible, or marks it for deletion if the notification is pending.
854 */
855 static void
856 delete_notification(cache_notification* notification)
857 {
858 	MutexLocker locker(sNotificationsLock);
859 
860 	if (notification->events_pending != 0)
861 		notification->delete_after_event = true;
862 	else
863 		delete notification;
864 }
865 
866 
867 /*!	Adds the notification to the pending notifications list, or, if it's
868 	already part of it, updates its events_pending field.
869 	Also marks the notification to be deleted if \a deleteNotification
870 	is \c true.
871 	Triggers the notifier thread to run.
872 */
873 static void
874 add_notification(block_cache* cache, cache_notification* notification,
875 	int32 event, bool deleteNotification)
876 {
877 	if (notification->hook == NULL)
878 		return;
879 
880 	int32 pending = atomic_or(&notification->events_pending, event);
881 	if (pending == 0) {
882 		// not yet part of the notification list
883 		MutexLocker locker(sNotificationsLock);
884 		if (deleteNotification)
885 			notification->delete_after_event = true;
886 		cache->pending_notifications.Add(notification);
887 	} else if (deleteNotification) {
888 		// we might need to delete it ourselves if we're late
889 		delete_notification(notification);
890 	}
891 
892 	release_sem_etc(sEventSemaphore, 1, B_DO_NOT_RESCHEDULE);
893 		// We're probably still holding some locks that makes rescheduling
894 		// not a good idea at this point.
895 }
896 
897 
898 /*!	Notifies all interested listeners of this transaction about the \a event.
899 	If \a event is a closing event (ie. TRANSACTION_ENDED, and
900 	TRANSACTION_ABORTED), all listeners except those listening to
901 	TRANSACTION_WRITTEN will be removed.
902 */
903 static void
904 notify_transaction_listeners(block_cache* cache, cache_transaction* transaction,
905 	int32 event)
906 {
907 	T(Action("notify", cache, transaction));
908 
909 	bool isClosing = is_closing_event(event);
910 	bool isWritten = is_written_event(event);
911 
912 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
913 	while (iterator.HasNext()) {
914 		cache_listener* listener = iterator.Next();
915 
916 		bool remove = (isClosing && !is_written_event(listener->events))
917 			|| (isWritten && is_written_event(listener->events));
918 		if (remove)
919 			iterator.Remove();
920 
921 		if ((listener->events & event) != 0)
922 			add_notification(cache, listener, event, remove);
923 		else if (remove)
924 			delete_notification(listener);
925 	}
926 }
927 
928 
929 /*!	Removes and deletes all listeners that are still monitoring this
930 	transaction.
931 */
932 static void
933 remove_transaction_listeners(block_cache* cache, cache_transaction* transaction)
934 {
935 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
936 	while (iterator.HasNext()) {
937 		cache_listener* listener = iterator.Next();
938 		iterator.Remove();
939 
940 		delete_notification(listener);
941 	}
942 }
943 
944 
945 static status_t
946 add_transaction_listener(block_cache* cache, cache_transaction* transaction,
947 	int32 events, transaction_notification_hook hookFunction, void* data)
948 {
949 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
950 	while (iterator.HasNext()) {
951 		cache_listener* listener = iterator.Next();
952 
953 		if (listener->data == data && listener->hook == hookFunction) {
954 			// this listener already exists, just update it
955 			listener->events |= events;
956 			return B_OK;
957 		}
958 	}
959 
960 	cache_listener* listener = new(std::nothrow) cache_listener;
961 	if (listener == NULL)
962 		return B_NO_MEMORY;
963 
964 	set_notification(transaction, *listener, events, hookFunction, data);
965 	transaction->listeners.Add(listener);
966 	return B_OK;
967 }
968 
969 
970 //	#pragma mark - private transaction
971 
972 
973 cache_transaction::cache_transaction()
974 {
975 	num_blocks = 0;
976 	main_num_blocks = 0;
977 	sub_num_blocks = 0;
978 	first_block = NULL;
979 	open = true;
980 	has_sub_transaction = false;
981 	last_used = system_time();
982 	busy_writing_count = 0;
983 }
984 
985 
986 static void
987 delete_transaction(block_cache* cache, cache_transaction* transaction)
988 {
989 	if (cache->last_transaction == transaction)
990 		cache->last_transaction = NULL;
991 
992 	remove_transaction_listeners(cache, transaction);
993 	delete transaction;
994 }
995 
996 
997 static cache_transaction*
998 lookup_transaction(block_cache* cache, int32 id)
999 {
1000 	return cache->transaction_hash->Lookup(id);
1001 }
1002 
1003 
1004 size_t TransactionHash::Hash(cache_transaction* transaction) const
1005 {
1006 	return transaction->id;
1007 }
1008 
1009 
1010 bool TransactionHash::Compare(int32 key, cache_transaction* transaction) const
1011 {
1012 	return transaction->id == key;
1013 }
1014 
1015 
1016 cache_transaction*& TransactionHash::GetLink(cache_transaction* value) const
1017 {
1018 	return value->next;
1019 }
1020 
1021 
1022 /*!	Writes back any changes made to blocks in \a transaction that are still
1023 	part of a previous transacton.
1024 */
1025 static status_t
1026 write_blocks_in_previous_transaction(block_cache* cache,
1027 	cache_transaction* transaction)
1028 {
1029 	BlockWriter writer(cache);
1030 
1031 	cached_block* block = transaction->first_block;
1032 	for (; block != NULL; block = block->transaction_next) {
1033 		if (block->previous_transaction != NULL) {
1034 			// need to write back pending changes
1035 			writer.Add(block);
1036 		}
1037 	}
1038 
1039 	return writer.Write();
1040 }
1041 
1042 
1043 //	#pragma mark - cached_block
1044 
1045 
1046 bool
1047 cached_block::CanBeWritten() const
1048 {
1049 	return !busy_writing && !busy_reading
1050 		&& (previous_transaction != NULL
1051 			|| (transaction == NULL && is_dirty && !is_writing));
1052 }
1053 
1054 
1055 //	#pragma mark - BlockWriter
1056 
1057 
1058 BlockWriter::BlockWriter(block_cache* cache, size_t max)
1059 	:
1060 	fCache(cache),
1061 	fBlocks(fBuffer),
1062 	fCount(0),
1063 	fTotal(0),
1064 	fCapacity(kBufferSize),
1065 	fMax(max),
1066 	fStatus(B_OK),
1067 	fDeletedTransaction(false)
1068 {
1069 }
1070 
1071 
1072 BlockWriter::~BlockWriter()
1073 {
1074 	if (fBlocks != fBuffer)
1075 		free(fBlocks);
1076 }
1077 
1078 
1079 /*!	Adds the specified block to the to be written array. If no more blocks can
1080 	be added, false is returned, otherwise true.
1081 */
1082 bool
1083 BlockWriter::Add(cached_block* block, cache_transaction* transaction)
1084 {
1085 	ASSERT(block->CanBeWritten());
1086 
1087 	if (fTotal == fMax)
1088 		return false;
1089 
1090 	if (fCount >= fCapacity) {
1091 		// Enlarge array if necessary
1092 		cached_block** newBlocks;
1093 		size_t newCapacity = max_c(256, fCapacity * 2);
1094 		if (fBlocks == fBuffer)
1095 			newBlocks = (cached_block**)malloc(newCapacity * sizeof(void*));
1096 		else {
1097 			newBlocks = (cached_block**)realloc(fBlocks,
1098 				newCapacity * sizeof(void*));
1099 		}
1100 
1101 		if (newBlocks == NULL) {
1102 			// Allocating a larger array failed - we need to write back what
1103 			// we have synchronously now (this will also clear the array)
1104 			Write(transaction, false);
1105 		} else {
1106 			if (fBlocks == fBuffer)
1107 				memcpy(newBlocks, fBuffer, kBufferSize * sizeof(void*));
1108 
1109 			fBlocks = newBlocks;
1110 			fCapacity = newCapacity;
1111 		}
1112 	}
1113 
1114 	fBlocks[fCount++] = block;
1115 	fTotal++;
1116 	block->busy_writing = true;
1117 	fCache->busy_writing_count++;
1118 	if (block->previous_transaction != NULL)
1119 		block->previous_transaction->busy_writing_count++;
1120 
1121 	return true;
1122 }
1123 
1124 
1125 /*!	Adds all blocks of the specified transaction to the to be written array.
1126 	If no more blocks can be added, false is returned, otherwise true.
1127 */
1128 bool
1129 BlockWriter::Add(cache_transaction* transaction, bool& hasLeftOvers)
1130 {
1131 	ASSERT(!transaction->open);
1132 
1133 	if (transaction->busy_writing_count != 0) {
1134 		hasLeftOvers = true;
1135 		return true;
1136 	}
1137 
1138 	hasLeftOvers = false;
1139 
1140 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
1141 	while (cached_block* block = blockIterator.Next()) {
1142 		if (!block->CanBeWritten()) {
1143 			// This block was already part of a previous transaction within this
1144 			// writer
1145 			hasLeftOvers = true;
1146 			continue;
1147 		}
1148 		if (!Add(block, transaction))
1149 			return false;
1150 
1151 		if (DeletedTransaction())
1152 			break;
1153 	}
1154 
1155 	return true;
1156 }
1157 
1158 
1159 /*! Cache must be locked when calling this method, but it will be unlocked
1160 	while the blocks are written back.
1161 */
1162 status_t
1163 BlockWriter::Write(cache_transaction* transaction, bool canUnlock)
1164 {
1165 	if (fCount == 0)
1166 		return B_OK;
1167 
1168 	if (canUnlock)
1169 		mutex_unlock(&fCache->lock);
1170 
1171 	// Sort blocks in their on-disk order
1172 	// TODO: ideally, this should be handled by the I/O scheduler
1173 
1174 	qsort(fBlocks, fCount, sizeof(void*), &_CompareBlocks);
1175 	fDeletedTransaction = false;
1176 
1177 	for (uint32 i = 0; i < fCount; i++) {
1178 		status_t status = _WriteBlock(fBlocks[i]);
1179 		if (status != B_OK) {
1180 			// propagate to global error handling
1181 			if (fStatus == B_OK)
1182 				fStatus = status;
1183 
1184 			_UnmarkWriting(fBlocks[i]);
1185 			fBlocks[i] = NULL;
1186 				// This block will not be marked clean
1187 		}
1188 	}
1189 
1190 	if (canUnlock)
1191 		mutex_lock(&fCache->lock);
1192 
1193 	for (uint32 i = 0; i < fCount; i++)
1194 		_BlockDone(fBlocks[i], transaction);
1195 
1196 	fCount = 0;
1197 	return fStatus;
1198 }
1199 
1200 
1201 /*!	Writes the specified \a block back to disk. It will always only write back
1202 	the oldest change of the block if it is part of more than one transaction.
1203 	It will automatically send out TRANSACTION_WRITTEN notices, as well as
1204 	delete transactions when they are no longer used, and \a deleteTransaction
1205 	is \c true.
1206 */
1207 /*static*/ status_t
1208 BlockWriter::WriteBlock(block_cache* cache, cached_block* block)
1209 {
1210 	BlockWriter writer(cache);
1211 
1212 	writer.Add(block);
1213 	return writer.Write();
1214 }
1215 
1216 
1217 void*
1218 BlockWriter::_Data(cached_block* block) const
1219 {
1220 	return block->previous_transaction != NULL && block->original_data != NULL
1221 		? block->original_data : block->current_data;
1222 		// We first need to write back changes from previous transactions
1223 }
1224 
1225 
1226 status_t
1227 BlockWriter::_WriteBlock(cached_block* block)
1228 {
1229 	ASSERT(block->busy_writing);
1230 
1231 	TRACE(("BlockWriter::_WriteBlock(block %" B_PRIdOFF ")\n", block->block_number));
1232 	TB(Write(fCache, block));
1233 	TB2(BlockData(fCache, block, "before write"));
1234 
1235 	size_t blockSize = fCache->block_size;
1236 
1237 	ssize_t written = write_pos(fCache->fd,
1238 		block->block_number * blockSize, _Data(block), blockSize);
1239 
1240 	if (written != (ssize_t)blockSize) {
1241 		TB(Error(fCache, block->block_number, "write failed", written));
1242 		FATAL(("could not write back block %" B_PRIdOFF " (%s)\n", block->block_number,
1243 			strerror(errno)));
1244 		if (written < 0)
1245 			return errno;
1246 
1247 		return B_IO_ERROR;
1248 	}
1249 
1250 	return B_OK;
1251 }
1252 
1253 
1254 void
1255 BlockWriter::_BlockDone(cached_block* block,
1256 	cache_transaction* transaction)
1257 {
1258 	if (block == NULL) {
1259 		// An error occured when trying to write this block
1260 		return;
1261 	}
1262 
1263 	if (fCache->num_dirty_blocks > 0)
1264 		fCache->num_dirty_blocks--;
1265 
1266 	if (_Data(block) == block->current_data)
1267 		block->is_dirty = false;
1268 
1269 	_UnmarkWriting(block);
1270 
1271 	cache_transaction* previous = block->previous_transaction;
1272 	if (previous != NULL) {
1273 		previous->blocks.Remove(block);
1274 		block->previous_transaction = NULL;
1275 
1276 		if (block->original_data != NULL && block->transaction == NULL) {
1277 			// This block is not part of a transaction, so it does not need
1278 			// its original pointer anymore.
1279 			fCache->Free(block->original_data);
1280 			block->original_data = NULL;
1281 		}
1282 
1283 		// Has the previous transaction been finished with that write?
1284 		if (--previous->num_blocks == 0) {
1285 			TRACE(("cache transaction %" B_PRId32 " finished!\n", previous->id));
1286 			T(Action("written", fCache, previous));
1287 
1288 			notify_transaction_listeners(fCache, previous,
1289 				TRANSACTION_WRITTEN);
1290 
1291 			if (transaction != NULL) {
1292 				// This function is called while iterating transaction_hash. We
1293 				// use RemoveUnchecked so the iterator is still valid. A regular
1294 				// Remove can trigger a resize of the hash table which would
1295 				// result in the linked items in the table changing order.
1296 				fCache->transaction_hash->RemoveUnchecked(transaction);
1297 			} else
1298 				fCache->transaction_hash->Remove(previous);
1299 
1300 			delete_transaction(fCache, previous);
1301 			fDeletedTransaction = true;
1302 		}
1303 	}
1304 	if (block->transaction == NULL && block->ref_count == 0 && !block->unused) {
1305 		// the block is no longer used
1306 		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1307 		block->unused = true;
1308 		fCache->unused_blocks.Add(block);
1309 		fCache->unused_block_count++;
1310 	}
1311 
1312 	TB2(BlockData(fCache, block, "after write"));
1313 }
1314 
1315 
1316 void
1317 BlockWriter::_UnmarkWriting(cached_block* block)
1318 {
1319 	block->busy_writing = false;
1320 	if (block->previous_transaction != NULL)
1321 		block->previous_transaction->busy_writing_count--;
1322 	fCache->busy_writing_count--;
1323 
1324 	if ((fCache->busy_writing_waiters && fCache->busy_writing_count == 0)
1325 		|| block->busy_writing_waiters) {
1326 		fCache->busy_writing_waiters = false;
1327 		block->busy_writing_waiters = false;
1328 		fCache->busy_writing_condition.NotifyAll();
1329 	}
1330 }
1331 
1332 
1333 /*static*/ int
1334 BlockWriter::_CompareBlocks(const void* _blockA, const void* _blockB)
1335 {
1336 	cached_block* blockA = *(cached_block**)_blockA;
1337 	cached_block* blockB = *(cached_block**)_blockB;
1338 
1339 	off_t diff = blockA->block_number - blockB->block_number;
1340 	if (diff > 0)
1341 		return 1;
1342 
1343 	return diff < 0 ? -1 : 0;
1344 }
1345 
1346 
1347 //	#pragma mark - block_cache
1348 
1349 
1350 block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
1351 		bool readOnly)
1352 	:
1353 	hash(NULL),
1354 	fd(_fd),
1355 	max_blocks(numBlocks),
1356 	block_size(blockSize),
1357 	next_transaction_id(1),
1358 	last_transaction(NULL),
1359 	transaction_hash(NULL),
1360 	buffer_cache(NULL),
1361 	unused_block_count(0),
1362 	busy_reading_count(0),
1363 	busy_reading_waiters(false),
1364 	busy_writing_count(0),
1365 	busy_writing_waiters(0),
1366 	num_dirty_blocks(0),
1367 	read_only(readOnly)
1368 {
1369 }
1370 
1371 
1372 /*! Should be called with the cache's lock held. */
1373 block_cache::~block_cache()
1374 {
1375 	unregister_low_resource_handler(&_LowMemoryHandler, this);
1376 
1377 	delete transaction_hash;
1378 	delete hash;
1379 
1380 	delete_object_cache(buffer_cache);
1381 
1382 	mutex_destroy(&lock);
1383 }
1384 
1385 
1386 status_t
1387 block_cache::Init()
1388 {
1389 	busy_reading_condition.Init(this, "cache block busy_reading");
1390 	busy_writing_condition.Init(this, "cache block busy writing");
1391 	condition_variable.Init(this, "cache transaction sync");
1392 	mutex_init(&lock, "block cache");
1393 
1394 	buffer_cache = create_object_cache_etc("block cache buffers", block_size,
1395 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1396 	if (buffer_cache == NULL)
1397 		return B_NO_MEMORY;
1398 
1399 	hash = new BlockTable();
1400 	if (hash == NULL || hash->Init(1024) != B_OK)
1401 		return B_NO_MEMORY;
1402 
1403 	transaction_hash = new(std::nothrow) TransactionTable();
1404 	if (transaction_hash == NULL || transaction_hash->Init(16) != B_OK)
1405 		return B_NO_MEMORY;
1406 
1407 	return register_low_resource_handler(&_LowMemoryHandler, this,
1408 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1409 			| B_KERNEL_RESOURCE_ADDRESS_SPACE, 0);
1410 }
1411 
1412 
1413 void
1414 block_cache::Free(void* buffer)
1415 {
1416 	if (buffer != NULL)
1417 		object_cache_free(buffer_cache, buffer, 0);
1418 }
1419 
1420 
1421 void*
1422 block_cache::Allocate()
1423 {
1424 	void* block = object_cache_alloc(buffer_cache, 0);
1425 	if (block != NULL)
1426 		return block;
1427 
1428 	// recycle existing before allocating a new one
1429 	RemoveUnusedBlocks(100);
1430 
1431 	return object_cache_alloc(buffer_cache, 0);
1432 }
1433 
1434 
1435 void
1436 block_cache::FreeBlock(cached_block* block)
1437 {
1438 	Free(block->current_data);
1439 
1440 	if (block->original_data != NULL || block->parent_data != NULL) {
1441 		panic("block_cache::FreeBlock(): %" B_PRIdOFF ", original %p, parent %p\n",
1442 			block->block_number, block->original_data, block->parent_data);
1443 	}
1444 
1445 #if BLOCK_CACHE_DEBUG_CHANGED
1446 	Free(block->compare);
1447 #endif
1448 
1449 	object_cache_free(sBlockCache, block, 0);
1450 }
1451 
1452 
1453 /*! Allocates a new block for \a blockNumber, ready for use */
1454 cached_block*
1455 block_cache::NewBlock(off_t blockNumber)
1456 {
1457 	cached_block* block = NULL;
1458 
1459 	if (low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1460 			| B_KERNEL_RESOURCE_ADDRESS_SPACE) != B_NO_LOW_RESOURCE) {
1461 		// recycle existing instead of allocating a new one
1462 		block = _GetUnusedBlock();
1463 	}
1464 	if (block == NULL) {
1465 		block = (cached_block*)object_cache_alloc(sBlockCache, 0);
1466 		if (block != NULL) {
1467 			block->current_data = Allocate();
1468 			if (block->current_data == NULL) {
1469 				object_cache_free(sBlockCache, block, 0);
1470 				return NULL;
1471 			}
1472 		} else {
1473 			TB(Error(this, blockNumber, "allocation failed"));
1474 			dprintf("block allocation failed, unused list is %sempty.\n",
1475 				unused_blocks.IsEmpty() ? "" : "not ");
1476 
1477 			// allocation failed, try to reuse an unused block
1478 			block = _GetUnusedBlock();
1479 			if (block == NULL) {
1480 				TB(Error(this, blockNumber, "get unused failed"));
1481 				FATAL(("could not allocate block!\n"));
1482 				return NULL;
1483 			}
1484 		}
1485 	}
1486 
1487 	block->block_number = blockNumber;
1488 	block->ref_count = 0;
1489 	block->last_accessed = 0;
1490 	block->transaction_next = NULL;
1491 	block->transaction = block->previous_transaction = NULL;
1492 	block->original_data = NULL;
1493 	block->parent_data = NULL;
1494 	block->busy_reading = false;
1495 	block->busy_writing = false;
1496 	block->is_writing = false;
1497 	block->is_dirty = false;
1498 	block->unused = false;
1499 	block->discard = false;
1500 	block->busy_reading_waiters = false;
1501 	block->busy_writing_waiters = false;
1502 #if BLOCK_CACHE_DEBUG_CHANGED
1503 	block->compare = NULL;
1504 #endif
1505 
1506 	return block;
1507 }
1508 
1509 
1510 void
1511 block_cache::FreeBlockParentData(cached_block* block)
1512 {
1513 	ASSERT(block->parent_data != NULL);
1514 	if (block->parent_data != block->current_data)
1515 		Free(block->parent_data);
1516 	block->parent_data = NULL;
1517 }
1518 
1519 
1520 void
1521 block_cache::RemoveUnusedBlocks(int32 count, int32 minSecondsOld)
1522 {
1523 	TRACE(("block_cache: remove up to %" B_PRId32 " unused blocks\n", count));
1524 
1525 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1526 			cached_block* block = iterator.Next();) {
1527 		if (minSecondsOld >= block->LastAccess()) {
1528 			// The list is sorted by last access
1529 			break;
1530 		}
1531 		if (block->busy_reading || block->busy_writing)
1532 			continue;
1533 
1534 		TB(Flush(this, block));
1535 		TRACE(("  remove block %" B_PRIdOFF ", last accessed %" B_PRId32 "\n",
1536 			block->block_number, block->last_accessed));
1537 
1538 		// this can only happen if no transactions are used
1539 		if (block->is_dirty && !block->discard) {
1540 			if (block->busy_writing)
1541 				continue;
1542 
1543 			BlockWriter::WriteBlock(this, block);
1544 		}
1545 
1546 		// remove block from lists
1547 		iterator.Remove();
1548 		unused_block_count--;
1549 		RemoveBlock(block);
1550 
1551 		if (--count <= 0)
1552 			break;
1553 	}
1554 }
1555 
1556 
1557 void
1558 block_cache::RemoveBlock(cached_block* block)
1559 {
1560 	hash->Remove(block);
1561 	FreeBlock(block);
1562 }
1563 
1564 
1565 /*!	Discards the block from a transaction (this method must not be called
1566 	for blocks not part of a transaction).
1567 */
1568 void
1569 block_cache::DiscardBlock(cached_block* block)
1570 {
1571 	ASSERT(block->discard);
1572 	ASSERT(block->previous_transaction == NULL);
1573 
1574 	if (block->parent_data != NULL)
1575 		FreeBlockParentData(block);
1576 
1577 	if (block->original_data != NULL) {
1578 		Free(block->original_data);
1579 		block->original_data = NULL;
1580 	}
1581 
1582 	RemoveBlock(block);
1583 }
1584 
1585 
1586 void
1587 block_cache::_LowMemoryHandler(void* data, uint32 resources, int32 level)
1588 {
1589 	TRACE(("block_cache: low memory handler called with level %" B_PRId32 "\n", level));
1590 
1591 	// free some blocks according to the low memory state
1592 	// (if there is enough memory left, we don't free any)
1593 
1594 	block_cache* cache = (block_cache*)data;
1595 	int32 free = 0;
1596 	int32 secondsOld = 0;
1597 	switch (level) {
1598 		case B_NO_LOW_RESOURCE:
1599 			return;
1600 		case B_LOW_RESOURCE_NOTE:
1601 			free = cache->unused_block_count / 8;
1602 			secondsOld = 120;
1603 			break;
1604 		case B_LOW_RESOURCE_WARNING:
1605 			free = cache->unused_block_count / 4;
1606 			secondsOld = 10;
1607 			break;
1608 		case B_LOW_RESOURCE_CRITICAL:
1609 			free = cache->unused_block_count / 2;
1610 			secondsOld = 0;
1611 			break;
1612 	}
1613 
1614 	MutexLocker locker(&cache->lock);
1615 
1616 	if (!locker.IsLocked()) {
1617 		// If our block_cache were deleted, it could be that we had
1618 		// been called before that deletion went through, therefore,
1619 		// acquiring its lock might fail.
1620 		return;
1621 	}
1622 
1623 #ifdef TRACE_BLOCK_CACHE
1624 	uint32 oldUnused = cache->unused_block_count;
1625 #endif
1626 
1627 	cache->RemoveUnusedBlocks(free, secondsOld);
1628 
1629 	TRACE(("block_cache::_LowMemoryHandler(): %p: unused: %" B_PRIu32 " -> %" B_PRIu32 "\n",
1630 		cache, oldUnused, cache->unused_block_count));
1631 }
1632 
1633 
1634 cached_block*
1635 block_cache::_GetUnusedBlock()
1636 {
1637 	TRACE(("block_cache: get unused block\n"));
1638 
1639 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1640 			cached_block* block = iterator.Next();) {
1641 		TB(Flush(this, block, true));
1642 		// this can only happen if no transactions are used
1643 		if (block->is_dirty && !block->busy_writing && !block->discard)
1644 			BlockWriter::WriteBlock(this, block);
1645 
1646 		// remove block from lists
1647 		iterator.Remove();
1648 		unused_block_count--;
1649 		hash->Remove(block);
1650 
1651 		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1652 		block->unused = false;
1653 
1654 		// TODO: see if compare data is handled correctly here!
1655 #if BLOCK_CACHE_DEBUG_CHANGED
1656 		if (block->compare != NULL)
1657 			Free(block->compare);
1658 #endif
1659 		return block;
1660 	}
1661 
1662 	return NULL;
1663 }
1664 
1665 
1666 //	#pragma mark - private block functions
1667 
1668 
1669 /*!	Cache must be locked.
1670 */
1671 static void
1672 mark_block_busy_reading(block_cache* cache, cached_block* block)
1673 {
1674 	block->busy_reading = true;
1675 	cache->busy_reading_count++;
1676 }
1677 
1678 
1679 /*!	Cache must be locked.
1680 */
1681 static void
1682 mark_block_unbusy_reading(block_cache* cache, cached_block* block)
1683 {
1684 	block->busy_reading = false;
1685 	cache->busy_reading_count--;
1686 
1687 	if ((cache->busy_reading_waiters && cache->busy_reading_count == 0)
1688 		|| block->busy_reading_waiters) {
1689 		cache->busy_reading_waiters = false;
1690 		block->busy_reading_waiters = false;
1691 		cache->busy_reading_condition.NotifyAll();
1692 	}
1693 }
1694 
1695 
1696 /*!	Cache must be locked.
1697 */
1698 static void
1699 wait_for_busy_reading_block(block_cache* cache, cached_block* block)
1700 {
1701 	while (block->busy_reading) {
1702 		// wait for at least the specified block to be read in
1703 		ConditionVariableEntry entry;
1704 		cache->busy_reading_condition.Add(&entry);
1705 		block->busy_reading_waiters = true;
1706 
1707 		mutex_unlock(&cache->lock);
1708 
1709 		entry.Wait();
1710 
1711 		mutex_lock(&cache->lock);
1712 	}
1713 }
1714 
1715 
1716 /*!	Cache must be locked.
1717 */
1718 static void
1719 wait_for_busy_reading_blocks(block_cache* cache)
1720 {
1721 	while (cache->busy_reading_count != 0) {
1722 		// wait for all blocks to be read in
1723 		ConditionVariableEntry entry;
1724 		cache->busy_reading_condition.Add(&entry);
1725 		cache->busy_reading_waiters = true;
1726 
1727 		mutex_unlock(&cache->lock);
1728 
1729 		entry.Wait();
1730 
1731 		mutex_lock(&cache->lock);
1732 	}
1733 }
1734 
1735 
1736 /*!	Cache must be locked.
1737 */
1738 static void
1739 wait_for_busy_writing_block(block_cache* cache, cached_block* block)
1740 {
1741 	while (block->busy_writing) {
1742 		// wait for all blocks to be written back
1743 		ConditionVariableEntry entry;
1744 		cache->busy_writing_condition.Add(&entry);
1745 		block->busy_writing_waiters = true;
1746 
1747 		mutex_unlock(&cache->lock);
1748 
1749 		entry.Wait();
1750 
1751 		mutex_lock(&cache->lock);
1752 	}
1753 }
1754 
1755 
1756 /*!	Cache must be locked.
1757 */
1758 static void
1759 wait_for_busy_writing_blocks(block_cache* cache)
1760 {
1761 	while (cache->busy_writing_count != 0) {
1762 		// wait for all blocks to be written back
1763 		ConditionVariableEntry entry;
1764 		cache->busy_writing_condition.Add(&entry);
1765 		cache->busy_writing_waiters = true;
1766 
1767 		mutex_unlock(&cache->lock);
1768 
1769 		entry.Wait();
1770 
1771 		mutex_lock(&cache->lock);
1772 	}
1773 }
1774 
1775 
1776 /*!	Removes a reference from the specified \a block. If this was the last
1777 	reference, the block is moved into the unused list.
1778 	In low memory situations, it will also free some blocks from that list,
1779 	but not necessarily the \a block it just released.
1780 */
1781 static void
1782 put_cached_block(block_cache* cache, cached_block* block)
1783 {
1784 #if BLOCK_CACHE_DEBUG_CHANGED
1785 	if (!block->is_dirty && block->compare != NULL
1786 		&& memcmp(block->current_data, block->compare, cache->block_size)) {
1787 		dprintf("new block:\n");
1788 		dump_block((const char*)block->current_data, 256, "  ");
1789 		dprintf("unchanged block:\n");
1790 		dump_block((const char*)block->compare, 256, "  ");
1791 		BlockWriter::WriteBlock(cache, block);
1792 		panic("block_cache: supposed to be clean block was changed!\n");
1793 
1794 		cache->Free(block->compare);
1795 		block->compare = NULL;
1796 	}
1797 #endif
1798 	TB(Put(cache, block));
1799 
1800 	if (block->ref_count < 1) {
1801 		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
1802 		return;
1803 	}
1804 
1805 	if (--block->ref_count == 0
1806 		&& block->transaction == NULL && block->previous_transaction == NULL) {
1807 		// This block is not used anymore, and not part of any transaction
1808 		block->is_writing = false;
1809 
1810 		if (block->discard) {
1811 			cache->RemoveBlock(block);
1812 		} else {
1813 			// put this block in the list of unused blocks
1814 			ASSERT(!block->unused);
1815 			block->unused = true;
1816 
1817 			ASSERT(block->original_data == NULL && block->parent_data == NULL);
1818 			cache->unused_blocks.Add(block);
1819 			cache->unused_block_count++;
1820 		}
1821 	}
1822 }
1823 
1824 
1825 static void
1826 put_cached_block(block_cache* cache, off_t blockNumber)
1827 {
1828 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1829 		panic("put_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1830 			blockNumber, cache->max_blocks - 1);
1831 	}
1832 
1833 	cached_block* block = cache->hash->Lookup(blockNumber);
1834 	if (block != NULL)
1835 		put_cached_block(cache, block);
1836 	else {
1837 		TB(Error(cache, blockNumber, "put unknown"));
1838 	}
1839 }
1840 
1841 
1842 /*!	Retrieves the block \a blockNumber from the hash table, if it's already
1843 	there, or reads it from the disk.
1844 	You need to have the cache locked when calling this function.
1845 
1846 	\param _allocated tells you whether or not a new block has been allocated
1847 		to satisfy your request.
1848 	\param readBlock if \c false, the block will not be read in case it was
1849 		not already in the cache. The block you retrieve may contain random
1850 		data. If \c true, the cache will be temporarily unlocked while the
1851 		block is read in.
1852 */
1853 static cached_block*
1854 get_cached_block(block_cache* cache, off_t blockNumber, bool* _allocated,
1855 	bool readBlock = true)
1856 {
1857 	ASSERT_LOCKED_MUTEX(&cache->lock);
1858 
1859 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1860 		panic("get_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1861 			blockNumber, cache->max_blocks - 1);
1862 		return NULL;
1863 	}
1864 
1865 retry:
1866 	cached_block* block = cache->hash->Lookup(blockNumber);
1867 	*_allocated = false;
1868 
1869 	if (block == NULL) {
1870 		// put block into cache
1871 		block = cache->NewBlock(blockNumber);
1872 		if (block == NULL)
1873 			return NULL;
1874 
1875 		cache->hash->Insert(block);
1876 		*_allocated = true;
1877 	} else if (block->busy_reading) {
1878 		// The block is currently busy_reading - wait and try again later
1879 		wait_for_busy_reading_block(cache, block);
1880 		goto retry;
1881 	}
1882 
1883 	if (block->unused) {
1884 		//TRACE(("remove block %" B_PRIdOFF " from unused\n", blockNumber));
1885 		block->unused = false;
1886 		cache->unused_blocks.Remove(block);
1887 		cache->unused_block_count--;
1888 	}
1889 
1890 	if (*_allocated && readBlock) {
1891 		// read block into cache
1892 		int32 blockSize = cache->block_size;
1893 
1894 		mark_block_busy_reading(cache, block);
1895 		mutex_unlock(&cache->lock);
1896 
1897 		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
1898 			block->current_data, blockSize);
1899 
1900 		mutex_lock(&cache->lock);
1901 		if (bytesRead < blockSize) {
1902 			cache->RemoveBlock(block);
1903 			TB(Error(cache, blockNumber, "read failed", bytesRead));
1904 
1905 			FATAL(("could not read block %" B_PRIdOFF ": bytesRead: %zd, error: %s\n",
1906 				blockNumber, bytesRead, strerror(errno)));
1907 			return NULL;
1908 		}
1909 		TB(Read(cache, block));
1910 
1911 		mark_block_unbusy_reading(cache, block);
1912 	}
1913 
1914 	block->ref_count++;
1915 	block->last_accessed = system_time() / 1000000L;
1916 
1917 	return block;
1918 }
1919 
1920 
1921 /*!	Returns the writable block data for the requested blockNumber.
1922 	If \a cleared is true, the block is not read from disk; an empty block
1923 	is returned.
1924 
1925 	This is the only method to insert a block into a transaction. It makes
1926 	sure that the previous block contents are preserved in that case.
1927 */
1928 static void*
1929 get_writable_cached_block(block_cache* cache, off_t blockNumber, off_t base,
1930 	off_t length, int32 transactionID, bool cleared)
1931 {
1932 	TRACE(("get_writable_cached_block(blockNumber = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
1933 		blockNumber, transactionID));
1934 
1935 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1936 		panic("get_writable_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1937 			blockNumber, cache->max_blocks - 1);
1938 	}
1939 
1940 	bool allocated;
1941 	cached_block* block = get_cached_block(cache, blockNumber, &allocated,
1942 		!cleared);
1943 	if (block == NULL)
1944 		return NULL;
1945 
1946 	if (block->busy_writing)
1947 		wait_for_busy_writing_block(cache, block);
1948 
1949 	block->discard = false;
1950 
1951 	// if there is no transaction support, we just return the current block
1952 	if (transactionID == -1) {
1953 		if (cleared) {
1954 			mark_block_busy_reading(cache, block);
1955 			mutex_unlock(&cache->lock);
1956 
1957 			memset(block->current_data, 0, cache->block_size);
1958 
1959 			mutex_lock(&cache->lock);
1960 			mark_block_unbusy_reading(cache, block);
1961 		}
1962 
1963 		block->is_writing = true;
1964 
1965 		if (!block->is_dirty) {
1966 			cache->num_dirty_blocks++;
1967 			block->is_dirty = true;
1968 				// mark the block as dirty
1969 		}
1970 
1971 		TB(Get(cache, block));
1972 		return block->current_data;
1973 	}
1974 
1975 	cache_transaction* transaction = block->transaction;
1976 
1977 	if (transaction != NULL && transaction->id != transactionID) {
1978 		// TODO: we have to wait here until the other transaction is done.
1979 		//	Maybe we should even panic, since we can't prevent any deadlocks.
1980 		panic("get_writable_cached_block(): asked to get busy writable block "
1981 			"(transaction %" B_PRId32 ")\n", block->transaction->id);
1982 		put_cached_block(cache, block);
1983 		return NULL;
1984 	}
1985 	if (transaction == NULL && transactionID != -1) {
1986 		// get new transaction
1987 		transaction = lookup_transaction(cache, transactionID);
1988 		if (transaction == NULL) {
1989 			panic("get_writable_cached_block(): invalid transaction %" B_PRId32 "!\n",
1990 				transactionID);
1991 			put_cached_block(cache, block);
1992 			return NULL;
1993 		}
1994 		if (!transaction->open) {
1995 			panic("get_writable_cached_block(): transaction already done!\n");
1996 			put_cached_block(cache, block);
1997 			return NULL;
1998 		}
1999 
2000 		block->transaction = transaction;
2001 
2002 		// attach the block to the transaction block list
2003 		block->transaction_next = transaction->first_block;
2004 		transaction->first_block = block;
2005 		transaction->num_blocks++;
2006 	}
2007 	if (transaction != NULL)
2008 		transaction->last_used = system_time();
2009 
2010 	bool wasUnchanged = block->original_data == NULL
2011 		|| block->previous_transaction != NULL;
2012 
2013 	if (!(allocated && cleared) && block->original_data == NULL) {
2014 		// we already have data, so we need to preserve it
2015 		block->original_data = cache->Allocate();
2016 		if (block->original_data == NULL) {
2017 			TB(Error(cache, blockNumber, "allocate original failed"));
2018 			FATAL(("could not allocate original_data\n"));
2019 			put_cached_block(cache, block);
2020 			return NULL;
2021 		}
2022 
2023 		mark_block_busy_reading(cache, block);
2024 		mutex_unlock(&cache->lock);
2025 
2026 		memcpy(block->original_data, block->current_data, cache->block_size);
2027 
2028 		mutex_lock(&cache->lock);
2029 		mark_block_unbusy_reading(cache, block);
2030 	}
2031 	if (block->parent_data == block->current_data) {
2032 		// remember any previous contents for the parent transaction
2033 		block->parent_data = cache->Allocate();
2034 		if (block->parent_data == NULL) {
2035 			// TODO: maybe we should just continue the current transaction in
2036 			// this case...
2037 			TB(Error(cache, blockNumber, "allocate parent failed"));
2038 			FATAL(("could not allocate parent\n"));
2039 			put_cached_block(cache, block);
2040 			return NULL;
2041 		}
2042 
2043 		mark_block_busy_reading(cache, block);
2044 		mutex_unlock(&cache->lock);
2045 
2046 		memcpy(block->parent_data, block->current_data, cache->block_size);
2047 
2048 		mutex_lock(&cache->lock);
2049 		mark_block_unbusy_reading(cache, block);
2050 
2051 		transaction->sub_num_blocks++;
2052 	} else if (transaction != NULL && transaction->has_sub_transaction
2053 		&& block->parent_data == NULL && wasUnchanged)
2054 		transaction->sub_num_blocks++;
2055 
2056 	if (cleared) {
2057 		mark_block_busy_reading(cache, block);
2058 		mutex_unlock(&cache->lock);
2059 
2060 		memset(block->current_data, 0, cache->block_size);
2061 
2062 		mutex_lock(&cache->lock);
2063 		mark_block_unbusy_reading(cache, block);
2064 	}
2065 
2066 	block->is_dirty = true;
2067 	TB(Get(cache, block));
2068 	TB2(BlockData(cache, block, "get writable"));
2069 
2070 	return block->current_data;
2071 }
2072 
2073 
2074 #if DEBUG_BLOCK_CACHE
2075 
2076 
2077 static void
2078 dump_block(cached_block* block)
2079 {
2080 	kprintf("%08lx %9" B_PRIdOFF " %08lx %08lx %08lx %5" B_PRId32 " %6" B_PRId32
2081 		" %c%c%c%c%c%c %08lx %08lx\n",
2082 		(addr_t)block, block->block_number,
2083 		(addr_t)block->current_data, (addr_t)block->original_data,
2084 		(addr_t)block->parent_data, block->ref_count, block->LastAccess(),
2085 		block->busy_reading ? 'r' : '-', block->busy_writing ? 'w' : '-',
2086 		block->is_writing ? 'W' : '-', block->is_dirty ? 'D' : '-',
2087 		block->unused ? 'U' : '-', block->discard ? 'D' : '-',
2088 		(addr_t)block->transaction,
2089 		(addr_t)block->previous_transaction);
2090 }
2091 
2092 
2093 static void
2094 dump_block_long(cached_block* block)
2095 {
2096 	kprintf("BLOCK %p\n", block);
2097 	kprintf(" current data:  %p\n", block->current_data);
2098 	kprintf(" original data: %p\n", block->original_data);
2099 	kprintf(" parent data:   %p\n", block->parent_data);
2100 #if BLOCK_CACHE_DEBUG_CHANGED
2101 	kprintf(" compare data:  %p\n", block->compare);
2102 #endif
2103 	kprintf(" ref_count:     %" B_PRId32 "\n", block->ref_count);
2104 	kprintf(" accessed:      %" B_PRId32 "\n", block->LastAccess());
2105 	kprintf(" flags:        ");
2106 	if (block->busy_reading)
2107 		kprintf(" busy_reading");
2108 	if (block->busy_writing)
2109 		kprintf(" busy_writing");
2110 	if (block->is_writing)
2111 		kprintf(" is-writing");
2112 	if (block->is_dirty)
2113 		kprintf(" is-dirty");
2114 	if (block->unused)
2115 		kprintf(" unused");
2116 	if (block->discard)
2117 		kprintf(" discard");
2118 	kprintf("\n");
2119 	if (block->transaction != NULL) {
2120 		kprintf(" transaction:   %p (%" B_PRId32 ")\n", block->transaction,
2121 			block->transaction->id);
2122 		if (block->transaction_next != NULL) {
2123 			kprintf(" next in transaction: %" B_PRIdOFF "\n",
2124 				block->transaction_next->block_number);
2125 		}
2126 	}
2127 	if (block->previous_transaction != NULL) {
2128 		kprintf(" previous transaction: %p (%" B_PRId32 ")\n",
2129 			block->previous_transaction,
2130 			block->previous_transaction->id);
2131 	}
2132 
2133 	set_debug_variable("_current", (addr_t)block->current_data);
2134 	set_debug_variable("_original", (addr_t)block->original_data);
2135 	set_debug_variable("_parent", (addr_t)block->parent_data);
2136 }
2137 
2138 
2139 static int
2140 dump_cached_block(int argc, char** argv)
2141 {
2142 	if (argc != 2) {
2143 		kprintf("usage: %s <block-address>\n", argv[0]);
2144 		return 0;
2145 	}
2146 
2147 	dump_block_long((struct cached_block*)(addr_t)parse_expression(argv[1]));
2148 	return 0;
2149 }
2150 
2151 
2152 static int
2153 dump_cache(int argc, char** argv)
2154 {
2155 	bool showTransactions = false;
2156 	bool showBlocks = false;
2157 	int32 i = 1;
2158 	while (argv[i] != NULL && argv[i][0] == '-') {
2159 		for (char* arg = &argv[i][1]; arg[0]; arg++) {
2160 			switch (arg[0]) {
2161 				case 'b':
2162 					showBlocks = true;
2163 					break;
2164 				case 't':
2165 					showTransactions = true;
2166 					break;
2167 				default:
2168 					print_debugger_command_usage(argv[0]);
2169 					return 0;
2170 			}
2171 		}
2172 		i++;
2173 	}
2174 
2175 	if (i >= argc) {
2176 		print_debugger_command_usage(argv[0]);
2177 		return 0;
2178 	}
2179 
2180 	block_cache* cache = (struct block_cache*)(addr_t)parse_expression(argv[i]);
2181 	if (cache == NULL) {
2182 		kprintf("invalid cache address\n");
2183 		return 0;
2184 	}
2185 
2186 	off_t blockNumber = -1;
2187 	if (i + 1 < argc) {
2188 		blockNumber = parse_expression(argv[i + 1]);
2189 		cached_block* block = cache->hash->Lookup(blockNumber);
2190 		if (block != NULL)
2191 			dump_block_long(block);
2192 		else
2193 			kprintf("block %" B_PRIdOFF " not found\n", blockNumber);
2194 		return 0;
2195 	}
2196 
2197 	kprintf("BLOCK CACHE: %p\n", cache);
2198 
2199 	kprintf(" fd:           %d\n", cache->fd);
2200 	kprintf(" max_blocks:   %" B_PRIdOFF "\n", cache->max_blocks);
2201 	kprintf(" block_size:   %zu\n", cache->block_size);
2202 	kprintf(" next_transaction_id: %" B_PRId32 "\n", cache->next_transaction_id);
2203 	kprintf(" buffer_cache: %p\n", cache->buffer_cache);
2204 	kprintf(" busy_reading: %" B_PRIu32 ", %s waiters\n", cache->busy_reading_count,
2205 		cache->busy_reading_waiters ? "has" : "no");
2206 	kprintf(" busy_writing: %" B_PRIu32 ", %s waiters\n", cache->busy_writing_count,
2207 		cache->busy_writing_waiters ? "has" : "no");
2208 
2209 	if (!cache->pending_notifications.IsEmpty()) {
2210 		kprintf(" pending notifications:\n");
2211 
2212 		NotificationList::Iterator iterator
2213 			= cache->pending_notifications.GetIterator();
2214 		while (iterator.HasNext()) {
2215 			cache_notification* notification = iterator.Next();
2216 
2217 			kprintf("  %p %5" B_PRIx32 " %p - %p\n", notification,
2218 				notification->events_pending, notification->hook,
2219 				notification->data);
2220 		}
2221 	}
2222 
2223 	if (showTransactions) {
2224 		kprintf(" transactions:\n");
2225 		kprintf("address       id state  blocks  main   sub\n");
2226 
2227 		TransactionTable::Iterator iterator(cache->transaction_hash);
2228 
2229 		while (iterator.HasNext()) {
2230 			cache_transaction* transaction = iterator.Next();
2231 			kprintf("%p %5" B_PRId32 " %-7s %5" B_PRId32 " %5" B_PRId32 " %5"
2232 				B_PRId32 "\n", transaction, transaction->id,
2233 				transaction->open ? "open" : "closed",
2234 				transaction->num_blocks, transaction->main_num_blocks,
2235 				transaction->sub_num_blocks);
2236 		}
2237 	}
2238 
2239 	if (showBlocks) {
2240 		kprintf(" blocks:\n");
2241 		kprintf("address  block no. current  original parent    refs access "
2242 			"flags transact prev. trans\n");
2243 	}
2244 
2245 	uint32 referenced = 0;
2246 	uint32 count = 0;
2247 	uint32 dirty = 0;
2248 	uint32 discarded = 0;
2249 	BlockTable::Iterator iterator(cache->hash);
2250 	while (iterator.HasNext()) {
2251 		cached_block* block = iterator.Next();
2252 		if (showBlocks)
2253 			dump_block(block);
2254 
2255 		if (block->is_dirty)
2256 			dirty++;
2257 		if (block->discard)
2258 			discarded++;
2259 		if (block->ref_count)
2260 			referenced++;
2261 		count++;
2262 	}
2263 
2264 	kprintf(" %" B_PRIu32 " blocks total, %" B_PRIu32 " dirty, %" B_PRIu32
2265 		" discarded, %" B_PRIu32 " referenced, %" B_PRIu32 " busy, %" B_PRIu32
2266 		" in unused.\n",
2267 		count, dirty, discarded, referenced, cache->busy_reading_count,
2268 		cache->unused_block_count);
2269 	return 0;
2270 }
2271 
2272 
2273 static int
2274 dump_transaction(int argc, char** argv)
2275 {
2276 	bool showBlocks = false;
2277 	int i = 1;
2278 	if (argc > 1 && !strcmp(argv[1], "-b")) {
2279 		showBlocks = true;
2280 		i++;
2281 	}
2282 
2283 	if (argc - i < 1 || argc - i > 2) {
2284 		print_debugger_command_usage(argv[0]);
2285 		return 0;
2286 	}
2287 
2288 	cache_transaction* transaction = NULL;
2289 
2290 	if (argc - i == 1) {
2291 		transaction = (cache_transaction*)(addr_t)parse_expression(argv[i]);
2292 	} else {
2293 		block_cache* cache = (block_cache*)(addr_t)parse_expression(argv[i]);
2294 		int32 id = parse_expression(argv[i + 1]);
2295 		transaction = lookup_transaction(cache, id);
2296 		if (transaction == NULL) {
2297 			kprintf("No transaction with ID %" B_PRId32 " found.\n", id);
2298 			return 0;
2299 		}
2300 	}
2301 
2302 	kprintf("TRANSACTION %p\n", transaction);
2303 
2304 	kprintf(" id:             %" B_PRId32 "\n", transaction->id);
2305 	kprintf(" num block:      %" B_PRId32 "\n", transaction->num_blocks);
2306 	kprintf(" main num block: %" B_PRId32 "\n", transaction->main_num_blocks);
2307 	kprintf(" sub num block:  %" B_PRId32 "\n", transaction->sub_num_blocks);
2308 	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
2309 	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
2310 	kprintf(" idle:           %" B_PRId64 " secs\n",
2311 		(system_time() - transaction->last_used) / 1000000);
2312 
2313 	kprintf(" listeners:\n");
2314 
2315 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
2316 	while (iterator.HasNext()) {
2317 		cache_listener* listener = iterator.Next();
2318 
2319 		kprintf("  %p %5" B_PRIx32 " %p - %p\n", listener, listener->events_pending,
2320 			listener->hook, listener->data);
2321 	}
2322 
2323 	if (!showBlocks)
2324 		return 0;
2325 
2326 	kprintf(" blocks:\n");
2327 	kprintf("address  block no. current  original parent    refs access "
2328 		"flags transact prev. trans\n");
2329 
2330 	cached_block* block = transaction->first_block;
2331 	while (block != NULL) {
2332 		dump_block(block);
2333 		block = block->transaction_next;
2334 	}
2335 
2336 	kprintf("--\n");
2337 
2338 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
2339 	while (blockIterator.HasNext()) {
2340 		block = blockIterator.Next();
2341 		dump_block(block);
2342 	}
2343 
2344 	return 0;
2345 }
2346 
2347 
2348 static int
2349 dump_caches(int argc, char** argv)
2350 {
2351 	kprintf("Block caches:\n");
2352 	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
2353 	while (i.HasNext()) {
2354 		block_cache* cache = i.Next();
2355 		if (cache == (block_cache*)&sMarkCache)
2356 			continue;
2357 
2358 		kprintf("  %p\n", cache);
2359 	}
2360 
2361 	return 0;
2362 }
2363 
2364 
2365 #if BLOCK_CACHE_BLOCK_TRACING >= 2
2366 static int
2367 dump_block_data(int argc, char** argv)
2368 {
2369 	using namespace BlockTracing;
2370 
2371 	// Determine which blocks to show
2372 
2373 	bool printStackTrace = true;
2374 	uint32 which = 0;
2375 	int32 i = 1;
2376 	while (i < argc && argv[i][0] == '-') {
2377 		char* arg = &argv[i][1];
2378 		while (arg[0]) {
2379 			switch (arg[0]) {
2380 				case 'c':
2381 					which |= BlockData::kCurrent;
2382 					break;
2383 				case 'p':
2384 					which |= BlockData::kParent;
2385 					break;
2386 				case 'o':
2387 					which |= BlockData::kOriginal;
2388 					break;
2389 
2390 				default:
2391 					kprintf("invalid block specifier (only o/c/p are "
2392 						"allowed).\n");
2393 					return 0;
2394 			}
2395 			arg++;
2396 		}
2397 
2398 		i++;
2399 	}
2400 	if (which == 0)
2401 		which = BlockData::kCurrent | BlockData::kParent | BlockData::kOriginal;
2402 
2403 	if (i == argc) {
2404 		print_debugger_command_usage(argv[0]);
2405 		return 0;
2406 	}
2407 
2408 	// Get the range of blocks to print
2409 
2410 	int64 from = parse_expression(argv[i]);
2411 	int64 to = from;
2412 	if (argc > i + 1)
2413 		to = parse_expression(argv[i + 1]);
2414 	if (to < from)
2415 		to = from;
2416 
2417 	uint32 offset = 0;
2418 	uint32 size = LONG_MAX;
2419 	if (argc > i + 2)
2420 		offset = parse_expression(argv[i + 2]);
2421 	if (argc > i + 3)
2422 		size = parse_expression(argv[i + 3]);
2423 
2424 	TraceEntryIterator iterator;
2425 	iterator.MoveTo(from - 1);
2426 
2427 	static char sBuffer[1024];
2428 	LazyTraceOutput out(sBuffer, sizeof(sBuffer), TRACE_OUTPUT_TEAM_ID);
2429 
2430 	while (TraceEntry* entry = iterator.Next()) {
2431 		int32 index = iterator.Index();
2432 		if (index > to)
2433 			break;
2434 
2435 		Action* action = dynamic_cast<Action*>(entry);
2436 		if (action != NULL) {
2437 			out.Clear();
2438 			out.DumpEntry(action);
2439 			continue;
2440 		}
2441 
2442 		BlockData* blockData = dynamic_cast<BlockData*>(entry);
2443 		if (blockData == NULL)
2444 			continue;
2445 
2446 		out.Clear();
2447 
2448 		const char* dump = out.DumpEntry(entry);
2449 		int length = strlen(dump);
2450 		if (length > 0 && dump[length - 1] == '\n')
2451 			length--;
2452 
2453 		kprintf("%5" B_PRId32 ". %.*s\n", index, length, dump);
2454 
2455 		if (printStackTrace) {
2456 			out.Clear();
2457 			entry->DumpStackTrace(out);
2458 			if (out.Size() > 0)
2459 				kputs(out.Buffer());
2460 		}
2461 
2462 		blockData->DumpBlocks(which, offset, size);
2463 	}
2464 
2465 	return 0;
2466 }
2467 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
2468 
2469 
2470 #endif	// DEBUG_BLOCK_CACHE
2471 
2472 
2473 /*!	Traverses through the block_cache list, and returns one cache after the
2474 	other. The cache returned is automatically locked when you get it, and
2475 	unlocked with the next call to this function. Ignores caches that are in
2476 	deletion state.
2477 	Returns \c NULL when the end of the list is reached.
2478 */
2479 static block_cache*
2480 get_next_locked_block_cache(block_cache* last)
2481 {
2482 	MutexLocker _(sCachesLock);
2483 
2484 	block_cache* cache;
2485 	if (last != NULL) {
2486 		mutex_unlock(&last->lock);
2487 
2488 		cache = sCaches.GetNext((block_cache*)&sMarkCache);
2489 		sCaches.Remove((block_cache*)&sMarkCache);
2490 	} else
2491 		cache = sCaches.Head();
2492 
2493 	if (cache != NULL) {
2494 		mutex_lock(&cache->lock);
2495 		sCaches.Insert(sCaches.GetNext(cache), (block_cache*)&sMarkCache);
2496 	}
2497 
2498 	return cache;
2499 }
2500 
2501 
2502 /*!	Background thread that continuously checks for pending notifications of
2503 	all caches.
2504 	Every two seconds, it will also write back up to 64 blocks per cache.
2505 */
2506 static status_t
2507 block_notifier_and_writer(void* /*data*/)
2508 {
2509 	const bigtime_t kTimeout = 2000000LL;
2510 	bigtime_t timeout = kTimeout;
2511 
2512 	while (true) {
2513 		bigtime_t start = system_time();
2514 
2515 		status_t status = acquire_sem_etc(sEventSemaphore, 1,
2516 			B_RELATIVE_TIMEOUT, timeout);
2517 		if (status == B_OK) {
2518 			flush_pending_notifications();
2519 			timeout -= system_time() - start;
2520 			continue;
2521 		}
2522 
2523 		// write 64 blocks of each block_cache every two seconds
2524 		// TODO: change this once we have an I/O scheduler
2525 		timeout = kTimeout;
2526 		size_t usedMemory;
2527 		object_cache_get_usage(sBlockCache, &usedMemory);
2528 
2529 		block_cache* cache = NULL;
2530 		while ((cache = get_next_locked_block_cache(cache)) != NULL) {
2531 			BlockWriter writer(cache, 64);
2532 
2533 			size_t cacheUsedMemory;
2534 			object_cache_get_usage(cache->buffer_cache, &cacheUsedMemory);
2535 			usedMemory += cacheUsedMemory;
2536 
2537 			if (cache->num_dirty_blocks) {
2538 				// This cache is not using transactions, we'll scan the blocks
2539 				// directly
2540 				BlockTable::Iterator iterator(cache->hash);
2541 
2542 				while (iterator.HasNext()) {
2543 					cached_block* block = iterator.Next();
2544 					if (block->CanBeWritten() && !writer.Add(block))
2545 						break;
2546 				}
2547 
2548 			} else {
2549 				TransactionTable::Iterator iterator(cache->transaction_hash);
2550 
2551 				while (iterator.HasNext()) {
2552 					cache_transaction* transaction = iterator.Next();
2553 					if (transaction->open) {
2554 						if (system_time() > transaction->last_used
2555 								+ kTransactionIdleTime) {
2556 							// Transaction is open but idle
2557 							notify_transaction_listeners(cache, transaction,
2558 								TRANSACTION_IDLE);
2559 						}
2560 						continue;
2561 					}
2562 
2563 					bool hasLeftOvers;
2564 						// we ignore this one
2565 					if (!writer.Add(transaction, hasLeftOvers))
2566 						break;
2567 				}
2568 			}
2569 
2570 			writer.Write();
2571 
2572 			if ((block_cache_used_memory() / B_PAGE_SIZE)
2573 					> vm_page_num_pages() / 2) {
2574 				// Try to reduce memory usage to half of the available
2575 				// RAM at maximum
2576 				cache->RemoveUnusedBlocks(1000, 10);
2577 			}
2578 		}
2579 
2580 		MutexLocker _(sCachesMemoryUseLock);
2581 		sUsedMemory = usedMemory;
2582 	}
2583 
2584 	// never can get here
2585 	return B_OK;
2586 }
2587 
2588 
2589 /*!	Notify function for wait_for_notifications(). */
2590 static void
2591 notify_sync(int32 transactionID, int32 event, void* _cache)
2592 {
2593 	block_cache* cache = (block_cache*)_cache;
2594 
2595 	cache->condition_variable.NotifyOne();
2596 }
2597 
2598 
2599 /*!	Must be called with the sCachesLock held. */
2600 static bool
2601 is_valid_cache(block_cache* cache)
2602 {
2603 	ASSERT_LOCKED_MUTEX(&sCachesLock);
2604 
2605 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
2606 	while (iterator.HasNext()) {
2607 		if (cache == iterator.Next())
2608 			return true;
2609 	}
2610 
2611 	return false;
2612 }
2613 
2614 
2615 /*!	Waits until all pending notifications are carried out.
2616 	Safe to be called from the block writer/notifier thread.
2617 	You must not hold the \a cache lock when calling this function.
2618 */
2619 static void
2620 wait_for_notifications(block_cache* cache)
2621 {
2622 	MutexLocker locker(sCachesLock);
2623 
2624 	if (find_thread(NULL) == sNotifierWriterThread) {
2625 		// We're the notifier thread, don't wait, but flush all pending
2626 		// notifications directly.
2627 		if (is_valid_cache(cache))
2628 			flush_pending_notifications(cache);
2629 		return;
2630 	}
2631 
2632 	// add sync notification
2633 	cache_notification notification;
2634 	set_notification(NULL, notification, TRANSACTION_WRITTEN, notify_sync,
2635 		cache);
2636 
2637 	ConditionVariableEntry entry;
2638 	cache->condition_variable.Add(&entry);
2639 
2640 	add_notification(cache, &notification, TRANSACTION_WRITTEN, false);
2641 	locker.Unlock();
2642 
2643 	// wait for notification hook to be called
2644 	entry.Wait();
2645 }
2646 
2647 
2648 status_t
2649 block_cache_init(void)
2650 {
2651 	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
2652 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
2653 	if (sBlockCache == NULL)
2654 		return B_NO_MEMORY;
2655 
2656 	new (&sCaches) DoublyLinkedList<block_cache>;
2657 		// manually call constructor
2658 
2659 	sEventSemaphore = create_sem(0, "block cache event");
2660 	if (sEventSemaphore < B_OK)
2661 		return sEventSemaphore;
2662 
2663 	sNotifierWriterThread = spawn_kernel_thread(&block_notifier_and_writer,
2664 		"block notifier/writer", B_LOW_PRIORITY, NULL);
2665 	if (sNotifierWriterThread >= B_OK)
2666 		resume_thread(sNotifierWriterThread);
2667 
2668 #if DEBUG_BLOCK_CACHE
2669 	add_debugger_command_etc("block_caches", &dump_caches,
2670 		"dumps all block caches", "\n", 0);
2671 	add_debugger_command_etc("block_cache", &dump_cache,
2672 		"dumps a specific block cache",
2673 		"[-bt] <cache-address> [block-number]\n"
2674 		"  -t lists the transactions\n"
2675 		"  -b lists all blocks\n", 0);
2676 	add_debugger_command("cached_block", &dump_cached_block,
2677 		"dumps the specified cached block");
2678 	add_debugger_command_etc("transaction", &dump_transaction,
2679 		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
2680 		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
2681 		"  -b lists all blocks that are part of this transaction\n", 0);
2682 #	if BLOCK_CACHE_BLOCK_TRACING >= 2
2683 	add_debugger_command_etc("block_cache_data", &dump_block_data,
2684 		"dumps the data blocks logged for the actions",
2685 		"[-cpo] <from> [<to> [<offset> [<size>]]]\n"
2686 		"If no data specifier is used, all blocks are shown by default.\n"
2687 		" -c       the current data is shown, if available.\n"
2688 		" -p       the parent data is shown, if available.\n"
2689 		" -o       the original data is shown, if available.\n"
2690 		" <from>   first index of tracing entries to show.\n"
2691 		" <to>     if given, the last entry. If not, only <from> is shown.\n"
2692 		" <offset> the offset of the block data.\n"
2693 		" <from>   the size of the block data that is dumped\n", 0);
2694 #	endif
2695 #endif	// DEBUG_BLOCK_CACHE
2696 
2697 	return B_OK;
2698 }
2699 
2700 
2701 size_t
2702 block_cache_used_memory(void)
2703 {
2704 	MutexLocker _(sCachesMemoryUseLock);
2705 	return sUsedMemory;
2706 }
2707 
2708 
2709 //	#pragma mark - public transaction API
2710 
2711 
2712 int32
2713 cache_start_transaction(void* _cache)
2714 {
2715 	block_cache* cache = (block_cache*)_cache;
2716 	TransactionLocker locker(cache);
2717 
2718 	if (cache->last_transaction && cache->last_transaction->open) {
2719 		panic("last transaction (%" B_PRId32 ") still open!\n",
2720 			cache->last_transaction->id);
2721 	}
2722 
2723 	cache_transaction* transaction = new(std::nothrow) cache_transaction;
2724 	if (transaction == NULL)
2725 		return B_NO_MEMORY;
2726 
2727 	transaction->id = atomic_add(&cache->next_transaction_id, 1);
2728 	cache->last_transaction = transaction;
2729 
2730 	TRACE(("cache_start_transaction(): id %" B_PRId32 " started\n", transaction->id));
2731 	T(Action("start", cache, transaction));
2732 
2733 	cache->transaction_hash->Insert(transaction);
2734 
2735 	return transaction->id;
2736 }
2737 
2738 
2739 status_t
2740 cache_sync_transaction(void* _cache, int32 id)
2741 {
2742 	block_cache* cache = (block_cache*)_cache;
2743 	bool hadBusy;
2744 
2745 	TRACE(("cache_sync_transaction(id %" B_PRId32 ")\n", id));
2746 
2747 	do {
2748 		TransactionLocker locker(cache);
2749 		hadBusy = false;
2750 
2751 		BlockWriter writer(cache);
2752 		TransactionTable::Iterator iterator(cache->transaction_hash);
2753 
2754 		while (iterator.HasNext()) {
2755 			// close all earlier transactions which haven't been closed yet
2756 			cache_transaction* transaction = iterator.Next();
2757 
2758 			if (transaction->busy_writing_count != 0) {
2759 				hadBusy = true;
2760 				continue;
2761 			}
2762 			if (transaction->id <= id && !transaction->open) {
2763 				// write back all of their remaining dirty blocks
2764 				T(Action("sync", cache, transaction));
2765 
2766 				bool hasLeftOvers;
2767 				writer.Add(transaction, hasLeftOvers);
2768 
2769 				if (hasLeftOvers) {
2770 					// This transaction contains blocks that a previous
2771 					// transaction is trying to write back in this write run
2772 					hadBusy = true;
2773 				}
2774 			}
2775 		}
2776 
2777 		status_t status = writer.Write();
2778 		if (status != B_OK)
2779 			return status;
2780 	} while (hadBusy);
2781 
2782 	wait_for_notifications(cache);
2783 		// make sure that all pending TRANSACTION_WRITTEN notifications
2784 		// are handled after we return
2785 	return B_OK;
2786 }
2787 
2788 
2789 status_t
2790 cache_end_transaction(void* _cache, int32 id,
2791 	transaction_notification_hook hook, void* data)
2792 {
2793 	block_cache* cache = (block_cache*)_cache;
2794 	TransactionLocker locker(cache);
2795 
2796 	TRACE(("cache_end_transaction(id = %" B_PRId32 ")\n", id));
2797 
2798 	cache_transaction* transaction = lookup_transaction(cache, id);
2799 	if (transaction == NULL) {
2800 		panic("cache_end_transaction(): invalid transaction ID\n");
2801 		return B_BAD_VALUE;
2802 	}
2803 
2804 	// Write back all pending transaction blocks
2805 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
2806 	if (status != B_OK)
2807 		return status;
2808 
2809 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2810 
2811 	if (hook != NULL
2812 		&& add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN,
2813 			hook, data) != B_OK) {
2814 		return B_NO_MEMORY;
2815 	}
2816 
2817 	T(Action("end", cache, transaction));
2818 
2819 	// iterate through all blocks and free the unchanged original contents
2820 
2821 	cached_block* next;
2822 	for (cached_block* block = transaction->first_block; block != NULL;
2823 			block = next) {
2824 		next = block->transaction_next;
2825 		ASSERT(block->previous_transaction == NULL);
2826 
2827 		if (block->discard) {
2828 			// This block has been discarded in the transaction
2829 			cache->DiscardBlock(block);
2830 			transaction->num_blocks--;
2831 			continue;
2832 		}
2833 
2834 		if (block->original_data != NULL) {
2835 			cache->Free(block->original_data);
2836 			block->original_data = NULL;
2837 		}
2838 		if (block->parent_data != NULL) {
2839 			ASSERT(transaction->has_sub_transaction);
2840 			cache->FreeBlockParentData(block);
2841 		}
2842 
2843 		// move the block to the previous transaction list
2844 		transaction->blocks.Add(block);
2845 
2846 		block->previous_transaction = transaction;
2847 		block->transaction_next = NULL;
2848 		block->transaction = NULL;
2849 	}
2850 
2851 	transaction->open = false;
2852 	return B_OK;
2853 }
2854 
2855 
2856 status_t
2857 cache_abort_transaction(void* _cache, int32 id)
2858 {
2859 	block_cache* cache = (block_cache*)_cache;
2860 	TransactionLocker locker(cache);
2861 
2862 	TRACE(("cache_abort_transaction(id = %" B_PRId32 ")\n", id));
2863 
2864 	cache_transaction* transaction = lookup_transaction(cache, id);
2865 	if (transaction == NULL) {
2866 		panic("cache_abort_transaction(): invalid transaction ID\n");
2867 		return B_BAD_VALUE;
2868 	}
2869 
2870 	T(Abort(cache, transaction));
2871 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
2872 
2873 	// iterate through all blocks and restore their original contents
2874 
2875 	cached_block* block = transaction->first_block;
2876 	cached_block* next;
2877 	for (; block != NULL; block = next) {
2878 		next = block->transaction_next;
2879 
2880 		if (block->original_data != NULL) {
2881 			TRACE(("cache_abort_transaction(id = %" B_PRId32 "): restored contents of "
2882 				"block %" B_PRIdOFF "\n", transaction->id, block->block_number));
2883 			memcpy(block->current_data, block->original_data,
2884 				cache->block_size);
2885 			cache->Free(block->original_data);
2886 			block->original_data = NULL;
2887 		}
2888 		if (transaction->has_sub_transaction && block->parent_data != NULL)
2889 			cache->FreeBlockParentData(block);
2890 
2891 		block->transaction_next = NULL;
2892 		block->transaction = NULL;
2893 		block->discard = false;
2894 		if (block->previous_transaction == NULL)
2895 			block->is_dirty = false;
2896 	}
2897 
2898 	cache->transaction_hash->Remove(transaction);
2899 	delete_transaction(cache, transaction);
2900 	return B_OK;
2901 }
2902 
2903 
2904 /*!	Acknowledges the current parent transaction, and starts a new transaction
2905 	from its sub transaction.
2906 	The new transaction also gets a new transaction ID.
2907 */
2908 int32
2909 cache_detach_sub_transaction(void* _cache, int32 id,
2910 	transaction_notification_hook hook, void* data)
2911 {
2912 	block_cache* cache = (block_cache*)_cache;
2913 	TransactionLocker locker(cache);
2914 
2915 	TRACE(("cache_detach_sub_transaction(id = %" B_PRId32 ")\n", id));
2916 
2917 	cache_transaction* transaction = lookup_transaction(cache, id);
2918 	if (transaction == NULL) {
2919 		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
2920 		return B_BAD_VALUE;
2921 	}
2922 	if (!transaction->has_sub_transaction)
2923 		return B_BAD_VALUE;
2924 
2925 	// iterate through all blocks and free the unchanged original contents
2926 
2927 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
2928 	if (status != B_OK)
2929 		return status;
2930 
2931 	// create a new transaction for the sub transaction
2932 	cache_transaction* newTransaction = new(std::nothrow) cache_transaction;
2933 	if (newTransaction == NULL)
2934 		return B_NO_MEMORY;
2935 
2936 	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
2937 	T(Detach(cache, transaction, newTransaction));
2938 
2939 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2940 
2941 	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
2942 			data) != B_OK) {
2943 		delete newTransaction;
2944 		return B_NO_MEMORY;
2945 	}
2946 
2947 	cached_block* last = NULL;
2948 	cached_block* next;
2949 	for (cached_block* block = transaction->first_block; block != NULL;
2950 			block = next) {
2951 		next = block->transaction_next;
2952 		ASSERT(block->previous_transaction == NULL);
2953 
2954 		if (block->discard) {
2955 			cache->DiscardBlock(block);
2956 			transaction->main_num_blocks--;
2957 			continue;
2958 		}
2959 
2960 		if (block->parent_data != NULL) {
2961 			// The block changed in the parent - free the original data, since
2962 			// they will be replaced by what is in current.
2963 			ASSERT(block->original_data != NULL);
2964 			cache->Free(block->original_data);
2965 
2966 			if (block->parent_data != block->current_data) {
2967 				// The block had been changed in both transactions
2968 				block->original_data = block->parent_data;
2969 			} else {
2970 				// The block has only been changed in the parent
2971 				block->original_data = NULL;
2972 			}
2973 
2974 			// move the block to the previous transaction list
2975 			transaction->blocks.Add(block);
2976 			block->previous_transaction = transaction;
2977 			block->parent_data = NULL;
2978 		}
2979 
2980 		if (block->original_data != NULL) {
2981 			// This block had been changed in the current sub transaction,
2982 			// we need to move this block over to the new transaction.
2983 			ASSERT(block->parent_data == NULL);
2984 
2985 			if (last == NULL)
2986 				newTransaction->first_block = block;
2987 			else
2988 				last->transaction_next = block;
2989 
2990 			block->transaction = newTransaction;
2991 			last = block;
2992 		} else
2993 			block->transaction = NULL;
2994 
2995 		block->transaction_next = NULL;
2996 	}
2997 
2998 	newTransaction->num_blocks = transaction->sub_num_blocks;
2999 
3000 	transaction->open = false;
3001 	transaction->has_sub_transaction = false;
3002 	transaction->num_blocks = transaction->main_num_blocks;
3003 	transaction->sub_num_blocks = 0;
3004 
3005 	cache->transaction_hash->Insert(newTransaction);
3006 	cache->last_transaction = newTransaction;
3007 
3008 	return newTransaction->id;
3009 }
3010 
3011 
3012 status_t
3013 cache_abort_sub_transaction(void* _cache, int32 id)
3014 {
3015 	block_cache* cache = (block_cache*)_cache;
3016 	TransactionLocker locker(cache);
3017 
3018 	TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 ")\n", id));
3019 
3020 	cache_transaction* transaction = lookup_transaction(cache, id);
3021 	if (transaction == NULL) {
3022 		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
3023 		return B_BAD_VALUE;
3024 	}
3025 	if (!transaction->has_sub_transaction)
3026 		return B_BAD_VALUE;
3027 
3028 	T(Abort(cache, transaction));
3029 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3030 
3031 	// revert all changes back to the version of the parent
3032 
3033 	cached_block* block = transaction->first_block;
3034 	cached_block* last = NULL;
3035 	cached_block* next;
3036 	for (; block != NULL; block = next) {
3037 		next = block->transaction_next;
3038 
3039 		if (block->parent_data == NULL) {
3040 			// The parent transaction didn't change the block, but the sub
3041 			// transaction did - we need to revert to the original data.
3042 			// The block is no longer part of the transaction
3043 			if (block->original_data != NULL) {
3044 				// The block might not have original data if was empty
3045 				memcpy(block->current_data, block->original_data,
3046 					cache->block_size);
3047 			}
3048 
3049 			if (last != NULL)
3050 				last->transaction_next = next;
3051 			else
3052 				transaction->first_block = next;
3053 
3054 			block->transaction_next = NULL;
3055 			block->transaction = NULL;
3056 			transaction->num_blocks--;
3057 
3058 			if (block->previous_transaction == NULL) {
3059 				cache->Free(block->original_data);
3060 				block->original_data = NULL;
3061 				block->is_dirty = false;
3062 
3063 				if (block->ref_count == 0) {
3064 					// Move the block into the unused list if possible
3065 					block->unused = true;
3066 					cache->unused_blocks.Add(block);
3067 					cache->unused_block_count++;
3068 				}
3069 			}
3070 		} else {
3071 			if (block->parent_data != block->current_data) {
3072 				// The block has been changed and must be restored - the block
3073 				// is still dirty and part of the transaction
3074 				TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 "): "
3075 					"restored contents of block %" B_PRIdOFF "\n",
3076 					transaction->id, block->block_number));
3077 				memcpy(block->current_data, block->parent_data,
3078 					cache->block_size);
3079 				cache->Free(block->parent_data);
3080 				// The block stays dirty
3081 			}
3082 			block->parent_data = NULL;
3083 			last = block;
3084 		}
3085 
3086 		block->discard = false;
3087 	}
3088 
3089 	// all subsequent changes will go into the main transaction
3090 	transaction->has_sub_transaction = false;
3091 	transaction->sub_num_blocks = 0;
3092 
3093 	return B_OK;
3094 }
3095 
3096 
3097 status_t
3098 cache_start_sub_transaction(void* _cache, int32 id)
3099 {
3100 	block_cache* cache = (block_cache*)_cache;
3101 	TransactionLocker locker(cache);
3102 
3103 	TRACE(("cache_start_sub_transaction(id = %" B_PRId32 ")\n", id));
3104 
3105 	cache_transaction* transaction = lookup_transaction(cache, id);
3106 	if (transaction == NULL) {
3107 		panic("cache_start_sub_transaction(): invalid transaction ID %" B_PRId32 "\n",
3108 			id);
3109 		return B_BAD_VALUE;
3110 	}
3111 
3112 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3113 
3114 	// move all changed blocks up to the parent
3115 
3116 	cached_block* block = transaction->first_block;
3117 	cached_block* next;
3118 	for (; block != NULL; block = next) {
3119 		next = block->transaction_next;
3120 
3121 		if (block->parent_data != NULL) {
3122 			// There already is an older sub transaction - we acknowledge
3123 			// its changes and move its blocks up to the parent
3124 			ASSERT(transaction->has_sub_transaction);
3125 			cache->FreeBlockParentData(block);
3126 		}
3127 		if (block->discard) {
3128 			// This block has been discarded in the parent transaction.
3129 			// Just throw away any changes made in this transaction, so that
3130 			// it can still be reverted to its original contents if needed
3131 			ASSERT(block->previous_transaction == NULL);
3132 			if (block->original_data != NULL) {
3133 				memcpy(block->current_data, block->original_data,
3134 					cache->block_size);
3135 				block->original_data = NULL;
3136 			}
3137 			continue;
3138 		}
3139 
3140 		// we "allocate" the parent data lazily, that means, we don't copy
3141 		// the data (and allocate memory for it) until we need to
3142 		block->parent_data = block->current_data;
3143 	}
3144 
3145 	// all subsequent changes will go into the sub transaction
3146 	transaction->has_sub_transaction = true;
3147 	transaction->main_num_blocks = transaction->num_blocks;
3148 	transaction->sub_num_blocks = 0;
3149 	T(Action("start-sub", cache, transaction));
3150 
3151 	return B_OK;
3152 }
3153 
3154 
3155 /*!	Adds a transaction listener that gets notified when the transaction
3156 	is ended, aborted, written, or idle as specified by \a events.
3157 	The listener gets automatically removed when the transaction ends.
3158 */
3159 status_t
3160 cache_add_transaction_listener(void* _cache, int32 id, int32 events,
3161 	transaction_notification_hook hook, void* data)
3162 {
3163 	block_cache* cache = (block_cache*)_cache;
3164 	TransactionLocker locker(cache);
3165 
3166 	cache_transaction* transaction = lookup_transaction(cache, id);
3167 	if (transaction == NULL)
3168 		return B_BAD_VALUE;
3169 
3170 	return add_transaction_listener(cache, transaction, events, hook, data);
3171 }
3172 
3173 
3174 status_t
3175 cache_remove_transaction_listener(void* _cache, int32 id,
3176 	transaction_notification_hook hookFunction, void* data)
3177 {
3178 	block_cache* cache = (block_cache*)_cache;
3179 	TransactionLocker locker(cache);
3180 
3181 	cache_transaction* transaction = lookup_transaction(cache, id);
3182 	if (transaction == NULL)
3183 		return B_BAD_VALUE;
3184 
3185 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
3186 	while (iterator.HasNext()) {
3187 		cache_listener* listener = iterator.Next();
3188 		if (listener->data == data && listener->hook == hookFunction) {
3189 			iterator.Remove();
3190 
3191 			if (listener->events_pending != 0) {
3192 				MutexLocker _(sNotificationsLock);
3193 				if (listener->events_pending != 0)
3194 					cache->pending_notifications.Remove(listener);
3195 			}
3196 			delete listener;
3197 			return B_OK;
3198 		}
3199 	}
3200 
3201 	return B_ENTRY_NOT_FOUND;
3202 }
3203 
3204 
3205 status_t
3206 cache_next_block_in_transaction(void* _cache, int32 id, bool mainOnly,
3207 	long* _cookie, off_t* _blockNumber, void** _data, void** _unchangedData)
3208 {
3209 	cached_block* block = (cached_block*)*_cookie;
3210 	block_cache* cache = (block_cache*)_cache;
3211 	TransactionLocker locker(cache);
3212 
3213 	cache_transaction* transaction = lookup_transaction(cache, id);
3214 	if (transaction == NULL || !transaction->open)
3215 		return B_BAD_VALUE;
3216 
3217 	if (block == NULL)
3218 		block = transaction->first_block;
3219 	else
3220 		block = block->transaction_next;
3221 
3222 	if (transaction->has_sub_transaction) {
3223 		if (mainOnly) {
3224 			// find next block that the parent changed
3225 			while (block != NULL && block->parent_data == NULL)
3226 				block = block->transaction_next;
3227 		} else {
3228 			// find next non-discarded block
3229 			while (block != NULL && block->discard)
3230 				block = block->transaction_next;
3231 		}
3232 	}
3233 
3234 	if (block == NULL)
3235 		return B_ENTRY_NOT_FOUND;
3236 
3237 	if (_blockNumber)
3238 		*_blockNumber = block->block_number;
3239 	if (_data)
3240 		*_data = mainOnly ? block->parent_data : block->current_data;
3241 	if (_unchangedData)
3242 		*_unchangedData = block->original_data;
3243 
3244 	*_cookie = (addr_t)block;
3245 	return B_OK;
3246 }
3247 
3248 
3249 int32
3250 cache_blocks_in_transaction(void* _cache, int32 id)
3251 {
3252 	block_cache* cache = (block_cache*)_cache;
3253 	TransactionLocker locker(cache);
3254 
3255 	cache_transaction* transaction = lookup_transaction(cache, id);
3256 	if (transaction == NULL)
3257 		return B_BAD_VALUE;
3258 
3259 	return transaction->num_blocks;
3260 }
3261 
3262 
3263 /*!	Returns the number of blocks that are part of the main transaction. If this
3264 	transaction does not have a sub transaction yet, this is the same value as
3265 	cache_blocks_in_transaction() would return.
3266 */
3267 int32
3268 cache_blocks_in_main_transaction(void* _cache, int32 id)
3269 {
3270 	block_cache* cache = (block_cache*)_cache;
3271 	TransactionLocker locker(cache);
3272 
3273 	cache_transaction* transaction = lookup_transaction(cache, id);
3274 	if (transaction == NULL)
3275 		return B_BAD_VALUE;
3276 
3277 	if (transaction->has_sub_transaction)
3278 		return transaction->main_num_blocks;
3279 
3280 	return transaction->num_blocks;
3281 }
3282 
3283 
3284 int32
3285 cache_blocks_in_sub_transaction(void* _cache, int32 id)
3286 {
3287 	block_cache* cache = (block_cache*)_cache;
3288 	TransactionLocker locker(cache);
3289 
3290 	cache_transaction* transaction = lookup_transaction(cache, id);
3291 	if (transaction == NULL)
3292 		return B_BAD_VALUE;
3293 
3294 	return transaction->sub_num_blocks;
3295 }
3296 
3297 
3298 //	#pragma mark - public block cache API
3299 
3300 
3301 void
3302 block_cache_delete(void* _cache, bool allowWrites)
3303 {
3304 	block_cache* cache = (block_cache*)_cache;
3305 
3306 	if (allowWrites)
3307 		block_cache_sync(cache);
3308 
3309 	mutex_lock(&sCachesLock);
3310 	sCaches.Remove(cache);
3311 	mutex_unlock(&sCachesLock);
3312 
3313 	mutex_lock(&cache->lock);
3314 
3315 	// wait for all blocks to become unbusy
3316 	wait_for_busy_reading_blocks(cache);
3317 	wait_for_busy_writing_blocks(cache);
3318 
3319 	// free all blocks
3320 
3321 	cached_block* block = cache->hash->Clear(true);
3322 	while (block != NULL) {
3323 		cached_block* next = block->next;
3324 		cache->FreeBlock(block);
3325 		block = next;
3326 	}
3327 
3328 	// free all transactions (they will all be aborted)
3329 
3330 	cache_transaction* transaction = cache->transaction_hash->Clear(true);
3331 	while (transaction != NULL) {
3332 		cache_transaction* next = transaction->next;
3333 		delete transaction;
3334 		transaction = next;
3335 	}
3336 
3337 	delete cache;
3338 }
3339 
3340 
3341 void*
3342 block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
3343 {
3344 	block_cache* cache = new(std::nothrow) block_cache(fd, numBlocks, blockSize,
3345 		readOnly);
3346 	if (cache == NULL)
3347 		return NULL;
3348 
3349 	if (cache->Init() != B_OK) {
3350 		delete cache;
3351 		return NULL;
3352 	}
3353 
3354 	MutexLocker _(sCachesLock);
3355 	sCaches.Add(cache);
3356 
3357 	return cache;
3358 }
3359 
3360 
3361 status_t
3362 block_cache_sync(void* _cache)
3363 {
3364 	block_cache* cache = (block_cache*)_cache;
3365 
3366 	// We will sync all dirty blocks to disk that have a completed
3367 	// transaction or no transaction only
3368 
3369 	MutexLocker locker(&cache->lock);
3370 
3371 	BlockWriter writer(cache);
3372 	BlockTable::Iterator iterator(cache->hash);
3373 
3374 	while (iterator.HasNext()) {
3375 		cached_block* block = iterator.Next();
3376 		if (block->CanBeWritten())
3377 			writer.Add(block);
3378 	}
3379 
3380 	status_t status = writer.Write();
3381 
3382 	locker.Unlock();
3383 
3384 	wait_for_notifications(cache);
3385 		// make sure that all pending TRANSACTION_WRITTEN notifications
3386 		// are handled after we return
3387 	return status;
3388 }
3389 
3390 
3391 status_t
3392 block_cache_sync_etc(void* _cache, off_t blockNumber, size_t numBlocks)
3393 {
3394 	block_cache* cache = (block_cache*)_cache;
3395 
3396 	// We will sync all dirty blocks to disk that have a completed
3397 	// transaction or no transaction only
3398 
3399 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
3400 		panic("block_cache_sync_etc: invalid block number %" B_PRIdOFF
3401 			" (max %" B_PRIdOFF ")",
3402 			blockNumber, cache->max_blocks - 1);
3403 		return B_BAD_VALUE;
3404 	}
3405 
3406 	MutexLocker locker(&cache->lock);
3407 	BlockWriter writer(cache);
3408 
3409 	for (; numBlocks > 0; numBlocks--, blockNumber++) {
3410 		cached_block* block = cache->hash->Lookup(blockNumber);
3411 		if (block == NULL)
3412 			continue;
3413 
3414 		if (block->CanBeWritten())
3415 			writer.Add(block);
3416 	}
3417 
3418 	status_t status = writer.Write();
3419 
3420 	locker.Unlock();
3421 
3422 	wait_for_notifications(cache);
3423 		// make sure that all pending TRANSACTION_WRITTEN notifications
3424 		// are handled after we return
3425 	return status;
3426 }
3427 
3428 
3429 /*!	Discards a block from the current transaction or from the cache.
3430 	You have to call this function when you no longer use a block, ie. when it
3431 	might be reclaimed by the file cache in order to make sure they won't
3432 	interfere.
3433 */
3434 void
3435 block_cache_discard(void* _cache, off_t blockNumber, size_t numBlocks)
3436 {
3437 	// TODO: this could be a nice place to issue the ATA trim command
3438 	block_cache* cache = (block_cache*)_cache;
3439 	TransactionLocker locker(cache);
3440 
3441 	BlockWriter writer(cache);
3442 
3443 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3444 		cached_block* block = cache->hash->Lookup(blockNumber);
3445 		if (block != NULL && block->previous_transaction != NULL)
3446 			writer.Add(block);
3447 	}
3448 
3449 	writer.Write();
3450 		// TODO: this can fail, too!
3451 
3452 	blockNumber -= numBlocks;
3453 		// reset blockNumber to its original value
3454 
3455 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3456 		cached_block* block = cache->hash->Lookup(blockNumber);
3457 		if (block == NULL)
3458 			continue;
3459 
3460 		ASSERT(block->previous_transaction == NULL);
3461 
3462 		if (block->unused) {
3463 			cache->unused_blocks.Remove(block);
3464 			cache->unused_block_count--;
3465 			cache->RemoveBlock(block);
3466 		} else {
3467 			if (block->transaction != NULL && block->parent_data != NULL
3468 				&& block->parent_data != block->current_data) {
3469 				panic("Discarded block %" B_PRIdOFF " has already been changed in this "
3470 					"transaction!", blockNumber);
3471 			}
3472 
3473 			// mark it as discarded (in the current transaction only, if any)
3474 			block->discard = true;
3475 		}
3476 	}
3477 }
3478 
3479 
3480 status_t
3481 block_cache_make_writable(void* _cache, off_t blockNumber, int32 transaction)
3482 {
3483 	block_cache* cache = (block_cache*)_cache;
3484 	MutexLocker locker(&cache->lock);
3485 
3486 	if (cache->read_only) {
3487 		panic("tried to make block writable on a read-only cache!");
3488 		return B_ERROR;
3489 	}
3490 
3491 	// TODO: this can be done better!
3492 	void* block = get_writable_cached_block(cache, blockNumber,
3493 		blockNumber, 1, transaction, false);
3494 	if (block != NULL) {
3495 		put_cached_block((block_cache*)_cache, blockNumber);
3496 		return B_OK;
3497 	}
3498 
3499 	return B_ERROR;
3500 }
3501 
3502 
3503 void*
3504 block_cache_get_writable_etc(void* _cache, off_t blockNumber, off_t base,
3505 	off_t length, int32 transaction)
3506 {
3507 	block_cache* cache = (block_cache*)_cache;
3508 	MutexLocker locker(&cache->lock);
3509 
3510 	TRACE(("block_cache_get_writable_etc(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3511 		blockNumber, transaction));
3512 	if (cache->read_only)
3513 		panic("tried to get writable block on a read-only cache!");
3514 
3515 	return get_writable_cached_block(cache, blockNumber, base, length,
3516 		transaction, false);
3517 }
3518 
3519 
3520 void*
3521 block_cache_get_writable(void* _cache, off_t blockNumber, int32 transaction)
3522 {
3523 	return block_cache_get_writable_etc(_cache, blockNumber,
3524 		blockNumber, 1, transaction);
3525 }
3526 
3527 
3528 void*
3529 block_cache_get_empty(void* _cache, off_t blockNumber, int32 transaction)
3530 {
3531 	block_cache* cache = (block_cache*)_cache;
3532 	MutexLocker locker(&cache->lock);
3533 
3534 	TRACE(("block_cache_get_empty(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3535 		blockNumber, transaction));
3536 	if (cache->read_only)
3537 		panic("tried to get empty writable block on a read-only cache!");
3538 
3539 	return get_writable_cached_block((block_cache*)_cache, blockNumber,
3540 		blockNumber, 1, transaction, true);
3541 }
3542 
3543 
3544 const void*
3545 block_cache_get_etc(void* _cache, off_t blockNumber, off_t base, off_t length)
3546 {
3547 	block_cache* cache = (block_cache*)_cache;
3548 	MutexLocker locker(&cache->lock);
3549 	bool allocated;
3550 
3551 	cached_block* block = get_cached_block(cache, blockNumber, &allocated);
3552 	if (block == NULL)
3553 		return NULL;
3554 
3555 #if BLOCK_CACHE_DEBUG_CHANGED
3556 	if (block->compare == NULL)
3557 		block->compare = cache->Allocate();
3558 	if (block->compare != NULL)
3559 		memcpy(block->compare, block->current_data, cache->block_size);
3560 #endif
3561 	TB(Get(cache, block));
3562 
3563 	return block->current_data;
3564 }
3565 
3566 
3567 const void*
3568 block_cache_get(void* _cache, off_t blockNumber)
3569 {
3570 	return block_cache_get_etc(_cache, blockNumber, blockNumber, 1);
3571 }
3572 
3573 
3574 /*!	Changes the internal status of a writable block to \a dirty. This can be
3575 	helpful in case you realize you don't need to change that block anymore
3576 	for whatever reason.
3577 
3578 	Note, you must only use this function on blocks that were acquired
3579 	writable!
3580 */
3581 status_t
3582 block_cache_set_dirty(void* _cache, off_t blockNumber, bool dirty,
3583 	int32 transaction)
3584 {
3585 	block_cache* cache = (block_cache*)_cache;
3586 	MutexLocker locker(&cache->lock);
3587 
3588 	cached_block* block = cache->hash->Lookup(blockNumber);
3589 	if (block == NULL)
3590 		return B_BAD_VALUE;
3591 	if (block->is_dirty == dirty) {
3592 		// there is nothing to do for us
3593 		return B_OK;
3594 	}
3595 
3596 	// TODO: not yet implemented
3597 	if (dirty)
3598 		panic("block_cache_set_dirty(): not yet implemented that way!\n");
3599 
3600 	return B_OK;
3601 }
3602 
3603 
3604 void
3605 block_cache_put(void* _cache, off_t blockNumber)
3606 {
3607 	block_cache* cache = (block_cache*)_cache;
3608 	MutexLocker locker(&cache->lock);
3609 
3610 	put_cached_block(cache, blockNumber);
3611 }
3612 
3613