xref: /haiku/src/system/kernel/cache/block_cache.cpp (revision 04b1c44b89086a9521bb478c8e99d024cef8f6a6)
1 /*
2  * Copyright 2004-2012, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include <block_cache.h>
8 
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <errno.h>
13 
14 #include <KernelExport.h>
15 #include <fs_cache.h>
16 
17 #include <condition_variable.h>
18 #include <lock.h>
19 #include <low_resource_manager.h>
20 #include <slab/Slab.h>
21 #include <tracing.h>
22 #include <util/kernel_cpp.h>
23 #include <util/DoublyLinkedList.h>
24 #include <util/AutoLock.h>
25 #include <util/khash.h>
26 #include <vm/vm_page.h>
27 
28 #include "kernel_debug_config.h"
29 
30 
31 // TODO: this is a naive but growing implementation to test the API:
32 //	block reading/writing is not at all optimized for speed, it will
33 //	just read and write single blocks.
34 // TODO: the retrieval/copy of the original data could be delayed until the
35 //		new data must be written, ie. in low memory situations.
36 
37 //#define TRACE_BLOCK_CACHE
38 #ifdef TRACE_BLOCK_CACHE
39 #	define TRACE(x)	dprintf x
40 #else
41 #	define TRACE(x) ;
42 #endif
43 
44 // This macro is used for fatal situations that are acceptable in a running
45 // system, like out of memory situations - should only panic for debugging.
46 #define FATAL(x) panic x
47 
48 static const bigtime_t kTransactionIdleTime = 2000000LL;
49 	// a transaction is considered idle after 2 seconds of inactivity
50 
51 
52 struct cache_transaction;
53 struct cached_block;
54 struct block_cache;
55 typedef DoublyLinkedListLink<cached_block> block_link;
56 
57 struct cached_block {
58 	cached_block*	next;			// next in hash
59 	cached_block*	transaction_next;
60 	block_link		link;
61 	off_t			block_number;
62 	void*			current_data;
63 		// The data that is seen by everyone using the API; this one is always
64 		// present.
65 	void*			original_data;
66 		// When in a transaction, this contains the original data from before
67 		// the transaction.
68 	void*			parent_data;
69 		// This is a lazily alloced buffer that represents the contents of the
70 		// block in the parent transaction. It may point to current_data if the
71 		// contents have been changed only in the parent transaction, or, if the
72 		// block has been changed in the current sub transaction already, to a
73 		// new block containing the contents changed in the parent transaction.
74 		// If this is NULL, the block has not been changed in the parent
75 		// transaction at all.
76 #if BLOCK_CACHE_DEBUG_CHANGED
77 	void*			compare;
78 #endif
79 	int32			ref_count;
80 	int32			last_accessed;
81 	bool			busy_reading : 1;
82 	bool			busy_writing : 1;
83 	bool			is_writing : 1;
84 		// Block has been checked out for writing without transactions, and
85 		// cannot be written back if set
86 	bool			is_dirty : 1;
87 	bool			unused : 1;
88 	bool			discard : 1;
89 	bool			busy_reading_waiters : 1;
90 	bool			busy_writing_waiters : 1;
91 	cache_transaction* transaction;
92 		// This is the current active transaction, if any, the block is
93 		// currently in (meaning was changed as a part of it).
94 	cache_transaction* previous_transaction;
95 		// This is set to the last transaction that was ended containing this
96 		// block. In this case, the block has not yet written back yet, and
97 		// the changed data is either in current_data, or original_data -- the
98 		// latter if the block is already being part of another transaction.
99 		// There can only be one previous transaction, so when the active
100 		// transaction ends, the changes of the previous transaction have to
101 		// be written back before that transaction becomes the next previous
102 		// transaction.
103 
104 	bool CanBeWritten() const;
105 	int32 LastAccess() const
106 		{ return system_time() / 1000000L - last_accessed; }
107 
108 	static int Compare(void* _cacheEntry, const void* _block);
109 	static uint32 Hash(void* _cacheEntry, const void* _block, uint32 range);
110 };
111 
112 typedef DoublyLinkedList<cached_block,
113 	DoublyLinkedListMemberGetLink<cached_block,
114 		&cached_block::link> > block_list;
115 
116 struct cache_notification : DoublyLinkedListLinkImpl<cache_notification> {
117 	int32			transaction_id;
118 	int32			events_pending;
119 	int32			events;
120 	transaction_notification_hook hook;
121 	void*			data;
122 	bool			delete_after_event;
123 };
124 
125 typedef DoublyLinkedList<cache_notification> NotificationList;
126 
127 struct block_cache : DoublyLinkedListLinkImpl<block_cache> {
128 	hash_table*		hash;
129 	mutex			lock;
130 	int				fd;
131 	off_t			max_blocks;
132 	size_t			block_size;
133 	int32			next_transaction_id;
134 	cache_transaction* last_transaction;
135 	hash_table*		transaction_hash;
136 
137 	object_cache*	buffer_cache;
138 	block_list		unused_blocks;
139 	uint32			unused_block_count;
140 
141 	ConditionVariable busy_reading_condition;
142 	uint32			busy_reading_count;
143 	bool			busy_reading_waiters;
144 
145 	ConditionVariable busy_writing_condition;
146 	uint32			busy_writing_count;
147 	bool			busy_writing_waiters;
148 
149 	uint32			num_dirty_blocks;
150 	bool			read_only;
151 
152 	NotificationList pending_notifications;
153 	ConditionVariable condition_variable;
154 
155 					block_cache(int fd, off_t numBlocks, size_t blockSize,
156 						bool readOnly);
157 					~block_cache();
158 
159 	status_t		Init();
160 
161 	void			Free(void* buffer);
162 	void*			Allocate();
163 	void			FreeBlock(cached_block* block);
164 	cached_block*	NewBlock(off_t blockNumber);
165 	void			FreeBlockParentData(cached_block* block);
166 
167 	void			RemoveUnusedBlocks(int32 count, int32 minSecondsOld = 0);
168 	void			RemoveBlock(cached_block* block);
169 	void			DiscardBlock(cached_block* block);
170 
171 private:
172 	static void		_LowMemoryHandler(void* data, uint32 resources,
173 						int32 level);
174 	cached_block*	_GetUnusedBlock();
175 };
176 
177 struct cache_listener;
178 typedef DoublyLinkedListLink<cache_listener> listener_link;
179 
180 struct cache_listener : cache_notification {
181 	listener_link	link;
182 };
183 
184 typedef DoublyLinkedList<cache_listener,
185 	DoublyLinkedListMemberGetLink<cache_listener,
186 		&cache_listener::link> > ListenerList;
187 
188 
189 struct cache_transaction {
190 	cache_transaction();
191 
192 	cache_transaction* next;
193 	int32			id;
194 	int32			num_blocks;
195 	int32			main_num_blocks;
196 	int32			sub_num_blocks;
197 	cached_block*	first_block;
198 	block_list		blocks;
199 	ListenerList	listeners;
200 	bool			open;
201 	bool			has_sub_transaction;
202 	bigtime_t		last_used;
203 	int32			busy_writing_count;
204 };
205 
206 
207 class BlockWriter {
208 public:
209 								BlockWriter(block_cache* cache,
210 									size_t max = SIZE_MAX);
211 								~BlockWriter();
212 
213 			bool				Add(cached_block* block,
214 									hash_iterator* iterator = NULL);
215 			bool				Add(cache_transaction* transaction,
216 									hash_iterator* iterator,
217 									bool& hasLeftOvers);
218 
219 			status_t			Write(hash_iterator* iterator = NULL,
220 									bool canUnlock = true);
221 
222 			bool				DeletedTransaction() const
223 									{ return fDeletedTransaction; }
224 
225 	static	status_t			WriteBlock(block_cache* cache,
226 									cached_block* block);
227 
228 private:
229 			void*				_Data(cached_block* block) const;
230 			status_t			_WriteBlock(cached_block* block);
231 			void				_BlockDone(cached_block* block,
232 									hash_iterator* iterator);
233 			void				_UnmarkWriting(cached_block* block);
234 
235 	static	int					_CompareBlocks(const void* _blockA,
236 									const void* _blockB);
237 
238 private:
239 	static	const size_t		kBufferSize = 64;
240 
241 			block_cache*		fCache;
242 			cached_block*		fBuffer[kBufferSize];
243 			cached_block**		fBlocks;
244 			size_t				fCount;
245 			size_t				fTotal;
246 			size_t				fCapacity;
247 			size_t				fMax;
248 			status_t			fStatus;
249 			bool				fDeletedTransaction;
250 };
251 
252 
253 class TransactionLocking {
254 public:
255 	inline bool Lock(block_cache* cache)
256 	{
257 		mutex_lock(&cache->lock);
258 
259 		while (cache->busy_writing_count != 0) {
260 			// wait for all blocks to be written
261 			ConditionVariableEntry entry;
262 			cache->busy_writing_condition.Add(&entry);
263 			cache->busy_writing_waiters = true;
264 
265 			mutex_unlock(&cache->lock);
266 
267 			entry.Wait();
268 
269 			mutex_lock(&cache->lock);
270 		}
271 
272 		return true;
273 	}
274 
275 	inline void Unlock(block_cache* cache)
276 	{
277 		mutex_unlock(&cache->lock);
278 	}
279 };
280 
281 typedef AutoLocker<block_cache, TransactionLocking> TransactionLocker;
282 
283 
284 #if BLOCK_CACHE_BLOCK_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
285 namespace BlockTracing {
286 
287 class Action : public AbstractTraceEntry {
288 public:
289 	Action(block_cache* cache, cached_block* block)
290 		:
291 		fCache(cache),
292 		fBlockNumber(block->block_number),
293 		fIsDirty(block->is_dirty),
294 		fHasOriginal(block->original_data != NULL),
295 		fHasParent(block->parent_data != NULL),
296 		fTransactionID(-1),
297 		fPreviousID(-1)
298 	{
299 		if (block->transaction != NULL)
300 			fTransactionID = block->transaction->id;
301 		if (block->previous_transaction != NULL)
302 			fPreviousID = block->previous_transaction->id;
303 	}
304 
305 	virtual void AddDump(TraceOutput& out)
306 	{
307 		out.Print("block cache %p, %s %Ld, %c%c%c transaction %ld "
308 			"(previous id %ld)\n", fCache, _Action(), fBlockNumber,
309 			fIsDirty ? 'd' : '-', fHasOriginal ? 'o' : '-',
310 			fHasParent ? 'p' : '-', fTransactionID, fPreviousID);
311 	}
312 
313 	virtual const char* _Action() const = 0;
314 
315 private:
316 	block_cache*		fCache;
317 	uint64				fBlockNumber;
318 	bool				fIsDirty;
319 	bool				fHasOriginal;
320 	bool				fHasParent;
321 	int32				fTransactionID;
322 	int32				fPreviousID;
323 };
324 
325 class Get : public Action {
326 public:
327 	Get(block_cache* cache, cached_block* block)
328 		:
329 		Action(cache, block)
330 	{
331 		Initialized();
332 	}
333 
334 	virtual const char* _Action() const { return "get"; }
335 };
336 
337 class Put : public Action {
338 public:
339 	Put(block_cache* cache, cached_block* block)
340 		:
341 		Action(cache, block)
342 	{
343 		Initialized();
344 	}
345 
346 	virtual const char* _Action() const { return "put"; }
347 };
348 
349 class Read : public Action {
350 public:
351 	Read(block_cache* cache, cached_block* block)
352 		:
353 		Action(cache, block)
354 	{
355 		Initialized();
356 	}
357 
358 	virtual const char* _Action() const { return "read"; }
359 };
360 
361 class Write : public Action {
362 public:
363 	Write(block_cache* cache, cached_block* block)
364 		:
365 		Action(cache, block)
366 	{
367 		Initialized();
368 	}
369 
370 	virtual const char* _Action() const { return "write"; }
371 };
372 
373 class Flush : public Action {
374 public:
375 	Flush(block_cache* cache, cached_block* block, bool getUnused = false)
376 		:
377 		Action(cache, block),
378 		fGetUnused(getUnused)
379 	{
380 		Initialized();
381 	}
382 
383 	virtual const char* _Action() const
384 		{ return fGetUnused ? "get-unused" : "flush"; }
385 
386 private:
387 	bool	fGetUnused;
388 };
389 
390 class Error : public AbstractTraceEntry {
391 public:
392 	Error(block_cache* cache, uint64 blockNumber, const char* message,
393 			status_t status = B_OK)
394 		:
395 		fCache(cache),
396 		fBlockNumber(blockNumber),
397 		fMessage(message),
398 		fStatus(status)
399 	{
400 		Initialized();
401 	}
402 
403 	virtual void AddDump(TraceOutput& out)
404 	{
405 		out.Print("block cache %p, error %Ld, %s%s%s",
406 			fCache, fBlockNumber, fMessage, fStatus != B_OK ? ": " : "",
407 			fStatus != B_OK ? strerror(fStatus) : "");
408 	}
409 
410 private:
411 	block_cache*	fCache;
412 	uint64			fBlockNumber;
413 	const char*		fMessage;
414 	status_t		fStatus;
415 };
416 
417 #if BLOCK_CACHE_BLOCK_TRACING >= 2
418 class BlockData : public AbstractTraceEntry {
419 public:
420 	enum {
421 		kCurrent	= 0x01,
422 		kParent		= 0x02,
423 		kOriginal	= 0x04
424 	};
425 
426 	BlockData(block_cache* cache, cached_block* block, const char* message)
427 		:
428 		fCache(cache),
429 		fSize(cache->block_size),
430 		fBlockNumber(block->block_number),
431 		fMessage(message)
432 	{
433 		_Allocate(fCurrent, block->current_data);
434 		_Allocate(fParent, block->parent_data);
435 		_Allocate(fOriginal, block->original_data);
436 
437 #if KTRACE_PRINTF_STACK_TRACE
438 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
439 			false);
440 #endif
441 
442 		Initialized();
443 	}
444 
445 	virtual void AddDump(TraceOutput& out)
446 	{
447 		out.Print("block cache %p, block %Ld, data %c%c%c: %s",
448 			fCache, fBlockNumber, fCurrent != NULL ? 'c' : '-',
449 			fParent != NULL ? 'p' : '-', fOriginal != NULL ? 'o' : '-',
450 			fMessage);
451 	}
452 
453 #if KTRACE_PRINTF_STACK_TRACE
454 	virtual void DumpStackTrace(TraceOutput& out)
455 	{
456 		out.PrintStackTrace(fStackTrace);
457 	}
458 #endif
459 
460 	void DumpBlocks(uint32 which, uint32 offset, uint32 size)
461 	{
462 		if ((which & kCurrent) != 0)
463 			DumpBlock(kCurrent, offset, size);
464 		if ((which & kParent) != 0)
465 			DumpBlock(kParent, offset, size);
466 		if ((which & kOriginal) != 0)
467 			DumpBlock(kOriginal, offset, size);
468 	}
469 
470 	void DumpBlock(uint32 which, uint32 offset, uint32 size)
471 	{
472 		if (offset > fSize) {
473 			kprintf("invalid offset (block size %lu)\n", fSize);
474 			return;
475 		}
476 		if (offset + size > fSize)
477 			size = fSize - offset;
478 
479 		const char* label;
480 		uint8* data;
481 
482 		if ((which & kCurrent) != 0) {
483 			label = "current";
484 			data = fCurrent;
485 		} else if ((which & kParent) != 0) {
486 			label = "parent";
487 			data = fParent;
488 		} else if ((which & kOriginal) != 0) {
489 			label = "original";
490 			data = fOriginal;
491 		} else
492 			return;
493 
494 		kprintf("%s: offset %lu, %lu bytes\n", label, offset, size);
495 
496 		static const uint32 kBlockSize = 16;
497 		data += offset;
498 
499 		for (uint32 i = 0; i < size;) {
500 			int start = i;
501 
502 			kprintf("  %04lx ", i);
503 			for (; i < start + kBlockSize; i++) {
504 				if (!(i % 4))
505 					kprintf(" ");
506 
507 				if (i >= size)
508 					kprintf("  ");
509 				else
510 					kprintf("%02x", data[i]);
511 			}
512 
513 			kprintf("\n");
514 		}
515 	}
516 
517 private:
518 	void _Allocate(uint8*& target, void* source)
519 	{
520 		if (source == NULL) {
521 			target = NULL;
522 			return;
523 		}
524 
525 		target = alloc_tracing_buffer_memcpy(source, fSize, false);
526 	}
527 
528 	block_cache*	fCache;
529 	uint32			fSize;
530 	uint64			fBlockNumber;
531 	const char*		fMessage;
532 	uint8*			fCurrent;
533 	uint8*			fParent;
534 	uint8*			fOriginal;
535 #if KTRACE_PRINTF_STACK_TRACE
536 	tracing_stack_trace* fStackTrace;
537 #endif
538 };
539 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
540 
541 }	// namespace BlockTracing
542 
543 #	define TB(x) new(std::nothrow) BlockTracing::x;
544 #else
545 #	define TB(x) ;
546 #endif
547 
548 #if BLOCK_CACHE_BLOCK_TRACING >= 2
549 #	define TB2(x) new(std::nothrow) BlockTracing::x;
550 #else
551 #	define TB2(x) ;
552 #endif
553 
554 
555 #if BLOCK_CACHE_TRANSACTION_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
556 namespace TransactionTracing {
557 
558 class Action : public AbstractTraceEntry {
559 public:
560 	Action(const char* label, block_cache* cache,
561 			cache_transaction* transaction)
562 		:
563 		fCache(cache),
564 		fTransaction(transaction),
565 		fID(transaction->id),
566 		fSub(transaction->has_sub_transaction),
567 		fNumBlocks(transaction->num_blocks),
568 		fSubNumBlocks(transaction->sub_num_blocks)
569 	{
570 		strlcpy(fLabel, label, sizeof(fLabel));
571 		Initialized();
572 	}
573 
574 	virtual void AddDump(TraceOutput& out)
575 	{
576 		out.Print("block cache %p, %s transaction %p (id %ld)%s"
577 			", %ld/%ld blocks", fCache, fLabel, fTransaction, fID,
578 			fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
579 	}
580 
581 private:
582 	char				fLabel[12];
583 	block_cache*		fCache;
584 	cache_transaction*	fTransaction;
585 	int32				fID;
586 	bool				fSub;
587 	int32				fNumBlocks;
588 	int32				fSubNumBlocks;
589 };
590 
591 class Detach : public AbstractTraceEntry {
592 public:
593 	Detach(block_cache* cache, cache_transaction* transaction,
594 			cache_transaction* newTransaction)
595 		:
596 		fCache(cache),
597 		fTransaction(transaction),
598 		fID(transaction->id),
599 		fSub(transaction->has_sub_transaction),
600 		fNewTransaction(newTransaction),
601 		fNewID(newTransaction->id)
602 	{
603 		Initialized();
604 	}
605 
606 	virtual void AddDump(TraceOutput& out)
607 	{
608 		out.Print("block cache %p, detach transaction %p (id %ld)"
609 			"from transaction %p (id %ld)%s",
610 			fCache, fNewTransaction, fNewID, fTransaction, fID,
611 			fSub ? " sub" : "");
612 	}
613 
614 private:
615 	block_cache*		fCache;
616 	cache_transaction*	fTransaction;
617 	int32				fID;
618 	bool				fSub;
619 	cache_transaction*	fNewTransaction;
620 	int32				fNewID;
621 };
622 
623 class Abort : public AbstractTraceEntry {
624 public:
625 	Abort(block_cache* cache, cache_transaction* transaction)
626 		:
627 		fCache(cache),
628 		fTransaction(transaction),
629 		fID(transaction->id),
630 		fNumBlocks(0)
631 	{
632 		bool isSub = transaction->has_sub_transaction;
633 		fNumBlocks = isSub ? transaction->sub_num_blocks
634 			: transaction->num_blocks;
635 		fBlocks = (off_t*)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
636 		if (fBlocks != NULL) {
637 			cached_block* block = transaction->first_block;
638 			for (int32 i = 0; block != NULL && i < fNumBlocks;
639 					block = block->transaction_next) {
640 				fBlocks[i++] = block->block_number;
641 			}
642 		} else
643 			fNumBlocks = 0;
644 
645 #if KTRACE_PRINTF_STACK_TRACE
646 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
647 			false);
648 #endif
649 
650 		Initialized();
651 	}
652 
653 	virtual void AddDump(TraceOutput& out)
654 	{
655 		out.Print("block cache %p, abort transaction "
656 			"%p (id %ld), blocks", fCache, fTransaction, fID);
657 		for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
658 			out.Print(" %Ld", fBlocks[i]);
659 	}
660 
661 #if KTRACE_PRINTF_STACK_TRACE
662 	virtual void DumpStackTrace(TraceOutput& out)
663 	{
664 		out.PrintStackTrace(fStackTrace);
665 	}
666 #endif
667 
668 private:
669 	block_cache*		fCache;
670 	cache_transaction*	fTransaction;
671 	int32				fID;
672 	off_t*				fBlocks;
673 	int32				fNumBlocks;
674 #if KTRACE_PRINTF_STACK_TRACE
675 	tracing_stack_trace* fStackTrace;
676 #endif
677 };
678 
679 }	// namespace TransactionTracing
680 
681 #	define T(x) new(std::nothrow) TransactionTracing::x;
682 #else
683 #	define T(x) ;
684 #endif
685 
686 
687 static DoublyLinkedList<block_cache> sCaches;
688 static mutex sCachesLock = MUTEX_INITIALIZER("block caches");
689 static mutex sCachesMemoryUseLock
690 	= MUTEX_INITIALIZER("block caches memory use");
691 static size_t sUsedMemory;
692 static sem_id sEventSemaphore;
693 static mutex sNotificationsLock
694 	= MUTEX_INITIALIZER("block cache notifications");
695 static thread_id sNotifierWriterThread;
696 static DoublyLinkedListLink<block_cache> sMarkCache;
697 	// TODO: this only works if the link is the first entry of block_cache
698 static object_cache* sBlockCache;
699 
700 
701 //	#pragma mark - notifications/listener
702 
703 
704 /*!	Checks whether or not this is an event that closes a transaction. */
705 static inline bool
706 is_closing_event(int32 event)
707 {
708 	return (event & (TRANSACTION_ABORTED | TRANSACTION_ENDED)) != 0;
709 }
710 
711 
712 static inline bool
713 is_written_event(int32 event)
714 {
715 	return (event & TRANSACTION_WRITTEN) != 0;
716 }
717 
718 
719 /*!	From the specified \a notification, it will remove the lowest pending
720 	event, and return that one in \a _event.
721 	If there is no pending event anymore, it will return \c false.
722 */
723 static bool
724 get_next_pending_event(cache_notification* notification, int32* _event)
725 {
726 	for (int32 eventMask = 1; eventMask <= TRANSACTION_IDLE; eventMask <<= 1) {
727 		int32 pending = atomic_and(&notification->events_pending,
728 			~eventMask);
729 
730 		bool more = (pending & ~eventMask) != 0;
731 
732 		if ((pending & eventMask) != 0) {
733 			*_event = eventMask;
734 			return more;
735 		}
736 	}
737 
738 	return false;
739 }
740 
741 
742 static void
743 flush_pending_notifications(block_cache* cache)
744 {
745 	ASSERT_LOCKED_MUTEX(&sCachesLock);
746 
747 	while (true) {
748 		MutexLocker locker(sNotificationsLock);
749 
750 		cache_notification* notification = cache->pending_notifications.Head();
751 		if (notification == NULL)
752 			return;
753 
754 		bool deleteAfterEvent = false;
755 		int32 event = -1;
756 		if (!get_next_pending_event(notification, &event)) {
757 			// remove the notification if this was the last pending event
758 			cache->pending_notifications.Remove(notification);
759 			deleteAfterEvent = notification->delete_after_event;
760 		}
761 
762 		if (event >= 0) {
763 			// Notify listener, we need to copy the notification, as it might
764 			// be removed when we unlock the list.
765 			cache_notification copy = *notification;
766 			locker.Unlock();
767 
768 			copy.hook(copy.transaction_id, event, copy.data);
769 
770 			locker.Lock();
771 		}
772 
773 		if (deleteAfterEvent)
774 			delete notification;
775 	}
776 }
777 
778 
779 /*!	Flushes all pending notifications by calling the appropriate hook
780 	functions.
781 	Must not be called with a cache lock held.
782 */
783 static void
784 flush_pending_notifications()
785 {
786 	MutexLocker _(sCachesLock);
787 
788 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
789 	while (iterator.HasNext()) {
790 		block_cache* cache = iterator.Next();
791 
792 		flush_pending_notifications(cache);
793 	}
794 }
795 
796 
797 /*!	Initializes the \a notification as specified. */
798 static void
799 set_notification(cache_transaction* transaction,
800 	cache_notification &notification, int32 events,
801 	transaction_notification_hook hook, void* data)
802 {
803 	notification.transaction_id = transaction != NULL ? transaction->id : -1;
804 	notification.events_pending = 0;
805 	notification.events = events;
806 	notification.hook = hook;
807 	notification.data = data;
808 	notification.delete_after_event = false;
809 }
810 
811 
812 /*!	Makes sure the notification is deleted. It either deletes it directly,
813 	when possible, or marks it for deletion if the notification is pending.
814 */
815 static void
816 delete_notification(cache_notification* notification)
817 {
818 	MutexLocker locker(sNotificationsLock);
819 
820 	if (notification->events_pending != 0)
821 		notification->delete_after_event = true;
822 	else
823 		delete notification;
824 }
825 
826 
827 /*!	Adds the notification to the pending notifications list, or, if it's
828 	already part of it, updates its events_pending field.
829 	Also marks the notification to be deleted if \a deleteNotification
830 	is \c true.
831 	Triggers the notifier thread to run.
832 */
833 static void
834 add_notification(block_cache* cache, cache_notification* notification,
835 	int32 event, bool deleteNotification)
836 {
837 	if (notification->hook == NULL)
838 		return;
839 
840 	int32 pending = atomic_or(&notification->events_pending, event);
841 	if (pending == 0) {
842 		// not yet part of the notification list
843 		MutexLocker locker(sNotificationsLock);
844 		if (deleteNotification)
845 			notification->delete_after_event = true;
846 		cache->pending_notifications.Add(notification);
847 	} else if (deleteNotification) {
848 		// we might need to delete it ourselves if we're late
849 		delete_notification(notification);
850 	}
851 
852 	release_sem_etc(sEventSemaphore, 1, B_DO_NOT_RESCHEDULE);
853 		// We're probably still holding some locks that makes rescheduling
854 		// not a good idea at this point.
855 }
856 
857 
858 /*!	Notifies all interested listeners of this transaction about the \a event.
859 	If \a event is a closing event (ie. TRANSACTION_ENDED, and
860 	TRANSACTION_ABORTED), all listeners except those listening to
861 	TRANSACTION_WRITTEN will be removed.
862 */
863 static void
864 notify_transaction_listeners(block_cache* cache, cache_transaction* transaction,
865 	int32 event)
866 {
867 	T(Action("notify", cache, transaction));
868 
869 	bool isClosing = is_closing_event(event);
870 	bool isWritten = is_written_event(event);
871 
872 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
873 	while (iterator.HasNext()) {
874 		cache_listener* listener = iterator.Next();
875 
876 		bool remove = (isClosing && !is_written_event(listener->events))
877 			|| (isWritten && is_written_event(listener->events));
878 		if (remove)
879 			iterator.Remove();
880 
881 		if ((listener->events & event) != 0)
882 			add_notification(cache, listener, event, remove);
883 		else if (remove)
884 			delete_notification(listener);
885 	}
886 }
887 
888 
889 /*!	Removes and deletes all listeners that are still monitoring this
890 	transaction.
891 */
892 static void
893 remove_transaction_listeners(block_cache* cache, cache_transaction* transaction)
894 {
895 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
896 	while (iterator.HasNext()) {
897 		cache_listener* listener = iterator.Next();
898 		iterator.Remove();
899 
900 		delete_notification(listener);
901 	}
902 }
903 
904 
905 static status_t
906 add_transaction_listener(block_cache* cache, cache_transaction* transaction,
907 	int32 events, transaction_notification_hook hookFunction, void* data)
908 {
909 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
910 	while (iterator.HasNext()) {
911 		cache_listener* listener = iterator.Next();
912 
913 		if (listener->data == data && listener->hook == hookFunction) {
914 			// this listener already exists, just update it
915 			listener->events |= events;
916 			return B_OK;
917 		}
918 	}
919 
920 	cache_listener* listener = new(std::nothrow) cache_listener;
921 	if (listener == NULL)
922 		return B_NO_MEMORY;
923 
924 	set_notification(transaction, *listener, events, hookFunction, data);
925 	transaction->listeners.Add(listener);
926 	return B_OK;
927 }
928 
929 
930 //	#pragma mark - private transaction
931 
932 
933 cache_transaction::cache_transaction()
934 {
935 	num_blocks = 0;
936 	main_num_blocks = 0;
937 	sub_num_blocks = 0;
938 	first_block = NULL;
939 	open = true;
940 	has_sub_transaction = false;
941 	last_used = system_time();
942 	busy_writing_count = 0;
943 }
944 
945 
946 static int
947 transaction_compare(void* _transaction, const void* _id)
948 {
949 	cache_transaction* transaction = (cache_transaction*)_transaction;
950 	const int32* id = (const int32*)_id;
951 
952 	return transaction->id - *id;
953 }
954 
955 
956 static uint32
957 transaction_hash(void* _transaction, const void* _id, uint32 range)
958 {
959 	cache_transaction* transaction = (cache_transaction*)_transaction;
960 	const int32* id = (const int32*)_id;
961 
962 	if (transaction != NULL)
963 		return transaction->id % range;
964 
965 	return (uint32)*id % range;
966 }
967 
968 
969 static void
970 delete_transaction(block_cache* cache, cache_transaction* transaction)
971 {
972 	if (cache->last_transaction == transaction)
973 		cache->last_transaction = NULL;
974 
975 	remove_transaction_listeners(cache, transaction);
976 	delete transaction;
977 }
978 
979 
980 static cache_transaction*
981 lookup_transaction(block_cache* cache, int32 id)
982 {
983 	return (cache_transaction*)hash_lookup(cache->transaction_hash, &id);
984 }
985 
986 
987 /*!	Writes back any changes made to blocks in \a transaction that are still
988 	part of a previous transacton.
989 */
990 static status_t
991 write_blocks_in_previous_transaction(block_cache* cache,
992 	cache_transaction* transaction)
993 {
994 	BlockWriter writer(cache);
995 
996 	cached_block* block = transaction->first_block;
997 	for (; block != NULL; block = block->transaction_next) {
998 		if (block->previous_transaction != NULL) {
999 			// need to write back pending changes
1000 			writer.Add(block);
1001 		}
1002 	}
1003 
1004 	return writer.Write();
1005 }
1006 
1007 
1008 //	#pragma mark - cached_block
1009 
1010 
1011 bool
1012 cached_block::CanBeWritten() const
1013 {
1014 	return !busy_writing && !busy_reading
1015 		&& (previous_transaction != NULL
1016 			|| (transaction == NULL && is_dirty && !is_writing));
1017 }
1018 
1019 
1020 /*static*/ int
1021 cached_block::Compare(void* _cacheEntry, const void* _block)
1022 {
1023 	cached_block* cacheEntry = (cached_block*)_cacheEntry;
1024 	const off_t* block = (const off_t*)_block;
1025 
1026 	off_t diff = cacheEntry->block_number - *block;
1027 	if (diff > 0)
1028 		return 1;
1029 
1030 	return diff < 0 ? -1 : 0;
1031 }
1032 
1033 
1034 /*static*/ uint32
1035 cached_block::Hash(void* _cacheEntry, const void* _block, uint32 range)
1036 {
1037 	cached_block* cacheEntry = (cached_block*)_cacheEntry;
1038 	const off_t* block = (const off_t*)_block;
1039 
1040 	if (cacheEntry != NULL)
1041 		return cacheEntry->block_number % range;
1042 
1043 	return (uint64)*block % range;
1044 }
1045 
1046 
1047 //	#pragma mark - BlockWriter
1048 
1049 
1050 BlockWriter::BlockWriter(block_cache* cache, size_t max)
1051 	:
1052 	fCache(cache),
1053 	fBlocks(fBuffer),
1054 	fCount(0),
1055 	fTotal(0),
1056 	fCapacity(kBufferSize),
1057 	fMax(max),
1058 	fStatus(B_OK),
1059 	fDeletedTransaction(false)
1060 {
1061 }
1062 
1063 
1064 BlockWriter::~BlockWriter()
1065 {
1066 	if (fBlocks != fBuffer)
1067 		free(fBlocks);
1068 }
1069 
1070 
1071 /*!	Adds the specified block to the to be written array. If no more blocks can
1072 	be added, false is returned, otherwise true.
1073 */
1074 bool
1075 BlockWriter::Add(cached_block* block, hash_iterator* iterator)
1076 {
1077 	ASSERT(block->CanBeWritten());
1078 
1079 	if (fTotal == fMax)
1080 		return false;
1081 
1082 	if (fCount >= fCapacity) {
1083 		// Enlarge array if necessary
1084 		cached_block** newBlocks;
1085 		size_t newCapacity = max_c(256, fCapacity * 2);
1086 		if (fBlocks == fBuffer)
1087 			newBlocks = (cached_block**)malloc(newCapacity * sizeof(void*));
1088 		else {
1089 			newBlocks = (cached_block**)realloc(fBlocks,
1090 				newCapacity * sizeof(void*));
1091 		}
1092 
1093 		if (newBlocks == NULL) {
1094 			// Allocating a larger array failed - we need to write back what
1095 			// we have synchronously now (this will also clear the array)
1096 			Write(iterator, false);
1097 		} else {
1098 			if (fBlocks == fBuffer)
1099 				memcpy(newBlocks, fBuffer, kBufferSize * sizeof(void*));
1100 
1101 			fBlocks = newBlocks;
1102 			fCapacity = newCapacity;
1103 		}
1104 	}
1105 
1106 	fBlocks[fCount++] = block;
1107 	fTotal++;
1108 	block->busy_writing = true;
1109 	fCache->busy_writing_count++;
1110 	if (block->previous_transaction != NULL)
1111 		block->previous_transaction->busy_writing_count++;
1112 
1113 	return true;
1114 }
1115 
1116 
1117 /*!	Adds all blocks of the specified transaction to the to be written array.
1118 	If no more blocks can be added, false is returned, otherwise true.
1119 */
1120 bool
1121 BlockWriter::Add(cache_transaction* transaction, hash_iterator* iterator,
1122 	bool& hasLeftOvers)
1123 {
1124 	ASSERT(!transaction->open);
1125 
1126 	if (transaction->busy_writing_count != 0) {
1127 		hasLeftOvers = true;
1128 		return true;
1129 	}
1130 
1131 	hasLeftOvers = false;
1132 
1133 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
1134 	while (cached_block* block = blockIterator.Next()) {
1135 		if (!block->CanBeWritten()) {
1136 			// This block was already part of a previous transaction within this
1137 			// writer
1138 			hasLeftOvers = true;
1139 			continue;
1140 		}
1141 		if (!Add(block, iterator))
1142 			return false;
1143 
1144 		if (DeletedTransaction())
1145 			break;
1146 	}
1147 
1148 	return true;
1149 }
1150 
1151 
1152 /*! Cache must be locked when calling this method, but it will be unlocked
1153 	while the blocks are written back.
1154 */
1155 status_t
1156 BlockWriter::Write(hash_iterator* iterator, bool canUnlock)
1157 {
1158 	if (fCount == 0)
1159 		return B_OK;
1160 
1161 	if (canUnlock)
1162 		mutex_unlock(&fCache->lock);
1163 
1164 	// Sort blocks in their on-disk order
1165 	// TODO: ideally, this should be handled by the I/O scheduler
1166 
1167 	qsort(fBlocks, fCount, sizeof(void*), &_CompareBlocks);
1168 	fDeletedTransaction = false;
1169 
1170 	for (uint32 i = 0; i < fCount; i++) {
1171 		status_t status = _WriteBlock(fBlocks[i]);
1172 		if (status != B_OK) {
1173 			// propagate to global error handling
1174 			if (fStatus == B_OK)
1175 				fStatus = status;
1176 
1177 			_UnmarkWriting(fBlocks[i]);
1178 			fBlocks[i] = NULL;
1179 				// This block will not be marked clean
1180 		}
1181 	}
1182 
1183 	if (canUnlock)
1184 		mutex_lock(&fCache->lock);
1185 
1186 	for (uint32 i = 0; i < fCount; i++)
1187 		_BlockDone(fBlocks[i], iterator);
1188 
1189 	fCount = 0;
1190 	return fStatus;
1191 }
1192 
1193 
1194 /*!	Writes the specified \a block back to disk. It will always only write back
1195 	the oldest change of the block if it is part of more than one transaction.
1196 	It will automatically send out TRANSACTION_WRITTEN notices, as well as
1197 	delete transactions when they are no longer used, and \a deleteTransaction
1198 	is \c true.
1199 */
1200 /*static*/ status_t
1201 BlockWriter::WriteBlock(block_cache* cache, cached_block* block)
1202 {
1203 	BlockWriter writer(cache);
1204 
1205 	writer.Add(block);
1206 	return writer.Write();
1207 }
1208 
1209 
1210 void*
1211 BlockWriter::_Data(cached_block* block) const
1212 {
1213 	return block->previous_transaction != NULL && block->original_data != NULL
1214 		? block->original_data : block->current_data;
1215 		// We first need to write back changes from previous transactions
1216 }
1217 
1218 
1219 status_t
1220 BlockWriter::_WriteBlock(cached_block* block)
1221 {
1222 	ASSERT(block->busy_writing);
1223 
1224 	TRACE(("BlockWriter::_WriteBlock(block %Ld)\n", block->block_number));
1225 	TB(Write(fCache, block));
1226 	TB2(BlockData(fCache, block, "before write"));
1227 
1228 	size_t blockSize = fCache->block_size;
1229 
1230 	ssize_t written = write_pos(fCache->fd,
1231 		block->block_number * blockSize, _Data(block), blockSize);
1232 
1233 	if (written != (ssize_t)blockSize) {
1234 		TB(Error(fCache, block->block_number, "write failed", written));
1235 		FATAL(("could not write back block %Ld (%s)\n", block->block_number,
1236 			strerror(errno)));
1237 		if (written < 0)
1238 			return errno;
1239 
1240 		return B_IO_ERROR;
1241 	}
1242 
1243 	return B_OK;
1244 }
1245 
1246 
1247 void
1248 BlockWriter::_BlockDone(cached_block* block, hash_iterator* iterator)
1249 {
1250 	if (block == NULL) {
1251 		// An error occured when trying to write this block
1252 		return;
1253 	}
1254 
1255 	if (fCache->num_dirty_blocks > 0)
1256 		fCache->num_dirty_blocks--;
1257 
1258 	if (_Data(block) == block->current_data)
1259 		block->is_dirty = false;
1260 
1261 	_UnmarkWriting(block);
1262 
1263 	cache_transaction* previous = block->previous_transaction;
1264 	if (previous != NULL) {
1265 		previous->blocks.Remove(block);
1266 		block->previous_transaction = NULL;
1267 
1268 		if (block->original_data != NULL && block->transaction == NULL) {
1269 			// This block is not part of a transaction, so it does not need
1270 			// its original pointer anymore.
1271 			fCache->Free(block->original_data);
1272 			block->original_data = NULL;
1273 		}
1274 
1275 		// Has the previous transation been finished with that write?
1276 		if (--previous->num_blocks == 0) {
1277 			TRACE(("cache transaction %ld finished!\n", previous->id));
1278 			T(Action("written", fCache, previous));
1279 
1280 			notify_transaction_listeners(fCache, previous,
1281 				TRANSACTION_WRITTEN);
1282 
1283 			if (iterator != NULL)
1284 				hash_remove_current(fCache->transaction_hash, iterator);
1285 			else
1286 				hash_remove(fCache->transaction_hash, previous);
1287 
1288 			delete_transaction(fCache, previous);
1289 			fDeletedTransaction = true;
1290 		}
1291 	}
1292 	if (block->transaction == NULL && block->ref_count == 0 && !block->unused) {
1293 		// the block is no longer used
1294 		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1295 		block->unused = true;
1296 		fCache->unused_blocks.Add(block);
1297 		fCache->unused_block_count++;
1298 	}
1299 
1300 	TB2(BlockData(fCache, block, "after write"));
1301 }
1302 
1303 
1304 void
1305 BlockWriter::_UnmarkWriting(cached_block* block)
1306 {
1307 	block->busy_writing = false;
1308 	if (block->previous_transaction != NULL)
1309 		block->previous_transaction->busy_writing_count--;
1310 	fCache->busy_writing_count--;
1311 
1312 	if ((fCache->busy_writing_waiters && fCache->busy_writing_count == 0)
1313 		|| block->busy_writing_waiters) {
1314 		fCache->busy_writing_waiters = false;
1315 		block->busy_writing_waiters = false;
1316 		fCache->busy_writing_condition.NotifyAll();
1317 	}
1318 }
1319 
1320 
1321 /*static*/ int
1322 BlockWriter::_CompareBlocks(const void* _blockA, const void* _blockB)
1323 {
1324 	cached_block* blockA = *(cached_block**)_blockA;
1325 	cached_block* blockB = *(cached_block**)_blockB;
1326 
1327 	off_t diff = blockA->block_number - blockB->block_number;
1328 	if (diff > 0)
1329 		return 1;
1330 
1331 	return diff < 0 ? -1 : 0;
1332 }
1333 
1334 
1335 //	#pragma mark - block_cache
1336 
1337 
1338 block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
1339 		bool readOnly)
1340 	:
1341 	hash(NULL),
1342 	fd(_fd),
1343 	max_blocks(numBlocks),
1344 	block_size(blockSize),
1345 	next_transaction_id(1),
1346 	last_transaction(NULL),
1347 	transaction_hash(NULL),
1348 	buffer_cache(NULL),
1349 	unused_block_count(0),
1350 	busy_reading_count(0),
1351 	busy_reading_waiters(false),
1352 	busy_writing_count(0),
1353 	busy_writing_waiters(0),
1354 	num_dirty_blocks(0),
1355 	read_only(readOnly)
1356 {
1357 }
1358 
1359 
1360 /*! Should be called with the cache's lock held. */
1361 block_cache::~block_cache()
1362 {
1363 	unregister_low_resource_handler(&_LowMemoryHandler, this);
1364 
1365 	hash_uninit(transaction_hash);
1366 	hash_uninit(hash);
1367 
1368 	delete_object_cache(buffer_cache);
1369 
1370 	mutex_destroy(&lock);
1371 }
1372 
1373 
1374 status_t
1375 block_cache::Init()
1376 {
1377 	busy_reading_condition.Init(this, "cache block busy_reading");
1378 	busy_writing_condition.Init(this, "cache block busy writing");
1379 	condition_variable.Init(this, "cache transaction sync");
1380 	mutex_init(&lock, "block cache");
1381 
1382 	buffer_cache = create_object_cache_etc("block cache buffers", block_size,
1383 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1384 	if (buffer_cache == NULL)
1385 		return B_NO_MEMORY;
1386 
1387 	cached_block dummyBlock;
1388 	hash = hash_init(1024, offset_of_member(dummyBlock, next),
1389 		&cached_block::Compare, &cached_block::Hash);
1390 	if (hash == NULL)
1391 		return B_NO_MEMORY;
1392 
1393 	cache_transaction dummyTransaction;
1394 	transaction_hash = hash_init(16, offset_of_member(dummyTransaction, next),
1395 		&transaction_compare, &::transaction_hash);
1396 	if (transaction_hash == NULL)
1397 		return B_NO_MEMORY;
1398 
1399 	return register_low_resource_handler(&_LowMemoryHandler, this,
1400 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1401 			| B_KERNEL_RESOURCE_ADDRESS_SPACE, 0);
1402 }
1403 
1404 
1405 void
1406 block_cache::Free(void* buffer)
1407 {
1408 	if (buffer != NULL)
1409 		object_cache_free(buffer_cache, buffer, 0);
1410 }
1411 
1412 
1413 void*
1414 block_cache::Allocate()
1415 {
1416 	void* block = object_cache_alloc(buffer_cache, 0);
1417 	if (block != NULL)
1418 		return block;
1419 
1420 	// recycle existing before allocating a new one
1421 	RemoveUnusedBlocks(100);
1422 
1423 	return object_cache_alloc(buffer_cache, 0);
1424 }
1425 
1426 
1427 void
1428 block_cache::FreeBlock(cached_block* block)
1429 {
1430 	Free(block->current_data);
1431 
1432 	if (block->original_data != NULL || block->parent_data != NULL) {
1433 		panic("block_cache::FreeBlock(): %Ld, original %p, parent %p\n",
1434 			block->block_number, block->original_data, block->parent_data);
1435 	}
1436 
1437 #if BLOCK_CACHE_DEBUG_CHANGED
1438 	Free(block->compare);
1439 #endif
1440 
1441 	object_cache_free(sBlockCache, block, 0);
1442 }
1443 
1444 
1445 /*! Allocates a new block for \a blockNumber, ready for use */
1446 cached_block*
1447 block_cache::NewBlock(off_t blockNumber)
1448 {
1449 	cached_block* block = NULL;
1450 
1451 	if (low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1452 			| B_KERNEL_RESOURCE_ADDRESS_SPACE) != B_NO_LOW_RESOURCE) {
1453 		// recycle existing instead of allocating a new one
1454 		block = _GetUnusedBlock();
1455 	}
1456 	if (block == NULL) {
1457 		block = (cached_block*)object_cache_alloc(sBlockCache, 0);
1458 		if (block != NULL) {
1459 			block->current_data = Allocate();
1460 			if (block->current_data == NULL) {
1461 				object_cache_free(sBlockCache, block, 0);
1462 				return NULL;
1463 			}
1464 		} else {
1465 			TB(Error(this, blockNumber, "allocation failed"));
1466 			dprintf("block allocation failed, unused list is %sempty.\n",
1467 				unused_blocks.IsEmpty() ? "" : "not ");
1468 
1469 			// allocation failed, try to reuse an unused block
1470 			block = _GetUnusedBlock();
1471 			if (block == NULL) {
1472 				TB(Error(this, blockNumber, "get unused failed"));
1473 				FATAL(("could not allocate block!\n"));
1474 				return NULL;
1475 			}
1476 		}
1477 	}
1478 
1479 	block->block_number = blockNumber;
1480 	block->ref_count = 0;
1481 	block->last_accessed = 0;
1482 	block->transaction_next = NULL;
1483 	block->transaction = block->previous_transaction = NULL;
1484 	block->original_data = NULL;
1485 	block->parent_data = NULL;
1486 	block->busy_reading = false;
1487 	block->busy_writing = false;
1488 	block->is_writing = false;
1489 	block->is_dirty = false;
1490 	block->unused = false;
1491 	block->discard = false;
1492 	block->busy_reading_waiters = false;
1493 	block->busy_writing_waiters = false;
1494 #if BLOCK_CACHE_DEBUG_CHANGED
1495 	block->compare = NULL;
1496 #endif
1497 
1498 	return block;
1499 }
1500 
1501 
1502 void
1503 block_cache::FreeBlockParentData(cached_block* block)
1504 {
1505 	ASSERT(block->parent_data != NULL);
1506 	if (block->parent_data != block->current_data)
1507 		Free(block->parent_data);
1508 	block->parent_data = NULL;
1509 }
1510 
1511 
1512 void
1513 block_cache::RemoveUnusedBlocks(int32 count, int32 minSecondsOld)
1514 {
1515 	TRACE(("block_cache: remove up to %" B_PRId32 " unused blocks\n", count));
1516 
1517 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1518 			cached_block* block = iterator.Next();) {
1519 		if (minSecondsOld >= block->LastAccess()) {
1520 			// The list is sorted by last access
1521 			break;
1522 		}
1523 		if (block->busy_reading || block->busy_writing)
1524 			continue;
1525 
1526 		TB(Flush(this, block));
1527 		TRACE(("  remove block %Ld, last accessed %" B_PRId32 "\n",
1528 			block->block_number, block->last_accessed));
1529 
1530 		// this can only happen if no transactions are used
1531 		if (block->is_dirty && !block->discard) {
1532 			if (block->busy_writing)
1533 				continue;
1534 
1535 			BlockWriter::WriteBlock(this, block);
1536 		}
1537 
1538 		// remove block from lists
1539 		iterator.Remove();
1540 		unused_block_count--;
1541 		RemoveBlock(block);
1542 
1543 		if (--count <= 0)
1544 			break;
1545 	}
1546 }
1547 
1548 
1549 void
1550 block_cache::RemoveBlock(cached_block* block)
1551 {
1552 	hash_remove(hash, block);
1553 	FreeBlock(block);
1554 }
1555 
1556 
1557 /*!	Discards the block from a transaction (this method must not be called
1558 	for blocks not part of a transaction).
1559 */
1560 void
1561 block_cache::DiscardBlock(cached_block* block)
1562 {
1563 	ASSERT(block->discard);
1564 	ASSERT(block->previous_transaction == NULL);
1565 
1566 	if (block->parent_data != NULL)
1567 		FreeBlockParentData(block);
1568 
1569 	if (block->original_data != NULL) {
1570 		Free(block->original_data);
1571 		block->original_data = NULL;
1572 	}
1573 
1574 	RemoveBlock(block);
1575 }
1576 
1577 
1578 void
1579 block_cache::_LowMemoryHandler(void* data, uint32 resources, int32 level)
1580 {
1581 	TRACE(("block_cache: low memory handler called with level %ld\n", level));
1582 
1583 	// free some blocks according to the low memory state
1584 	// (if there is enough memory left, we don't free any)
1585 
1586 	block_cache* cache = (block_cache*)data;
1587 	int32 free = 0;
1588 	int32 secondsOld = 0;
1589 	switch (level) {
1590 		case B_NO_LOW_RESOURCE:
1591 			return;
1592 		case B_LOW_RESOURCE_NOTE:
1593 			free = cache->unused_block_count / 8;
1594 			secondsOld = 120;
1595 			break;
1596 		case B_LOW_RESOURCE_WARNING:
1597 			free = cache->unused_block_count / 4;
1598 			secondsOld = 10;
1599 			break;
1600 		case B_LOW_RESOURCE_CRITICAL:
1601 			free = cache->unused_block_count / 2;
1602 			secondsOld = 0;
1603 			break;
1604 	}
1605 
1606 	MutexLocker locker(&cache->lock);
1607 
1608 	if (!locker.IsLocked()) {
1609 		// If our block_cache were deleted, it could be that we had
1610 		// been called before that deletion went through, therefore,
1611 		// acquiring its lock might fail.
1612 		return;
1613 	}
1614 
1615 #ifdef TRACE_BLOCK_CACHE
1616 	uint32 oldUnused = cache->unused_block_count;
1617 #endif
1618 
1619 	cache->RemoveUnusedBlocks(free, secondsOld);
1620 
1621 	TRACE(("block_cache::_LowMemoryHandler(): %p: unused: %lu -> %lu\n", cache,
1622 		oldUnused, cache->unused_block_count));
1623 }
1624 
1625 
1626 cached_block*
1627 block_cache::_GetUnusedBlock()
1628 {
1629 	TRACE(("block_cache: get unused block\n"));
1630 
1631 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1632 			cached_block* block = iterator.Next();) {
1633 		TB(Flush(this, block, true));
1634 		// this can only happen if no transactions are used
1635 		if (block->is_dirty && !block->busy_writing && !block->discard)
1636 			BlockWriter::WriteBlock(this, block);
1637 
1638 		// remove block from lists
1639 		iterator.Remove();
1640 		unused_block_count--;
1641 		hash_remove(hash, block);
1642 
1643 		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1644 		block->unused = false;
1645 
1646 		// TODO: see if compare data is handled correctly here!
1647 #if BLOCK_CACHE_DEBUG_CHANGED
1648 		if (block->compare != NULL)
1649 			Free(block->compare);
1650 #endif
1651 		return block;
1652 	}
1653 
1654 	return NULL;
1655 }
1656 
1657 
1658 //	#pragma mark - private block functions
1659 
1660 
1661 /*!	Cache must be locked.
1662 */
1663 static void
1664 mark_block_busy_reading(block_cache* cache, cached_block* block)
1665 {
1666 	block->busy_reading = true;
1667 	cache->busy_reading_count++;
1668 }
1669 
1670 
1671 /*!	Cache must be locked.
1672 */
1673 static void
1674 mark_block_unbusy_reading(block_cache* cache, cached_block* block)
1675 {
1676 	block->busy_reading = false;
1677 	cache->busy_reading_count--;
1678 
1679 	if ((cache->busy_reading_waiters && cache->busy_reading_count == 0)
1680 		|| block->busy_reading_waiters) {
1681 		cache->busy_reading_waiters = false;
1682 		block->busy_reading_waiters = false;
1683 		cache->busy_reading_condition.NotifyAll();
1684 	}
1685 }
1686 
1687 
1688 /*!	Cache must be locked.
1689 */
1690 static void
1691 wait_for_busy_reading_block(block_cache* cache, cached_block* block)
1692 {
1693 	while (block->busy_reading) {
1694 		// wait for at least the specified block to be read in
1695 		ConditionVariableEntry entry;
1696 		cache->busy_reading_condition.Add(&entry);
1697 		block->busy_reading_waiters = true;
1698 
1699 		mutex_unlock(&cache->lock);
1700 
1701 		entry.Wait();
1702 
1703 		mutex_lock(&cache->lock);
1704 	}
1705 }
1706 
1707 
1708 /*!	Cache must be locked.
1709 */
1710 static void
1711 wait_for_busy_reading_blocks(block_cache* cache)
1712 {
1713 	while (cache->busy_reading_count != 0) {
1714 		// wait for all blocks to be read in
1715 		ConditionVariableEntry entry;
1716 		cache->busy_reading_condition.Add(&entry);
1717 		cache->busy_reading_waiters = true;
1718 
1719 		mutex_unlock(&cache->lock);
1720 
1721 		entry.Wait();
1722 
1723 		mutex_lock(&cache->lock);
1724 	}
1725 }
1726 
1727 
1728 /*!	Cache must be locked.
1729 */
1730 static void
1731 wait_for_busy_writing_block(block_cache* cache, cached_block* block)
1732 {
1733 	while (block->busy_writing) {
1734 		// wait for all blocks to be written back
1735 		ConditionVariableEntry entry;
1736 		cache->busy_writing_condition.Add(&entry);
1737 		block->busy_writing_waiters = true;
1738 
1739 		mutex_unlock(&cache->lock);
1740 
1741 		entry.Wait();
1742 
1743 		mutex_lock(&cache->lock);
1744 	}
1745 }
1746 
1747 
1748 /*!	Cache must be locked.
1749 */
1750 static void
1751 wait_for_busy_writing_blocks(block_cache* cache)
1752 {
1753 	while (cache->busy_writing_count != 0) {
1754 		// wait for all blocks to be written back
1755 		ConditionVariableEntry entry;
1756 		cache->busy_writing_condition.Add(&entry);
1757 		cache->busy_writing_waiters = true;
1758 
1759 		mutex_unlock(&cache->lock);
1760 
1761 		entry.Wait();
1762 
1763 		mutex_lock(&cache->lock);
1764 	}
1765 }
1766 
1767 
1768 /*!	Removes a reference from the specified \a block. If this was the last
1769 	reference, the block is moved into the unused list.
1770 	In low memory situations, it will also free some blocks from that list,
1771 	but not necessarily the \a block it just released.
1772 */
1773 static void
1774 put_cached_block(block_cache* cache, cached_block* block)
1775 {
1776 #if BLOCK_CACHE_DEBUG_CHANGED
1777 	if (!block->is_dirty && block->compare != NULL
1778 		&& memcmp(block->current_data, block->compare, cache->block_size)) {
1779 		dprintf("new block:\n");
1780 		dump_block((const char*)block->current_data, 256, "  ");
1781 		dprintf("unchanged block:\n");
1782 		dump_block((const char*)block->compare, 256, "  ");
1783 		BlockWriter::WriteBlock(cache, block);
1784 		panic("block_cache: supposed to be clean block was changed!\n");
1785 
1786 		cache->Free(block->compare);
1787 		block->compare = NULL;
1788 	}
1789 #endif
1790 	TB(Put(cache, block));
1791 
1792 	if (block->ref_count < 1) {
1793 		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
1794 		return;
1795 	}
1796 
1797 	if (--block->ref_count == 0
1798 		&& block->transaction == NULL && block->previous_transaction == NULL) {
1799 		// This block is not used anymore, and not part of any transaction
1800 		block->is_writing = false;
1801 
1802 		if (block->discard) {
1803 			cache->RemoveBlock(block);
1804 		} else {
1805 			// put this block in the list of unused blocks
1806 			ASSERT(!block->unused);
1807 			block->unused = true;
1808 
1809 			ASSERT(block->original_data == NULL && block->parent_data == NULL);
1810 			cache->unused_blocks.Add(block);
1811 			cache->unused_block_count++;
1812 		}
1813 	}
1814 }
1815 
1816 
1817 static void
1818 put_cached_block(block_cache* cache, off_t blockNumber)
1819 {
1820 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1821 		panic("put_cached_block: invalid block number %lld (max %lld)",
1822 			blockNumber, cache->max_blocks - 1);
1823 	}
1824 
1825 	cached_block* block = (cached_block*)hash_lookup(cache->hash, &blockNumber);
1826 	if (block != NULL)
1827 		put_cached_block(cache, block);
1828 	else {
1829 		TB(Error(cache, blockNumber, "put unknown"));
1830 	}
1831 }
1832 
1833 
1834 /*!	Retrieves the block \a blockNumber from the hash table, if it's already
1835 	there, or reads it from the disk.
1836 	You need to have the cache locked when calling this function.
1837 
1838 	\param _allocated tells you whether or not a new block has been allocated
1839 		to satisfy your request.
1840 	\param readBlock if \c false, the block will not be read in case it was
1841 		not already in the cache. The block you retrieve may contain random
1842 		data. If \c true, the cache will be temporarily unlocked while the
1843 		block is read in.
1844 */
1845 static cached_block*
1846 get_cached_block(block_cache* cache, off_t blockNumber, bool* _allocated,
1847 	bool readBlock = true)
1848 {
1849 	ASSERT_LOCKED_MUTEX(&cache->lock);
1850 
1851 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1852 		panic("get_cached_block: invalid block number %lld (max %lld)",
1853 			blockNumber, cache->max_blocks - 1);
1854 		return NULL;
1855 	}
1856 
1857 retry:
1858 	cached_block* block = (cached_block*)hash_lookup(cache->hash,
1859 		&blockNumber);
1860 	*_allocated = false;
1861 
1862 	if (block == NULL) {
1863 		// put block into cache
1864 		block = cache->NewBlock(blockNumber);
1865 		if (block == NULL)
1866 			return NULL;
1867 
1868 		hash_insert_grow(cache->hash, block);
1869 		*_allocated = true;
1870 	} else if (block->busy_reading) {
1871 		// The block is currently busy_reading - wait and try again later
1872 		wait_for_busy_reading_block(cache, block);
1873 		goto retry;
1874 	}
1875 
1876 	if (block->unused) {
1877 		//TRACE(("remove block %Ld from unused\n", blockNumber));
1878 		block->unused = false;
1879 		cache->unused_blocks.Remove(block);
1880 		cache->unused_block_count--;
1881 	}
1882 
1883 	if (*_allocated && readBlock) {
1884 		// read block into cache
1885 		int32 blockSize = cache->block_size;
1886 
1887 		mark_block_busy_reading(cache, block);
1888 		mutex_unlock(&cache->lock);
1889 
1890 		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
1891 			block->current_data, blockSize);
1892 
1893 		mutex_lock(&cache->lock);
1894 		if (bytesRead < blockSize) {
1895 			cache->RemoveBlock(block);
1896 			TB(Error(cache, blockNumber, "read failed", bytesRead));
1897 
1898 			FATAL(("could not read block %Ld: bytesRead: %ld, error: %s\n",
1899 				blockNumber, bytesRead, strerror(errno)));
1900 			return NULL;
1901 		}
1902 		TB(Read(cache, block));
1903 
1904 		mark_block_unbusy_reading(cache, block);
1905 	}
1906 
1907 	block->ref_count++;
1908 	block->last_accessed = system_time() / 1000000L;
1909 
1910 	return block;
1911 }
1912 
1913 
1914 /*!	Returns the writable block data for the requested blockNumber.
1915 	If \a cleared is true, the block is not read from disk; an empty block
1916 	is returned.
1917 
1918 	This is the only method to insert a block into a transaction. It makes
1919 	sure that the previous block contents are preserved in that case.
1920 */
1921 static void*
1922 get_writable_cached_block(block_cache* cache, off_t blockNumber, off_t base,
1923 	off_t length, int32 transactionID, bool cleared)
1924 {
1925 	TRACE(("get_writable_cached_block(blockNumber = %Ld, transaction = %ld)\n",
1926 		blockNumber, transactionID));
1927 
1928 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1929 		panic("get_writable_cached_block: invalid block number %lld (max %lld)",
1930 			blockNumber, cache->max_blocks - 1);
1931 	}
1932 
1933 	bool allocated;
1934 	cached_block* block = get_cached_block(cache, blockNumber, &allocated,
1935 		!cleared);
1936 	if (block == NULL)
1937 		return NULL;
1938 
1939 	if (block->busy_writing)
1940 		wait_for_busy_writing_block(cache, block);
1941 
1942 	block->discard = false;
1943 
1944 	// if there is no transaction support, we just return the current block
1945 	if (transactionID == -1) {
1946 		if (cleared) {
1947 			mark_block_busy_reading(cache, block);
1948 			mutex_unlock(&cache->lock);
1949 
1950 			memset(block->current_data, 0, cache->block_size);
1951 
1952 			mutex_lock(&cache->lock);
1953 			mark_block_unbusy_reading(cache, block);
1954 		}
1955 
1956 		block->is_writing = true;
1957 
1958 		if (!block->is_dirty) {
1959 			cache->num_dirty_blocks++;
1960 			block->is_dirty = true;
1961 				// mark the block as dirty
1962 		}
1963 
1964 		TB(Get(cache, block));
1965 		return block->current_data;
1966 	}
1967 
1968 	cache_transaction* transaction = block->transaction;
1969 
1970 	if (transaction != NULL && transaction->id != transactionID) {
1971 		// TODO: we have to wait here until the other transaction is done.
1972 		//	Maybe we should even panic, since we can't prevent any deadlocks.
1973 		panic("get_writable_cached_block(): asked to get busy writable block "
1974 			"(transaction %ld)\n", block->transaction->id);
1975 		put_cached_block(cache, block);
1976 		return NULL;
1977 	}
1978 	if (transaction == NULL && transactionID != -1) {
1979 		// get new transaction
1980 		transaction = lookup_transaction(cache, transactionID);
1981 		if (transaction == NULL) {
1982 			panic("get_writable_cached_block(): invalid transaction %ld!\n",
1983 				transactionID);
1984 			put_cached_block(cache, block);
1985 			return NULL;
1986 		}
1987 		if (!transaction->open) {
1988 			panic("get_writable_cached_block(): transaction already done!\n");
1989 			put_cached_block(cache, block);
1990 			return NULL;
1991 		}
1992 
1993 		block->transaction = transaction;
1994 
1995 		// attach the block to the transaction block list
1996 		block->transaction_next = transaction->first_block;
1997 		transaction->first_block = block;
1998 		transaction->num_blocks++;
1999 	}
2000 	if (transaction != NULL)
2001 		transaction->last_used = system_time();
2002 
2003 	bool wasUnchanged = block->original_data == NULL
2004 		|| block->previous_transaction != NULL;
2005 
2006 	if (!(allocated && cleared) && block->original_data == NULL) {
2007 		// we already have data, so we need to preserve it
2008 		block->original_data = cache->Allocate();
2009 		if (block->original_data == NULL) {
2010 			TB(Error(cache, blockNumber, "allocate original failed"));
2011 			FATAL(("could not allocate original_data\n"));
2012 			put_cached_block(cache, block);
2013 			return NULL;
2014 		}
2015 
2016 		mark_block_busy_reading(cache, block);
2017 		mutex_unlock(&cache->lock);
2018 
2019 		memcpy(block->original_data, block->current_data, cache->block_size);
2020 
2021 		mutex_lock(&cache->lock);
2022 		mark_block_unbusy_reading(cache, block);
2023 	}
2024 	if (block->parent_data == block->current_data) {
2025 		// remember any previous contents for the parent transaction
2026 		block->parent_data = cache->Allocate();
2027 		if (block->parent_data == NULL) {
2028 			// TODO: maybe we should just continue the current transaction in
2029 			// this case...
2030 			TB(Error(cache, blockNumber, "allocate parent failed"));
2031 			FATAL(("could not allocate parent\n"));
2032 			put_cached_block(cache, block);
2033 			return NULL;
2034 		}
2035 
2036 		mark_block_busy_reading(cache, block);
2037 		mutex_unlock(&cache->lock);
2038 
2039 		memcpy(block->parent_data, block->current_data, cache->block_size);
2040 
2041 		mutex_lock(&cache->lock);
2042 		mark_block_unbusy_reading(cache, block);
2043 
2044 		transaction->sub_num_blocks++;
2045 	} else if (transaction != NULL && transaction->has_sub_transaction
2046 		&& block->parent_data == NULL && wasUnchanged)
2047 		transaction->sub_num_blocks++;
2048 
2049 	if (cleared) {
2050 		mark_block_busy_reading(cache, block);
2051 		mutex_unlock(&cache->lock);
2052 
2053 		memset(block->current_data, 0, cache->block_size);
2054 
2055 		mutex_lock(&cache->lock);
2056 		mark_block_unbusy_reading(cache, block);
2057 	}
2058 
2059 	block->is_dirty = true;
2060 	TB(Get(cache, block));
2061 	TB2(BlockData(cache, block, "get writable"));
2062 
2063 	return block->current_data;
2064 }
2065 
2066 
2067 #if DEBUG_BLOCK_CACHE
2068 
2069 
2070 static void
2071 dump_block(cached_block* block)
2072 {
2073 	kprintf("%08lx %9Ld %08lx %08lx %08lx %5ld %6ld %c%c%c%c%c%c %08lx %08lx\n",
2074 		(addr_t)block, block->block_number,
2075 		(addr_t)block->current_data, (addr_t)block->original_data,
2076 		(addr_t)block->parent_data, block->ref_count, block->LastAccess(),
2077 		block->busy_reading ? 'r' : '-', block->busy_writing ? 'w' : '-',
2078 		block->is_writing ? 'W' : '-', block->is_dirty ? 'D' : '-',
2079 		block->unused ? 'U' : '-', block->discard ? 'D' : '-',
2080 		(addr_t)block->transaction,
2081 		(addr_t)block->previous_transaction);
2082 }
2083 
2084 
2085 static void
2086 dump_block_long(cached_block* block)
2087 {
2088 	kprintf("BLOCK %p\n", block);
2089 	kprintf(" current data:  %p\n", block->current_data);
2090 	kprintf(" original data: %p\n", block->original_data);
2091 	kprintf(" parent data:   %p\n", block->parent_data);
2092 #if BLOCK_CACHE_DEBUG_CHANGED
2093 	kprintf(" compare data:  %p\n", block->compare);
2094 #endif
2095 	kprintf(" ref_count:     %ld\n", block->ref_count);
2096 	kprintf(" accessed:      %ld\n", block->LastAccess());
2097 	kprintf(" flags:        ");
2098 	if (block->busy_reading)
2099 		kprintf(" busy_reading");
2100 	if (block->busy_writing)
2101 		kprintf(" busy_writing");
2102 	if (block->is_writing)
2103 		kprintf(" is-writing");
2104 	if (block->is_dirty)
2105 		kprintf(" is-dirty");
2106 	if (block->unused)
2107 		kprintf(" unused");
2108 	if (block->discard)
2109 		kprintf(" discard");
2110 	kprintf("\n");
2111 	if (block->transaction != NULL) {
2112 		kprintf(" transaction:   %p (%ld)\n", block->transaction,
2113 			block->transaction->id);
2114 		if (block->transaction_next != NULL) {
2115 			kprintf(" next in transaction: %Ld\n",
2116 				block->transaction_next->block_number);
2117 		}
2118 	}
2119 	if (block->previous_transaction != NULL) {
2120 		kprintf(" previous transaction: %p (%ld)\n",
2121 			block->previous_transaction,
2122 			block->previous_transaction->id);
2123 	}
2124 
2125 	set_debug_variable("_current", (addr_t)block->current_data);
2126 	set_debug_variable("_original", (addr_t)block->original_data);
2127 	set_debug_variable("_parent", (addr_t)block->parent_data);
2128 }
2129 
2130 
2131 static int
2132 dump_cached_block(int argc, char** argv)
2133 {
2134 	if (argc != 2) {
2135 		kprintf("usage: %s <block-address>\n", argv[0]);
2136 		return 0;
2137 	}
2138 
2139 	dump_block_long((struct cached_block*)(addr_t)parse_expression(argv[1]));
2140 	return 0;
2141 }
2142 
2143 
2144 static int
2145 dump_cache(int argc, char** argv)
2146 {
2147 	bool showTransactions = false;
2148 	bool showBlocks = false;
2149 	int32 i = 1;
2150 	while (argv[i] != NULL && argv[i][0] == '-') {
2151 		for (char* arg = &argv[i][1]; arg[0]; arg++) {
2152 			switch (arg[0]) {
2153 				case 'b':
2154 					showBlocks = true;
2155 					break;
2156 				case 't':
2157 					showTransactions = true;
2158 					break;
2159 				default:
2160 					print_debugger_command_usage(argv[0]);
2161 					return 0;
2162 			}
2163 		}
2164 		i++;
2165 	}
2166 
2167 	if (i >= argc) {
2168 		print_debugger_command_usage(argv[0]);
2169 		return 0;
2170 	}
2171 
2172 	block_cache* cache = (struct block_cache*)(addr_t)parse_expression(argv[i]);
2173 	if (cache == NULL) {
2174 		kprintf("invalid cache address\n");
2175 		return 0;
2176 	}
2177 
2178 	off_t blockNumber = -1;
2179 	if (i + 1 < argc) {
2180 		blockNumber = parse_expression(argv[i + 1]);
2181 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
2182 			&blockNumber);
2183 		if (block != NULL)
2184 			dump_block_long(block);
2185 		else
2186 			kprintf("block %Ld not found\n", blockNumber);
2187 		return 0;
2188 	}
2189 
2190 	kprintf("BLOCK CACHE: %p\n", cache);
2191 
2192 	kprintf(" fd:           %d\n", cache->fd);
2193 	kprintf(" max_blocks:   %Ld\n", cache->max_blocks);
2194 	kprintf(" block_size:   %lu\n", cache->block_size);
2195 	kprintf(" next_transaction_id: %ld\n", cache->next_transaction_id);
2196 	kprintf(" buffer_cache: %p\n", cache->buffer_cache);
2197 	kprintf(" busy_reading: %lu, %s waiters\n", cache->busy_reading_count,
2198 		cache->busy_reading_waiters ? "has" : "no");
2199 	kprintf(" busy_writing: %lu, %s waiters\n", cache->busy_writing_count,
2200 		cache->busy_writing_waiters ? "has" : "no");
2201 
2202 	if (!cache->pending_notifications.IsEmpty()) {
2203 		kprintf(" pending notifications:\n");
2204 
2205 		NotificationList::Iterator iterator
2206 			= cache->pending_notifications.GetIterator();
2207 		while (iterator.HasNext()) {
2208 			cache_notification* notification = iterator.Next();
2209 
2210 			kprintf("  %p %5lx %p - %p\n", notification,
2211 				notification->events_pending, notification->hook,
2212 				notification->data);
2213 		}
2214 	}
2215 
2216 	if (showTransactions) {
2217 		kprintf(" transactions:\n");
2218 		kprintf("address       id state  blocks  main   sub\n");
2219 
2220 		hash_iterator iterator;
2221 		hash_open(cache->transaction_hash, &iterator);
2222 
2223 		cache_transaction* transaction;
2224 		while ((transaction = (cache_transaction*)hash_next(
2225 				cache->transaction_hash, &iterator)) != NULL) {
2226 			kprintf("%p %5ld %-7s %5ld %5ld %5ld\n", transaction,
2227 				transaction->id, transaction->open ? "open" : "closed",
2228 				transaction->num_blocks, transaction->main_num_blocks,
2229 				transaction->sub_num_blocks);
2230 		}
2231 	}
2232 
2233 	if (showBlocks) {
2234 		kprintf(" blocks:\n");
2235 		kprintf("address  block no. current  original parent    refs access "
2236 			"flags transact prev. trans\n");
2237 	}
2238 
2239 	uint32 referenced = 0;
2240 	uint32 count = 0;
2241 	uint32 dirty = 0;
2242 	uint32 discarded = 0;
2243 	hash_iterator iterator;
2244 	hash_open(cache->hash, &iterator);
2245 	cached_block* block;
2246 	while ((block = (cached_block*)hash_next(cache->hash, &iterator)) != NULL) {
2247 		if (showBlocks)
2248 			dump_block(block);
2249 
2250 		if (block->is_dirty)
2251 			dirty++;
2252 		if (block->discard)
2253 			discarded++;
2254 		if (block->ref_count)
2255 			referenced++;
2256 		count++;
2257 	}
2258 
2259 	kprintf(" %ld blocks total, %ld dirty, %ld discarded, %ld referenced, %ld "
2260 		"busy, %" B_PRIu32 " in unused.\n", count, dirty, discarded, referenced,
2261 		cache->busy_reading_count, cache->unused_block_count);
2262 
2263 	hash_close(cache->hash, &iterator, false);
2264 	return 0;
2265 }
2266 
2267 
2268 static int
2269 dump_transaction(int argc, char** argv)
2270 {
2271 	bool showBlocks = false;
2272 	int i = 1;
2273 	if (argc > 1 && !strcmp(argv[1], "-b")) {
2274 		showBlocks = true;
2275 		i++;
2276 	}
2277 
2278 	if (argc - i < 1 || argc - i > 2) {
2279 		print_debugger_command_usage(argv[0]);
2280 		return 0;
2281 	}
2282 
2283 	cache_transaction* transaction = NULL;
2284 
2285 	if (argc - i == 1) {
2286 		transaction = (cache_transaction*)(addr_t)parse_expression(argv[i]);
2287 	} else {
2288 		block_cache* cache = (block_cache*)(addr_t)parse_expression(argv[i]);
2289 		int32 id = parse_expression(argv[i + 1]);
2290 		transaction = lookup_transaction(cache, id);
2291 		if (transaction == NULL) {
2292 			kprintf("No transaction with ID %ld found.\n", id);
2293 			return 0;
2294 		}
2295 	}
2296 
2297 	kprintf("TRANSACTION %p\n", transaction);
2298 
2299 	kprintf(" id:             %ld\n", transaction->id);
2300 	kprintf(" num block:      %ld\n", transaction->num_blocks);
2301 	kprintf(" main num block: %ld\n", transaction->main_num_blocks);
2302 	kprintf(" sub num block:  %ld\n", transaction->sub_num_blocks);
2303 	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
2304 	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
2305 	kprintf(" idle:           %Ld secs\n",
2306 		(system_time() - transaction->last_used) / 1000000);
2307 
2308 	kprintf(" listeners:\n");
2309 
2310 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
2311 	while (iterator.HasNext()) {
2312 		cache_listener* listener = iterator.Next();
2313 
2314 		kprintf("  %p %5lx %p - %p\n", listener, listener->events_pending,
2315 			listener->hook, listener->data);
2316 	}
2317 
2318 	if (!showBlocks)
2319 		return 0;
2320 
2321 	kprintf(" blocks:\n");
2322 	kprintf("address  block no. current  original parent    refs access "
2323 		"flags transact prev. trans\n");
2324 
2325 	cached_block* block = transaction->first_block;
2326 	while (block != NULL) {
2327 		dump_block(block);
2328 		block = block->transaction_next;
2329 	}
2330 
2331 	kprintf("--\n");
2332 
2333 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
2334 	while (blockIterator.HasNext()) {
2335 		block = blockIterator.Next();
2336 		dump_block(block);
2337 	}
2338 
2339 	return 0;
2340 }
2341 
2342 
2343 static int
2344 dump_caches(int argc, char** argv)
2345 {
2346 	kprintf("Block caches:\n");
2347 	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
2348 	while (i.HasNext()) {
2349 		block_cache* cache = i.Next();
2350 		if (cache == (block_cache*)&sMarkCache)
2351 			continue;
2352 
2353 		kprintf("  %p\n", cache);
2354 	}
2355 
2356 	return 0;
2357 }
2358 
2359 
2360 #if BLOCK_CACHE_BLOCK_TRACING >= 2
2361 static int
2362 dump_block_data(int argc, char** argv)
2363 {
2364 	using namespace BlockTracing;
2365 
2366 	// Determine which blocks to show
2367 
2368 	bool printStackTrace = true;
2369 	uint32 which = 0;
2370 	int32 i = 1;
2371 	while (i < argc && argv[i][0] == '-') {
2372 		char* arg = &argv[i][1];
2373 		while (arg[0]) {
2374 			switch (arg[0]) {
2375 				case 'c':
2376 					which |= BlockData::kCurrent;
2377 					break;
2378 				case 'p':
2379 					which |= BlockData::kParent;
2380 					break;
2381 				case 'o':
2382 					which |= BlockData::kOriginal;
2383 					break;
2384 
2385 				default:
2386 					kprintf("invalid block specifier (only o/c/p are "
2387 						"allowed).\n");
2388 					return 0;
2389 			}
2390 			arg++;
2391 		}
2392 
2393 		i++;
2394 	}
2395 	if (which == 0)
2396 		which = BlockData::kCurrent | BlockData::kParent | BlockData::kOriginal;
2397 
2398 	if (i == argc) {
2399 		print_debugger_command_usage(argv[0]);
2400 		return 0;
2401 	}
2402 
2403 	// Get the range of blocks to print
2404 
2405 	int64 from = parse_expression(argv[i]);
2406 	int64 to = from;
2407 	if (argc > i + 1)
2408 		to = parse_expression(argv[i + 1]);
2409 	if (to < from)
2410 		to = from;
2411 
2412 	uint32 offset = 0;
2413 	uint32 size = LONG_MAX;
2414 	if (argc > i + 2)
2415 		offset = parse_expression(argv[i + 2]);
2416 	if (argc > i + 3)
2417 		size = parse_expression(argv[i + 3]);
2418 
2419 	TraceEntryIterator iterator;
2420 	iterator.MoveTo(from - 1);
2421 
2422 	static char sBuffer[1024];
2423 	LazyTraceOutput out(sBuffer, sizeof(sBuffer), TRACE_OUTPUT_TEAM_ID);
2424 
2425 	while (TraceEntry* entry = iterator.Next()) {
2426 		int32 index = iterator.Index();
2427 		if (index > to)
2428 			break;
2429 
2430 		Action* action = dynamic_cast<Action*>(entry);
2431 		if (action != NULL) {
2432 			out.Clear();
2433 			out.DumpEntry(action);
2434 			continue;
2435 		}
2436 
2437 		BlockData* blockData = dynamic_cast<BlockData*>(entry);
2438 		if (blockData == NULL)
2439 			continue;
2440 
2441 		out.Clear();
2442 
2443 		const char* dump = out.DumpEntry(entry);
2444 		int length = strlen(dump);
2445 		if (length > 0 && dump[length - 1] == '\n')
2446 			length--;
2447 
2448 		kprintf("%5ld. %.*s\n", index, length, dump);
2449 
2450 		if (printStackTrace) {
2451 			out.Clear();
2452 			entry->DumpStackTrace(out);
2453 			if (out.Size() > 0)
2454 				kputs(out.Buffer());
2455 		}
2456 
2457 		blockData->DumpBlocks(which, offset, size);
2458 	}
2459 
2460 	return 0;
2461 }
2462 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
2463 
2464 
2465 #endif	// DEBUG_BLOCK_CACHE
2466 
2467 
2468 /*!	Traverses through the block_cache list, and returns one cache after the
2469 	other. The cache returned is automatically locked when you get it, and
2470 	unlocked with the next call to this function. Ignores caches that are in
2471 	deletion state.
2472 	Returns \c NULL when the end of the list is reached.
2473 */
2474 static block_cache*
2475 get_next_locked_block_cache(block_cache* last)
2476 {
2477 	MutexLocker _(sCachesLock);
2478 
2479 	block_cache* cache;
2480 	if (last != NULL) {
2481 		mutex_unlock(&last->lock);
2482 
2483 		cache = sCaches.GetNext((block_cache*)&sMarkCache);
2484 		sCaches.Remove((block_cache*)&sMarkCache);
2485 	} else
2486 		cache = sCaches.Head();
2487 
2488 	if (cache != NULL) {
2489 		mutex_lock(&cache->lock);
2490 		sCaches.Insert(sCaches.GetNext(cache), (block_cache*)&sMarkCache);
2491 	}
2492 
2493 	return cache;
2494 }
2495 
2496 
2497 /*!	Background thread that continuously checks for pending notifications of
2498 	all caches.
2499 	Every two seconds, it will also write back up to 64 blocks per cache.
2500 */
2501 static status_t
2502 block_notifier_and_writer(void* /*data*/)
2503 {
2504 	const bigtime_t kTimeout = 2000000LL;
2505 	bigtime_t timeout = kTimeout;
2506 
2507 	while (true) {
2508 		bigtime_t start = system_time();
2509 
2510 		status_t status = acquire_sem_etc(sEventSemaphore, 1,
2511 			B_RELATIVE_TIMEOUT, timeout);
2512 		if (status == B_OK) {
2513 			flush_pending_notifications();
2514 			timeout -= system_time() - start;
2515 			continue;
2516 		}
2517 
2518 		// write 64 blocks of each block_cache every two seconds
2519 		// TODO: change this once we have an I/O scheduler
2520 		timeout = kTimeout;
2521 		size_t usedMemory;
2522 		object_cache_get_usage(sBlockCache, &usedMemory);
2523 
2524 		block_cache* cache = NULL;
2525 		while ((cache = get_next_locked_block_cache(cache)) != NULL) {
2526 			BlockWriter writer(cache, 64);
2527 
2528 			size_t cacheUsedMemory;
2529 			object_cache_get_usage(cache->buffer_cache, &cacheUsedMemory);
2530 			usedMemory += cacheUsedMemory;
2531 
2532 			if (cache->num_dirty_blocks) {
2533 				// This cache is not using transactions, we'll scan the blocks
2534 				// directly
2535 				hash_iterator iterator;
2536 				hash_open(cache->hash, &iterator);
2537 
2538 				cached_block* block;
2539 				while ((block = (cached_block*)hash_next(cache->hash, &iterator))
2540 						!= NULL) {
2541 					if (block->CanBeWritten() && !writer.Add(block))
2542 						break;
2543 				}
2544 
2545 				hash_close(cache->hash, &iterator, false);
2546 			} else {
2547 				hash_iterator iterator;
2548 				hash_open(cache->transaction_hash, &iterator);
2549 
2550 				cache_transaction* transaction;
2551 				while ((transaction = (cache_transaction*)hash_next(
2552 						cache->transaction_hash, &iterator)) != NULL) {
2553 					if (transaction->open) {
2554 						if (system_time() > transaction->last_used
2555 								+ kTransactionIdleTime) {
2556 							// Transaction is open but idle
2557 							notify_transaction_listeners(cache, transaction,
2558 								TRANSACTION_IDLE);
2559 						}
2560 						continue;
2561 					}
2562 
2563 					bool hasLeftOvers;
2564 						// we ignore this one
2565 					if (!writer.Add(transaction, &iterator, hasLeftOvers))
2566 						break;
2567 				}
2568 
2569 				hash_close(cache->transaction_hash, &iterator, false);
2570 			}
2571 
2572 			writer.Write();
2573 
2574 			if ((block_cache_used_memory() / B_PAGE_SIZE)
2575 					> vm_page_num_pages() / 2) {
2576 				// Try to reduce memory usage to half of the available
2577 				// RAM at maximum
2578 				cache->RemoveUnusedBlocks(1000, 10);
2579 			}
2580 		}
2581 
2582 		MutexLocker _(sCachesMemoryUseLock);
2583 		sUsedMemory = usedMemory;
2584 	}
2585 
2586 	// never can get here
2587 	return B_OK;
2588 }
2589 
2590 
2591 /*!	Notify function for wait_for_notifications(). */
2592 static void
2593 notify_sync(int32 transactionID, int32 event, void* _cache)
2594 {
2595 	block_cache* cache = (block_cache*)_cache;
2596 
2597 	cache->condition_variable.NotifyOne();
2598 }
2599 
2600 
2601 /*!	Must be called with the sCachesLock held. */
2602 static bool
2603 is_valid_cache(block_cache* cache)
2604 {
2605 	ASSERT_LOCKED_MUTEX(&sCachesLock);
2606 
2607 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
2608 	while (iterator.HasNext()) {
2609 		if (cache == iterator.Next())
2610 			return true;
2611 	}
2612 
2613 	return false;
2614 }
2615 
2616 
2617 /*!	Waits until all pending notifications are carried out.
2618 	Safe to be called from the block writer/notifier thread.
2619 	You must not hold the \a cache lock when calling this function.
2620 */
2621 static void
2622 wait_for_notifications(block_cache* cache)
2623 {
2624 	MutexLocker locker(sCachesLock);
2625 
2626 	if (find_thread(NULL) == sNotifierWriterThread) {
2627 		// We're the notifier thread, don't wait, but flush all pending
2628 		// notifications directly.
2629 		if (is_valid_cache(cache))
2630 			flush_pending_notifications(cache);
2631 		return;
2632 	}
2633 
2634 	// add sync notification
2635 	cache_notification notification;
2636 	set_notification(NULL, notification, TRANSACTION_WRITTEN, notify_sync,
2637 		cache);
2638 
2639 	ConditionVariableEntry entry;
2640 	cache->condition_variable.Add(&entry);
2641 
2642 	add_notification(cache, &notification, TRANSACTION_WRITTEN, false);
2643 	locker.Unlock();
2644 
2645 	// wait for notification hook to be called
2646 	entry.Wait();
2647 }
2648 
2649 
2650 status_t
2651 block_cache_init(void)
2652 {
2653 	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
2654 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
2655 	if (sBlockCache == NULL)
2656 		return B_NO_MEMORY;
2657 
2658 	new (&sCaches) DoublyLinkedList<block_cache>;
2659 		// manually call constructor
2660 
2661 	sEventSemaphore = create_sem(0, "block cache event");
2662 	if (sEventSemaphore < B_OK)
2663 		return sEventSemaphore;
2664 
2665 	sNotifierWriterThread = spawn_kernel_thread(&block_notifier_and_writer,
2666 		"block notifier/writer", B_LOW_PRIORITY, NULL);
2667 	if (sNotifierWriterThread >= B_OK)
2668 		resume_thread(sNotifierWriterThread);
2669 
2670 #if DEBUG_BLOCK_CACHE
2671 	add_debugger_command_etc("block_caches", &dump_caches,
2672 		"dumps all block caches", "\n", 0);
2673 	add_debugger_command_etc("block_cache", &dump_cache,
2674 		"dumps a specific block cache",
2675 		"[-bt] <cache-address> [block-number]\n"
2676 		"  -t lists the transactions\n"
2677 		"  -b lists all blocks\n", 0);
2678 	add_debugger_command("cached_block", &dump_cached_block,
2679 		"dumps the specified cached block");
2680 	add_debugger_command_etc("transaction", &dump_transaction,
2681 		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
2682 		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
2683 		"  -b lists all blocks that are part of this transaction\n", 0);
2684 #	if BLOCK_CACHE_BLOCK_TRACING >= 2
2685 	add_debugger_command_etc("block_cache_data", &dump_block_data,
2686 		"dumps the data blocks logged for the actions",
2687 		"[-cpo] <from> [<to> [<offset> [<size>]]]\n"
2688 		"If no data specifier is used, all blocks are shown by default.\n"
2689 		" -c       the current data is shown, if available.\n"
2690 		" -p       the parent data is shown, if available.\n"
2691 		" -o       the original data is shown, if available.\n"
2692 		" <from>   first index of tracing entries to show.\n"
2693 		" <to>     if given, the last entry. If not, only <from> is shown.\n"
2694 		" <offset> the offset of the block data.\n"
2695 		" <from>   the size of the block data that is dumped\n", 0);
2696 #	endif
2697 #endif	// DEBUG_BLOCK_CACHE
2698 
2699 	return B_OK;
2700 }
2701 
2702 
2703 size_t
2704 block_cache_used_memory(void)
2705 {
2706 	MutexLocker _(sCachesMemoryUseLock);
2707 	return sUsedMemory;
2708 }
2709 
2710 
2711 //	#pragma mark - public transaction API
2712 
2713 
2714 int32
2715 cache_start_transaction(void* _cache)
2716 {
2717 	block_cache* cache = (block_cache*)_cache;
2718 	TransactionLocker locker(cache);
2719 
2720 	if (cache->last_transaction && cache->last_transaction->open) {
2721 		panic("last transaction (%ld) still open!\n",
2722 			cache->last_transaction->id);
2723 	}
2724 
2725 	cache_transaction* transaction = new(std::nothrow) cache_transaction;
2726 	if (transaction == NULL)
2727 		return B_NO_MEMORY;
2728 
2729 	transaction->id = atomic_add(&cache->next_transaction_id, 1);
2730 	cache->last_transaction = transaction;
2731 
2732 	TRACE(("cache_start_transaction(): id %ld started\n", transaction->id));
2733 	T(Action("start", cache, transaction));
2734 
2735 	hash_insert_grow(cache->transaction_hash, transaction);
2736 
2737 	return transaction->id;
2738 }
2739 
2740 
2741 status_t
2742 cache_sync_transaction(void* _cache, int32 id)
2743 {
2744 	block_cache* cache = (block_cache*)_cache;
2745 	bool hadBusy;
2746 
2747 	TRACE(("cache_sync_transaction(id %ld)\n", id));
2748 
2749 	do {
2750 		TransactionLocker locker(cache);
2751 		hadBusy = false;
2752 
2753 		BlockWriter writer(cache);
2754 		hash_iterator iterator;
2755 		hash_open(cache->transaction_hash, &iterator);
2756 
2757 		cache_transaction* transaction;
2758 		while ((transaction = (cache_transaction*)hash_next(
2759 				cache->transaction_hash, &iterator)) != NULL) {
2760 			// close all earlier transactions which haven't been closed yet
2761 
2762 			if (transaction->busy_writing_count != 0) {
2763 				hadBusy = true;
2764 				continue;
2765 			}
2766 			if (transaction->id <= id && !transaction->open) {
2767 				// write back all of their remaining dirty blocks
2768 				T(Action("sync", cache, transaction));
2769 
2770 				bool hasLeftOvers;
2771 				writer.Add(transaction, &iterator, hasLeftOvers);
2772 
2773 				if (hasLeftOvers) {
2774 					// This transaction contains blocks that a previous
2775 					// transaction is trying to write back in this write run
2776 					hadBusy = true;
2777 				}
2778 			}
2779 		}
2780 
2781 		hash_close(cache->transaction_hash, &iterator, false);
2782 
2783 		status_t status = writer.Write();
2784 		if (status != B_OK)
2785 			return status;
2786 	} while (hadBusy);
2787 
2788 	wait_for_notifications(cache);
2789 		// make sure that all pending TRANSACTION_WRITTEN notifications
2790 		// are handled after we return
2791 	return B_OK;
2792 }
2793 
2794 
2795 status_t
2796 cache_end_transaction(void* _cache, int32 id,
2797 	transaction_notification_hook hook, void* data)
2798 {
2799 	block_cache* cache = (block_cache*)_cache;
2800 	TransactionLocker locker(cache);
2801 
2802 	TRACE(("cache_end_transaction(id = %ld)\n", id));
2803 
2804 	cache_transaction* transaction = lookup_transaction(cache, id);
2805 	if (transaction == NULL) {
2806 		panic("cache_end_transaction(): invalid transaction ID\n");
2807 		return B_BAD_VALUE;
2808 	}
2809 
2810 	// Write back all pending transaction blocks
2811 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
2812 	if (status != B_OK)
2813 		return status;
2814 
2815 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2816 
2817 	if (hook != NULL
2818 		&& add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN,
2819 			hook, data) != B_OK) {
2820 		return B_NO_MEMORY;
2821 	}
2822 
2823 	T(Action("end", cache, transaction));
2824 
2825 	// iterate through all blocks and free the unchanged original contents
2826 
2827 	cached_block* next;
2828 	for (cached_block* block = transaction->first_block; block != NULL;
2829 			block = next) {
2830 		next = block->transaction_next;
2831 		ASSERT(block->previous_transaction == NULL);
2832 
2833 		if (block->discard) {
2834 			// This block has been discarded in the transaction
2835 			cache->DiscardBlock(block);
2836 			transaction->num_blocks--;
2837 			continue;
2838 		}
2839 
2840 		if (block->original_data != NULL) {
2841 			cache->Free(block->original_data);
2842 			block->original_data = NULL;
2843 		}
2844 		if (block->parent_data != NULL) {
2845 			ASSERT(transaction->has_sub_transaction);
2846 			cache->FreeBlockParentData(block);
2847 		}
2848 
2849 		// move the block to the previous transaction list
2850 		transaction->blocks.Add(block);
2851 
2852 		block->previous_transaction = transaction;
2853 		block->transaction_next = NULL;
2854 		block->transaction = NULL;
2855 	}
2856 
2857 	transaction->open = false;
2858 	return B_OK;
2859 }
2860 
2861 
2862 status_t
2863 cache_abort_transaction(void* _cache, int32 id)
2864 {
2865 	block_cache* cache = (block_cache*)_cache;
2866 	TransactionLocker locker(cache);
2867 
2868 	TRACE(("cache_abort_transaction(id = %ld)\n", id));
2869 
2870 	cache_transaction* transaction = lookup_transaction(cache, id);
2871 	if (transaction == NULL) {
2872 		panic("cache_abort_transaction(): invalid transaction ID\n");
2873 		return B_BAD_VALUE;
2874 	}
2875 
2876 	T(Abort(cache, transaction));
2877 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
2878 
2879 	// iterate through all blocks and restore their original contents
2880 
2881 	cached_block* block = transaction->first_block;
2882 	cached_block* next;
2883 	for (; block != NULL; block = next) {
2884 		next = block->transaction_next;
2885 
2886 		if (block->original_data != NULL) {
2887 			TRACE(("cache_abort_transaction(id = %ld): restored contents of "
2888 				"block %Ld\n", transaction->id, block->block_number));
2889 			memcpy(block->current_data, block->original_data,
2890 				cache->block_size);
2891 			cache->Free(block->original_data);
2892 			block->original_data = NULL;
2893 		}
2894 		if (transaction->has_sub_transaction && block->parent_data != NULL)
2895 			cache->FreeBlockParentData(block);
2896 
2897 		block->transaction_next = NULL;
2898 		block->transaction = NULL;
2899 		block->discard = false;
2900 		if (block->previous_transaction == NULL)
2901 			block->is_dirty = false;
2902 	}
2903 
2904 	hash_remove(cache->transaction_hash, transaction);
2905 	delete_transaction(cache, transaction);
2906 	return B_OK;
2907 }
2908 
2909 
2910 /*!	Acknowledges the current parent transaction, and starts a new transaction
2911 	from its sub transaction.
2912 	The new transaction also gets a new transaction ID.
2913 */
2914 int32
2915 cache_detach_sub_transaction(void* _cache, int32 id,
2916 	transaction_notification_hook hook, void* data)
2917 {
2918 	block_cache* cache = (block_cache*)_cache;
2919 	TransactionLocker locker(cache);
2920 
2921 	TRACE(("cache_detach_sub_transaction(id = %ld)\n", id));
2922 
2923 	cache_transaction* transaction = lookup_transaction(cache, id);
2924 	if (transaction == NULL) {
2925 		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
2926 		return B_BAD_VALUE;
2927 	}
2928 	if (!transaction->has_sub_transaction)
2929 		return B_BAD_VALUE;
2930 
2931 	// iterate through all blocks and free the unchanged original contents
2932 
2933 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
2934 	if (status != B_OK)
2935 		return status;
2936 
2937 	// create a new transaction for the sub transaction
2938 	cache_transaction* newTransaction = new(std::nothrow) cache_transaction;
2939 	if (newTransaction == NULL)
2940 		return B_NO_MEMORY;
2941 
2942 	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
2943 	T(Detach(cache, transaction, newTransaction));
2944 
2945 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2946 
2947 	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
2948 			data) != B_OK) {
2949 		delete newTransaction;
2950 		return B_NO_MEMORY;
2951 	}
2952 
2953 	cached_block* last = NULL;
2954 	cached_block* next;
2955 	for (cached_block* block = transaction->first_block; block != NULL;
2956 			block = next) {
2957 		next = block->transaction_next;
2958 		ASSERT(block->previous_transaction == NULL);
2959 
2960 		if (block->discard) {
2961 			cache->DiscardBlock(block);
2962 			transaction->main_num_blocks--;
2963 			continue;
2964 		}
2965 
2966 		if (block->parent_data != NULL) {
2967 			// The block changed in the parent - free the original data, since
2968 			// they will be replaced by what is in current.
2969 			ASSERT(block->original_data != NULL);
2970 			cache->Free(block->original_data);
2971 
2972 			if (block->parent_data != block->current_data) {
2973 				// The block had been changed in both transactions
2974 				block->original_data = block->parent_data;
2975 			} else {
2976 				// The block has only been changed in the parent
2977 				block->original_data = NULL;
2978 			}
2979 
2980 			// move the block to the previous transaction list
2981 			transaction->blocks.Add(block);
2982 			block->previous_transaction = transaction;
2983 			block->parent_data = NULL;
2984 		}
2985 
2986 		if (block->original_data != NULL) {
2987 			// This block had been changed in the current sub transaction,
2988 			// we need to move this block over to the new transaction.
2989 			ASSERT(block->parent_data == NULL);
2990 
2991 			if (last == NULL)
2992 				newTransaction->first_block = block;
2993 			else
2994 				last->transaction_next = block;
2995 
2996 			block->transaction = newTransaction;
2997 			last = block;
2998 		} else
2999 			block->transaction = NULL;
3000 
3001 		block->transaction_next = NULL;
3002 	}
3003 
3004 	newTransaction->num_blocks = transaction->sub_num_blocks;
3005 
3006 	transaction->open = false;
3007 	transaction->has_sub_transaction = false;
3008 	transaction->num_blocks = transaction->main_num_blocks;
3009 	transaction->sub_num_blocks = 0;
3010 
3011 	hash_insert_grow(cache->transaction_hash, newTransaction);
3012 	cache->last_transaction = newTransaction;
3013 
3014 	return newTransaction->id;
3015 }
3016 
3017 
3018 status_t
3019 cache_abort_sub_transaction(void* _cache, int32 id)
3020 {
3021 	block_cache* cache = (block_cache*)_cache;
3022 	TransactionLocker locker(cache);
3023 
3024 	TRACE(("cache_abort_sub_transaction(id = %ld)\n", id));
3025 
3026 	cache_transaction* transaction = lookup_transaction(cache, id);
3027 	if (transaction == NULL) {
3028 		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
3029 		return B_BAD_VALUE;
3030 	}
3031 	if (!transaction->has_sub_transaction)
3032 		return B_BAD_VALUE;
3033 
3034 	T(Abort(cache, transaction));
3035 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3036 
3037 	// revert all changes back to the version of the parent
3038 
3039 	cached_block* block = transaction->first_block;
3040 	cached_block* last = NULL;
3041 	cached_block* next;
3042 	for (; block != NULL; block = next) {
3043 		next = block->transaction_next;
3044 
3045 		if (block->parent_data == NULL) {
3046 			// The parent transaction didn't change the block, but the sub
3047 			// transaction did - we need to revert to the original data.
3048 			// The block is no longer part of the transaction
3049 			if (block->original_data != NULL) {
3050 				// The block might not have original data if was empty
3051 				memcpy(block->current_data, block->original_data,
3052 					cache->block_size);
3053 			}
3054 
3055 			if (last != NULL)
3056 				last->transaction_next = next;
3057 			else
3058 				transaction->first_block = next;
3059 
3060 			block->transaction_next = NULL;
3061 			block->transaction = NULL;
3062 			transaction->num_blocks--;
3063 
3064 			if (block->previous_transaction == NULL) {
3065 				cache->Free(block->original_data);
3066 				block->original_data = NULL;
3067 				block->is_dirty = false;
3068 
3069 				if (block->ref_count == 0) {
3070 					// Move the block into the unused list if possible
3071 					block->unused = true;
3072 					cache->unused_blocks.Add(block);
3073 					cache->unused_block_count++;
3074 				}
3075 			}
3076 		} else {
3077 			if (block->parent_data != block->current_data) {
3078 				// The block has been changed and must be restored - the block
3079 				// is still dirty and part of the transaction
3080 				TRACE(("cache_abort_sub_transaction(id = %ld): restored "
3081 					"contents of block %Ld\n", transaction->id,
3082 					block->block_number));
3083 				memcpy(block->current_data, block->parent_data,
3084 					cache->block_size);
3085 				cache->Free(block->parent_data);
3086 				// The block stays dirty
3087 			}
3088 			block->parent_data = NULL;
3089 			last = block;
3090 		}
3091 
3092 		block->discard = false;
3093 	}
3094 
3095 	// all subsequent changes will go into the main transaction
3096 	transaction->has_sub_transaction = false;
3097 	transaction->sub_num_blocks = 0;
3098 
3099 	return B_OK;
3100 }
3101 
3102 
3103 status_t
3104 cache_start_sub_transaction(void* _cache, int32 id)
3105 {
3106 	block_cache* cache = (block_cache*)_cache;
3107 	TransactionLocker locker(cache);
3108 
3109 	TRACE(("cache_start_sub_transaction(id = %ld)\n", id));
3110 
3111 	cache_transaction* transaction = lookup_transaction(cache, id);
3112 	if (transaction == NULL) {
3113 		panic("cache_start_sub_transaction(): invalid transaction ID %ld\n",
3114 			id);
3115 		return B_BAD_VALUE;
3116 	}
3117 
3118 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3119 
3120 	// move all changed blocks up to the parent
3121 
3122 	cached_block* block = transaction->first_block;
3123 	cached_block* next;
3124 	for (; block != NULL; block = next) {
3125 		next = block->transaction_next;
3126 
3127 		if (block->parent_data != NULL) {
3128 			// There already is an older sub transaction - we acknowledge
3129 			// its changes and move its blocks up to the parent
3130 			ASSERT(transaction->has_sub_transaction);
3131 			cache->FreeBlockParentData(block);
3132 		}
3133 		if (block->discard) {
3134 			// This block has been discarded in the parent transaction.
3135 			// Just throw away any changes made in this transaction, so that
3136 			// it can still be reverted to its original contents if needed
3137 			ASSERT(block->previous_transaction == NULL);
3138 			if (block->original_data != NULL) {
3139 				memcpy(block->current_data, block->original_data,
3140 					cache->block_size);
3141 				block->original_data = NULL;
3142 			}
3143 			continue;
3144 		}
3145 
3146 		// we "allocate" the parent data lazily, that means, we don't copy
3147 		// the data (and allocate memory for it) until we need to
3148 		block->parent_data = block->current_data;
3149 	}
3150 
3151 	// all subsequent changes will go into the sub transaction
3152 	transaction->has_sub_transaction = true;
3153 	transaction->main_num_blocks = transaction->num_blocks;
3154 	transaction->sub_num_blocks = 0;
3155 	T(Action("start-sub", cache, transaction));
3156 
3157 	return B_OK;
3158 }
3159 
3160 
3161 /*!	Adds a transaction listener that gets notified when the transaction
3162 	is ended, aborted, written, or idle as specified by \a events.
3163 	The listener gets automatically removed when the transaction ends.
3164 */
3165 status_t
3166 cache_add_transaction_listener(void* _cache, int32 id, int32 events,
3167 	transaction_notification_hook hook, void* data)
3168 {
3169 	block_cache* cache = (block_cache*)_cache;
3170 	TransactionLocker locker(cache);
3171 
3172 	cache_transaction* transaction = lookup_transaction(cache, id);
3173 	if (transaction == NULL)
3174 		return B_BAD_VALUE;
3175 
3176 	return add_transaction_listener(cache, transaction, events, hook, data);
3177 }
3178 
3179 
3180 status_t
3181 cache_remove_transaction_listener(void* _cache, int32 id,
3182 	transaction_notification_hook hookFunction, void* data)
3183 {
3184 	block_cache* cache = (block_cache*)_cache;
3185 	TransactionLocker locker(cache);
3186 
3187 	cache_transaction* transaction = lookup_transaction(cache, id);
3188 	if (transaction == NULL)
3189 		return B_BAD_VALUE;
3190 
3191 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
3192 	while (iterator.HasNext()) {
3193 		cache_listener* listener = iterator.Next();
3194 		if (listener->data == data && listener->hook == hookFunction) {
3195 			iterator.Remove();
3196 
3197 			if (listener->events_pending != 0) {
3198 				MutexLocker _(sNotificationsLock);
3199 				if (listener->events_pending != 0)
3200 					cache->pending_notifications.Remove(listener);
3201 			}
3202 			delete listener;
3203 			return B_OK;
3204 		}
3205 	}
3206 
3207 	return B_ENTRY_NOT_FOUND;
3208 }
3209 
3210 
3211 status_t
3212 cache_next_block_in_transaction(void* _cache, int32 id, bool mainOnly,
3213 	long* _cookie, off_t* _blockNumber, void** _data, void** _unchangedData)
3214 {
3215 	cached_block* block = (cached_block*)*_cookie;
3216 	block_cache* cache = (block_cache*)_cache;
3217 	TransactionLocker locker(cache);
3218 
3219 	cache_transaction* transaction = lookup_transaction(cache, id);
3220 	if (transaction == NULL || !transaction->open)
3221 		return B_BAD_VALUE;
3222 
3223 	if (block == NULL)
3224 		block = transaction->first_block;
3225 	else
3226 		block = block->transaction_next;
3227 
3228 	if (transaction->has_sub_transaction) {
3229 		if (mainOnly) {
3230 			// find next block that the parent changed
3231 			while (block != NULL && block->parent_data == NULL)
3232 				block = block->transaction_next;
3233 		} else {
3234 			// find next non-discarded block
3235 			while (block != NULL && block->discard)
3236 				block = block->transaction_next;
3237 		}
3238 	}
3239 
3240 	if (block == NULL)
3241 		return B_ENTRY_NOT_FOUND;
3242 
3243 	if (_blockNumber)
3244 		*_blockNumber = block->block_number;
3245 	if (_data)
3246 		*_data = mainOnly ? block->parent_data : block->current_data;
3247 	if (_unchangedData)
3248 		*_unchangedData = block->original_data;
3249 
3250 	*_cookie = (addr_t)block;
3251 	return B_OK;
3252 }
3253 
3254 
3255 int32
3256 cache_blocks_in_transaction(void* _cache, int32 id)
3257 {
3258 	block_cache* cache = (block_cache*)_cache;
3259 	TransactionLocker locker(cache);
3260 
3261 	cache_transaction* transaction = lookup_transaction(cache, id);
3262 	if (transaction == NULL)
3263 		return B_BAD_VALUE;
3264 
3265 	return transaction->num_blocks;
3266 }
3267 
3268 
3269 /*!	Returns the number of blocks that are part of the main transaction. If this
3270 	transaction does not have a sub transaction yet, this is the same value as
3271 	cache_blocks_in_transaction() would return.
3272 */
3273 int32
3274 cache_blocks_in_main_transaction(void* _cache, int32 id)
3275 {
3276 	block_cache* cache = (block_cache*)_cache;
3277 	TransactionLocker locker(cache);
3278 
3279 	cache_transaction* transaction = lookup_transaction(cache, id);
3280 	if (transaction == NULL)
3281 		return B_BAD_VALUE;
3282 
3283 	if (transaction->has_sub_transaction)
3284 		return transaction->main_num_blocks;
3285 
3286 	return transaction->num_blocks;
3287 }
3288 
3289 
3290 int32
3291 cache_blocks_in_sub_transaction(void* _cache, int32 id)
3292 {
3293 	block_cache* cache = (block_cache*)_cache;
3294 	TransactionLocker locker(cache);
3295 
3296 	cache_transaction* transaction = lookup_transaction(cache, id);
3297 	if (transaction == NULL)
3298 		return B_BAD_VALUE;
3299 
3300 	return transaction->sub_num_blocks;
3301 }
3302 
3303 
3304 //	#pragma mark - public block cache API
3305 
3306 
3307 void
3308 block_cache_delete(void* _cache, bool allowWrites)
3309 {
3310 	block_cache* cache = (block_cache*)_cache;
3311 
3312 	if (allowWrites)
3313 		block_cache_sync(cache);
3314 
3315 	mutex_lock(&sCachesLock);
3316 	sCaches.Remove(cache);
3317 	mutex_unlock(&sCachesLock);
3318 
3319 	mutex_lock(&cache->lock);
3320 
3321 	// wait for all blocks to become unbusy
3322 	wait_for_busy_reading_blocks(cache);
3323 	wait_for_busy_writing_blocks(cache);
3324 
3325 	// free all blocks
3326 
3327 	uint32 cookie = 0;
3328 	cached_block* block;
3329 	while ((block = (cached_block*)hash_remove_first(cache->hash,
3330 			&cookie)) != NULL) {
3331 		cache->FreeBlock(block);
3332 	}
3333 
3334 	// free all transactions (they will all be aborted)
3335 
3336 	cookie = 0;
3337 	cache_transaction* transaction;
3338 	while ((transaction = (cache_transaction*)hash_remove_first(
3339 			cache->transaction_hash, &cookie)) != NULL) {
3340 		delete transaction;
3341 	}
3342 
3343 	delete cache;
3344 }
3345 
3346 
3347 void*
3348 block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
3349 {
3350 	block_cache* cache = new(std::nothrow) block_cache(fd, numBlocks, blockSize,
3351 		readOnly);
3352 	if (cache == NULL)
3353 		return NULL;
3354 
3355 	if (cache->Init() != B_OK) {
3356 		delete cache;
3357 		return NULL;
3358 	}
3359 
3360 	MutexLocker _(sCachesLock);
3361 	sCaches.Add(cache);
3362 
3363 	return cache;
3364 }
3365 
3366 
3367 status_t
3368 block_cache_sync(void* _cache)
3369 {
3370 	block_cache* cache = (block_cache*)_cache;
3371 
3372 	// We will sync all dirty blocks to disk that have a completed
3373 	// transaction or no transaction only
3374 
3375 	MutexLocker locker(&cache->lock);
3376 
3377 	BlockWriter writer(cache);
3378 	hash_iterator iterator;
3379 	hash_open(cache->hash, &iterator);
3380 
3381 	cached_block* block;
3382 	while ((block = (cached_block*)hash_next(cache->hash, &iterator)) != NULL) {
3383 		if (block->CanBeWritten())
3384 			writer.Add(block);
3385 	}
3386 
3387 	hash_close(cache->hash, &iterator, false);
3388 
3389 	status_t status = writer.Write();
3390 
3391 	locker.Unlock();
3392 
3393 	wait_for_notifications(cache);
3394 		// make sure that all pending TRANSACTION_WRITTEN notifications
3395 		// are handled after we return
3396 	return status;
3397 }
3398 
3399 
3400 status_t
3401 block_cache_sync_etc(void* _cache, off_t blockNumber, size_t numBlocks)
3402 {
3403 	block_cache* cache = (block_cache*)_cache;
3404 
3405 	// We will sync all dirty blocks to disk that have a completed
3406 	// transaction or no transaction only
3407 
3408 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
3409 		panic("block_cache_sync_etc: invalid block number %Ld (max %Ld)",
3410 			blockNumber, cache->max_blocks - 1);
3411 		return B_BAD_VALUE;
3412 	}
3413 
3414 	MutexLocker locker(&cache->lock);
3415 	BlockWriter writer(cache);
3416 
3417 	for (; numBlocks > 0; numBlocks--, blockNumber++) {
3418 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
3419 			&blockNumber);
3420 		if (block == NULL)
3421 			continue;
3422 
3423 		if (block->CanBeWritten())
3424 			writer.Add(block);
3425 	}
3426 
3427 	status_t status = writer.Write();
3428 
3429 	locker.Unlock();
3430 
3431 	wait_for_notifications(cache);
3432 		// make sure that all pending TRANSACTION_WRITTEN notifications
3433 		// are handled after we return
3434 	return status;
3435 }
3436 
3437 
3438 /*!	Discards a block from the current transaction or from the cache.
3439 	You have to call this function when you no longer use a block, ie. when it
3440 	might be reclaimed by the file cache in order to make sure they won't
3441 	interfere.
3442 */
3443 void
3444 block_cache_discard(void* _cache, off_t blockNumber, size_t numBlocks)
3445 {
3446 	// TODO: this could be a nice place to issue the ATA trim command
3447 	block_cache* cache = (block_cache*)_cache;
3448 	TransactionLocker locker(cache);
3449 
3450 	BlockWriter writer(cache);
3451 
3452 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3453 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
3454 			&blockNumber);
3455 		if (block != NULL && block->previous_transaction != NULL)
3456 			writer.Add(block);
3457 	}
3458 
3459 	writer.Write();
3460 		// TODO: this can fail, too!
3461 
3462 	blockNumber -= numBlocks;
3463 		// reset blockNumber to its original value
3464 
3465 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3466 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
3467 			&blockNumber);
3468 		if (block == NULL)
3469 			continue;
3470 
3471 		ASSERT(block->previous_transaction == NULL);
3472 
3473 		if (block->unused) {
3474 			cache->unused_blocks.Remove(block);
3475 			cache->unused_block_count--;
3476 			cache->RemoveBlock(block);
3477 		} else {
3478 			if (block->transaction != NULL && block->parent_data != NULL
3479 				&& block->parent_data != block->current_data) {
3480 				panic("Discarded block %Ld has already been changed in this "
3481 					"transaction!", blockNumber);
3482 			}
3483 
3484 			// mark it as discarded (in the current transaction only, if any)
3485 			block->discard = true;
3486 		}
3487 	}
3488 }
3489 
3490 
3491 status_t
3492 block_cache_make_writable(void* _cache, off_t blockNumber, int32 transaction)
3493 {
3494 	block_cache* cache = (block_cache*)_cache;
3495 	MutexLocker locker(&cache->lock);
3496 
3497 	if (cache->read_only) {
3498 		panic("tried to make block writable on a read-only cache!");
3499 		return B_ERROR;
3500 	}
3501 
3502 	// TODO: this can be done better!
3503 	void* block = get_writable_cached_block(cache, blockNumber,
3504 		blockNumber, 1, transaction, false);
3505 	if (block != NULL) {
3506 		put_cached_block((block_cache*)_cache, blockNumber);
3507 		return B_OK;
3508 	}
3509 
3510 	return B_ERROR;
3511 }
3512 
3513 
3514 void*
3515 block_cache_get_writable_etc(void* _cache, off_t blockNumber, off_t base,
3516 	off_t length, int32 transaction)
3517 {
3518 	block_cache* cache = (block_cache*)_cache;
3519 	MutexLocker locker(&cache->lock);
3520 
3521 	TRACE(("block_cache_get_writable_etc(block = %Ld, transaction = %ld)\n",
3522 		blockNumber, transaction));
3523 	if (cache->read_only)
3524 		panic("tried to get writable block on a read-only cache!");
3525 
3526 	return get_writable_cached_block(cache, blockNumber, base, length,
3527 		transaction, false);
3528 }
3529 
3530 
3531 void*
3532 block_cache_get_writable(void* _cache, off_t blockNumber, int32 transaction)
3533 {
3534 	return block_cache_get_writable_etc(_cache, blockNumber,
3535 		blockNumber, 1, transaction);
3536 }
3537 
3538 
3539 void*
3540 block_cache_get_empty(void* _cache, off_t blockNumber, int32 transaction)
3541 {
3542 	block_cache* cache = (block_cache*)_cache;
3543 	MutexLocker locker(&cache->lock);
3544 
3545 	TRACE(("block_cache_get_empty(block = %Ld, transaction = %ld)\n",
3546 		blockNumber, transaction));
3547 	if (cache->read_only)
3548 		panic("tried to get empty writable block on a read-only cache!");
3549 
3550 	return get_writable_cached_block((block_cache*)_cache, blockNumber,
3551 		blockNumber, 1, transaction, true);
3552 }
3553 
3554 
3555 const void*
3556 block_cache_get_etc(void* _cache, off_t blockNumber, off_t base, off_t length)
3557 {
3558 	block_cache* cache = (block_cache*)_cache;
3559 	MutexLocker locker(&cache->lock);
3560 	bool allocated;
3561 
3562 	cached_block* block = get_cached_block(cache, blockNumber, &allocated);
3563 	if (block == NULL)
3564 		return NULL;
3565 
3566 #if BLOCK_CACHE_DEBUG_CHANGED
3567 	if (block->compare == NULL)
3568 		block->compare = cache->Allocate();
3569 	if (block->compare != NULL)
3570 		memcpy(block->compare, block->current_data, cache->block_size);
3571 #endif
3572 	TB(Get(cache, block));
3573 
3574 	return block->current_data;
3575 }
3576 
3577 
3578 const void*
3579 block_cache_get(void* _cache, off_t blockNumber)
3580 {
3581 	return block_cache_get_etc(_cache, blockNumber, blockNumber, 1);
3582 }
3583 
3584 
3585 /*!	Changes the internal status of a writable block to \a dirty. This can be
3586 	helpful in case you realize you don't need to change that block anymore
3587 	for whatever reason.
3588 
3589 	Note, you must only use this function on blocks that were acquired
3590 	writable!
3591 */
3592 status_t
3593 block_cache_set_dirty(void* _cache, off_t blockNumber, bool dirty,
3594 	int32 transaction)
3595 {
3596 	block_cache* cache = (block_cache*)_cache;
3597 	MutexLocker locker(&cache->lock);
3598 
3599 	cached_block* block = (cached_block*)hash_lookup(cache->hash,
3600 		&blockNumber);
3601 	if (block == NULL)
3602 		return B_BAD_VALUE;
3603 	if (block->is_dirty == dirty) {
3604 		// there is nothing to do for us
3605 		return B_OK;
3606 	}
3607 
3608 	// TODO: not yet implemented
3609 	if (dirty)
3610 		panic("block_cache_set_dirty(): not yet implemented that way!\n");
3611 
3612 	return B_OK;
3613 }
3614 
3615 
3616 void
3617 block_cache_put(void* _cache, off_t blockNumber)
3618 {
3619 	block_cache* cache = (block_cache*)_cache;
3620 	MutexLocker locker(&cache->lock);
3621 
3622 	put_cached_block(cache, blockNumber);
3623 }
3624 
3625