xref: /haiku/src/system/kernel/cache/block_cache.cpp (revision 0562493379cd52eb7103531f895f10bb8e77c085)
1 /*
2  * Copyright 2004-2008, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include <block_cache.h>
8 
9 #include <unistd.h>
10 #include <signal.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <errno.h>
14 
15 #include <KernelExport.h>
16 #include <fs_cache.h>
17 
18 #include <condition_variable.h>
19 #include <lock.h>
20 #include <low_resource_manager.h>
21 #include <slab/Slab.h>
22 #include <tracing.h>
23 #include <util/kernel_cpp.h>
24 #include <util/DoublyLinkedList.h>
25 #include <util/AutoLock.h>
26 #include <util/khash.h>
27 
28 #include "kernel_debug_config.h"
29 
30 
31 // TODO: this is a naive but growing implementation to test the API:
32 //	1) block reading/writing is not at all optimized for speed, it will
33 //	   just read and write single blocks.
34 //	2) the locking could be improved; getting a block should not need to
35 //	   wait for blocks to be written
36 // TODO: the retrieval/copy of the original data could be delayed until the
37 //		new data must be written, ie. in low memory situations.
38 
39 //#define TRACE_BLOCK_CACHE
40 #ifdef TRACE_BLOCK_CACHE
41 #	define TRACE(x)	dprintf x
42 #else
43 #	define TRACE(x) ;
44 #endif
45 
46 // This macro is used for fatal situations that are acceptable in a running
47 // system, like out of memory situations - should only panic for debugging.
48 #define FATAL(x) panic x
49 
50 static const bigtime_t kTransactionIdleTime = 2000000LL;
51 	// a transaction is considered idle after 2 seconds of inactivity
52 
53 
54 struct cache_transaction;
55 struct cached_block;
56 struct block_cache;
57 typedef DoublyLinkedListLink<cached_block> block_link;
58 
59 struct cached_block {
60 	cached_block*	next;			// next in hash
61 	cached_block*	transaction_next;
62 	block_link		link;
63 	off_t			block_number;
64 	void*			current_data;
65 	void*			original_data;
66 	void*			parent_data;
67 #if BLOCK_CACHE_DEBUG_CHANGED
68 	void*			compare;
69 #endif
70 	int32			ref_count;
71 	int32			accessed;
72 	bool			busy : 1;
73 	bool			is_writing : 1;
74 	bool			is_dirty : 1;
75 	bool			unused : 1;
76 	bool			discard : 1;
77 	cache_transaction* transaction;
78 	cache_transaction* previous_transaction;
79 
80 	static int Compare(void* _cacheEntry, const void* _block);
81 	static uint32 Hash(void* _cacheEntry, const void* _block, uint32 range);
82 };
83 
84 typedef DoublyLinkedList<cached_block,
85 	DoublyLinkedListMemberGetLink<cached_block,
86 		&cached_block::link> > block_list;
87 
88 struct cache_notification : DoublyLinkedListLinkImpl<cache_notification> {
89 	int32			transaction_id;
90 	int32			events_pending;
91 	int32			events;
92 	transaction_notification_hook hook;
93 	void*			data;
94 	bool			delete_after_event;
95 };
96 
97 typedef DoublyLinkedList<cache_notification> NotificationList;
98 
99 struct block_cache : DoublyLinkedListLinkImpl<block_cache> {
100 	hash_table*		hash;
101 	mutex			lock;
102 	int				fd;
103 	off_t			max_blocks;
104 	size_t			block_size;
105 	int32			next_transaction_id;
106 	cache_transaction* last_transaction;
107 	hash_table*		transaction_hash;
108 
109 	object_cache*	buffer_cache;
110 	block_list		unused_blocks;
111 
112 	uint32			num_dirty_blocks;
113 	bool			read_only;
114 
115 	NotificationList pending_notifications;
116 	ConditionVariable condition_variable;
117 
118 					block_cache(int fd, off_t numBlocks, size_t blockSize,
119 						bool readOnly);
120 					~block_cache();
121 
122 	status_t		Init();
123 
124 	void			Free(void* buffer);
125 	void*			Allocate();
126 	void			RemoveUnusedBlocks(int32 maxAccessed = LONG_MAX,
127 						int32 count = LONG_MAX);
128 	void			RemoveBlock(cached_block* block);
129 	void			DiscardBlock(cached_block* block);
130 	void			FreeBlock(cached_block* block);
131 	cached_block*	NewBlock(off_t blockNumber);
132 
133 
134 private:
135 	static void		_LowMemoryHandler(void* data, uint32 resources,
136 						int32 level);
137 	cached_block*	_GetUnusedBlock();
138 };
139 
140 struct cache_listener;
141 typedef DoublyLinkedListLink<cache_listener> listener_link;
142 
143 struct cache_listener : cache_notification {
144 	listener_link	link;
145 };
146 
147 typedef DoublyLinkedList<cache_listener,
148 	DoublyLinkedListMemberGetLink<cache_listener,
149 		&cache_listener::link> > ListenerList;
150 
151 struct cache_transaction {
152 	cache_transaction();
153 
154 	cache_transaction* next;
155 	int32			id;
156 	int32			num_blocks;
157 	int32			main_num_blocks;
158 	int32			sub_num_blocks;
159 	cached_block*	first_block;
160 	block_list		blocks;
161 	ListenerList	listeners;
162 	bool			open;
163 	bool			has_sub_transaction;
164 	bigtime_t		last_used;
165 };
166 
167 #if BLOCK_CACHE_BLOCK_TRACING
168 namespace BlockTracing {
169 
170 class Action : public AbstractTraceEntry {
171 public:
172 	Action(block_cache* cache, cached_block* block)
173 		:
174 		fCache(cache),
175 		fBlockNumber(block->block_number),
176 		fIsDirty(block->is_dirty),
177 		fHasOriginal(block->original_data != NULL),
178 		fHasParent(block->parent_data != NULL),
179 		fTransactionID(-1),
180 		fPreviousID(-1)
181 	{
182 		if (block->transaction != NULL)
183 			fTransactionID = block->transaction->id;
184 		if (block->previous_transaction != NULL)
185 			fPreviousID = block->previous_transaction->id;
186 	}
187 
188 	virtual void AddDump(TraceOutput& out)
189 	{
190 		out.Print("block cache %p, %s %Ld, %c%c%c transaction %ld "
191 			"(previous id %ld)\n", fCache, _Action(), fBlockNumber,
192 			fIsDirty ? 'd' : '-', fHasOriginal ? 'o' : '-',
193 			fHasParent ? 'p' : '-', fTransactionID, fPreviousID);
194 	}
195 
196 	virtual const char* _Action() const = 0;
197 
198 private:
199 	block_cache*		fCache;
200 	uint64				fBlockNumber;
201 	bool				fIsDirty;
202 	bool				fHasOriginal;
203 	bool				fHasParent;
204 	int32				fTransactionID;
205 	int32				fPreviousID;
206 };
207 
208 class Get : public Action {
209 public:
210 	Get(block_cache* cache, cached_block* block)
211 		: Action(cache, block)
212 	{
213 		Initialized();
214 	}
215 
216 	virtual const char* _Action() const { return "get"; }
217 };
218 
219 class Put : public Action {
220 public:
221 	Put(block_cache* cache, cached_block* block)
222 		: Action(cache, block)
223 	{
224 		Initialized();
225 	}
226 
227 	virtual const char* _Action() const { return "put"; }
228 };
229 
230 class Read : public Action {
231 public:
232 	Read(block_cache* cache, cached_block* block)
233 		: Action(cache, block)
234 	{
235 		Initialized();
236 	}
237 
238 	virtual const char* _Action() const { return "read"; }
239 };
240 
241 class Write : public Action {
242 public:
243 	Write(block_cache* cache, cached_block* block)
244 		: Action(cache, block)
245 	{
246 		Initialized();
247 	}
248 
249 	virtual const char* _Action() const { return "write"; }
250 };
251 
252 class Flush : public Action {
253 public:
254 	Flush(block_cache* cache, cached_block* block, bool getUnused = false)
255 		: Action(cache, block),
256 		fGetUnused(getUnused)
257 	{
258 		Initialized();
259 	}
260 
261 	virtual const char* _Action() const
262 		{ return fGetUnused ? "get-unused" : "flush"; }
263 
264 private:
265 	bool	fGetUnused;
266 };
267 
268 class Error : public AbstractTraceEntry {
269 public:
270 	Error(block_cache* cache, uint64 blockNumber, const char* message,
271 			status_t status = B_OK)
272 		:
273 		fCache(cache),
274 		fBlockNumber(blockNumber),
275 		fMessage(message),
276 		fStatus(status)
277 	{
278 		Initialized();
279 	}
280 
281 	virtual void AddDump(TraceOutput& out)
282 	{
283 		out.Print("block cache %p, error %Ld, %s%s%s",
284 			fCache, fBlockNumber, fMessage, fStatus != B_OK ? ": " : "",
285 			fStatus != B_OK ? strerror(fStatus) : "");
286 	}
287 
288 private:
289 	block_cache*	fCache;
290 	uint64			fBlockNumber;
291 	const char*		fMessage;
292 	status_t		fStatus;
293 };
294 
295 }	// namespace BlockTracing
296 
297 #	define TB(x) new(std::nothrow) BlockTracing::x;
298 #else
299 #	define TB(x) ;
300 #endif
301 
302 #if BLOCK_CACHE_TRANSACTION_TRACING
303 namespace TransactionTracing {
304 
305 class Action : public AbstractTraceEntry {
306 	public:
307 		Action(const char* label, block_cache* cache,
308 				cache_transaction* transaction)
309 			:
310 			fCache(cache),
311 			fTransaction(transaction),
312 			fID(transaction->id),
313 			fSub(transaction->has_sub_transaction),
314 			fNumBlocks(transaction->num_blocks),
315 			fSubNumBlocks(transaction->sub_num_blocks)
316 		{
317 			strlcpy(fLabel, label, sizeof(fLabel));
318 			Initialized();
319 		}
320 
321 		virtual void AddDump(TraceOutput& out)
322 		{
323 			out.Print("block cache %p, %s transaction %p (id %ld)%s"
324 				", %ld/%ld blocks", fCache, fLabel, fTransaction, fID,
325 				fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
326 		}
327 
328 	private:
329 		char				fLabel[12];
330 		block_cache			*fCache;
331 		cache_transaction	*fTransaction;
332 		int32				fID;
333 		bool				fSub;
334 		int32				fNumBlocks;
335 		int32				fSubNumBlocks;
336 };
337 
338 class Detach : public AbstractTraceEntry {
339 	public:
340 		Detach(block_cache* cache, cache_transaction* transaction,
341 				cache_transaction* newTransaction)
342 			:
343 			fCache(cache),
344 			fTransaction(transaction),
345 			fID(transaction->id),
346 			fSub(transaction->has_sub_transaction),
347 			fNewTransaction(newTransaction),
348 			fNewID(newTransaction->id)
349 		{
350 			Initialized();
351 		}
352 
353 		virtual void AddDump(TraceOutput& out)
354 		{
355 			out.Print("block cache %p, detach transaction %p (id %ld)"
356 				"from transaction %p (id %ld)%s",
357 				fCache, fNewTransaction, fNewID, fTransaction, fID,
358 				fSub ? " sub" : "");
359 		}
360 
361 	private:
362 		block_cache			*fCache;
363 		cache_transaction	*fTransaction;
364 		int32				fID;
365 		bool				fSub;
366 		cache_transaction	*fNewTransaction;
367 		int32				fNewID;
368 };
369 
370 class Abort : public AbstractTraceEntry {
371 	public:
372 		Abort(block_cache* cache, cache_transaction* transaction)
373 			:
374 			fCache(cache),
375 			fTransaction(transaction),
376 			fID(transaction->id),
377 			fNumBlocks(0)
378 		{
379 			bool isSub = transaction->has_sub_transaction;
380 			fNumBlocks = isSub ? transaction->sub_num_blocks
381 				: transaction->num_blocks;
382 			fBlocks = (off_t*)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
383 			if (fBlocks != NULL) {
384 				cached_block* block = transaction->first_block;
385 				for (int32 i = 0; block != NULL && i < fNumBlocks;
386 						block = block->transaction_next) {
387 					fBlocks[i++] = block->block_number;
388 				}
389 			} else
390 				fNumBlocks = 0;
391 			Initialized();
392 		}
393 
394 		virtual void AddDump(TraceOutput& out)
395 		{
396 			out.Print("block cache %p, abort transaction "
397 				"%p (id %ld), blocks", fCache, fTransaction, fID);
398 			for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
399 				out.Print(" %Ld", fBlocks[i]);
400 		}
401 
402 	private:
403 		block_cache			*fCache;
404 		cache_transaction	*fTransaction;
405 		int32				fID;
406 		off_t				*fBlocks;
407 		int32				fNumBlocks;
408 };
409 
410 }	// namespace TransactionTracing
411 
412 #	define T(x) new(std::nothrow) TransactionTracing::x;
413 #else
414 #	define T(x) ;
415 #endif
416 
417 
418 static status_t write_cached_block(block_cache* cache, cached_block* block,
419 	bool deleteTransaction = true);
420 
421 
422 static DoublyLinkedList<block_cache> sCaches;
423 static mutex sCachesLock = MUTEX_INITIALIZER("block caches");
424 static sem_id sEventSemaphore;
425 static mutex sNotificationsLock = MUTEX_INITIALIZER("block cache notifications");
426 static thread_id sNotifierWriterThread;
427 static DoublyLinkedListLink<block_cache> sMarkCache;
428 	// TODO: this only works if the link is the first entry of block_cache
429 static object_cache* sBlockCache;
430 
431 
432 //	#pragma mark - notifications/listener
433 
434 
435 /*!	Checks wether or not this is an event that closes a transaction. */
436 static inline bool
437 is_closing_event(int32 event)
438 {
439 	return (event & (TRANSACTION_ABORTED | TRANSACTION_ENDED)) != 0;
440 }
441 
442 
443 static inline bool
444 is_written_event(int32 event)
445 {
446 	return (event & TRANSACTION_WRITTEN) != 0;
447 }
448 
449 
450 /*!	From the specified \a notification, it will remove the lowest pending
451 	event, and return that one in \a _event.
452 	If there is no pending event anymore, it will return \c false.
453 */
454 static bool
455 get_next_pending_event(cache_notification* notification, int32* _event)
456 {
457 	for (int32 eventMask = 1; eventMask <= TRANSACTION_IDLE; eventMask <<= 1) {
458 		int32 pending = atomic_and(&notification->events_pending,
459 			~eventMask);
460 
461 		bool more = (pending & ~eventMask) != 0;
462 
463 		if ((pending & eventMask) != 0) {
464 			*_event = eventMask;
465 			return more;
466 		}
467 	}
468 
469 	return false;
470 }
471 
472 
473 static void
474 flush_pending_notifications(block_cache* cache)
475 {
476 	ASSERT_LOCKED_MUTEX(&sCachesLock);
477 
478 	while (true) {
479 		MutexLocker locker(sNotificationsLock);
480 
481 		cache_notification* notification = cache->pending_notifications.Head();
482 		if (notification == NULL)
483 			return;
484 
485 		bool deleteAfterEvent = false;
486 		int32 event = -1;
487 		if (!get_next_pending_event(notification, &event)) {
488 			// remove the notification if this was the last pending event
489 			cache->pending_notifications.Remove(notification);
490 			deleteAfterEvent = notification->delete_after_event;
491 		}
492 
493 		if (event >= 0) {
494 			// Notify listener, we need to copy the notification, as it might
495 			// be removed when we unlock the list.
496 			cache_notification copy = *notification;
497 			locker.Unlock();
498 
499 			copy.hook(copy.transaction_id, event, copy.data);
500 
501 			locker.Lock();
502 		}
503 
504 		if (deleteAfterEvent)
505 			delete notification;
506 	}
507 }
508 
509 
510 /*!	Flushes all pending notifications by calling the appropriate hook
511 	functions.
512 	Must not be called with a cache lock held.
513 */
514 static void
515 flush_pending_notifications()
516 {
517 	MutexLocker _(sCachesLock);
518 
519 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
520 	while (iterator.HasNext()) {
521 		block_cache* cache = iterator.Next();
522 
523 		flush_pending_notifications(cache);
524 	}
525 }
526 
527 
528 /*!	Initializes the \a notification as specified. */
529 static void
530 set_notification(cache_transaction* transaction,
531 	cache_notification &notification, int32 events,
532 	transaction_notification_hook hook, void* data)
533 {
534 	notification.transaction_id = transaction != NULL ? transaction->id : -1;
535 	notification.events_pending = 0;
536 	notification.events = events;
537 	notification.hook = hook;
538 	notification.data = data;
539 	notification.delete_after_event = false;
540 }
541 
542 
543 /*!	Makes sure the notification is deleted. It either deletes it directly,
544 	when possible, or marks it for deletion if the notification is pending.
545 */
546 static void
547 delete_notification(cache_notification* notification)
548 {
549 	MutexLocker locker(sNotificationsLock);
550 
551 	if (notification->events_pending != 0)
552 		notification->delete_after_event = true;
553 	else
554 		delete notification;
555 }
556 
557 
558 /*!	Adds the notification to the pending notifications list, or, if it's
559 	already part of it, updates its events_pending field.
560 	Also marks the notification to be deleted if \a deleteNotification
561 	is \c true.
562 	Triggers the notifier thread to run.
563 */
564 static void
565 add_notification(block_cache* cache, cache_notification* notification,
566 	int32 event, bool deleteNotification)
567 {
568 	if (notification->hook == NULL)
569 		return;
570 
571 	int32 pending = atomic_or(&notification->events_pending, event);
572 	if (pending == 0) {
573 		// not yet part of the notification list
574 		MutexLocker locker(sNotificationsLock);
575 		if (deleteNotification)
576 			notification->delete_after_event = true;
577 		cache->pending_notifications.Add(notification);
578 	} else if (deleteNotification) {
579 		// we might need to delete it ourselves if we're late
580 		delete_notification(notification);
581 	}
582 
583 	release_sem_etc(sEventSemaphore, 1, B_DO_NOT_RESCHEDULE);
584 		// We're probably still holding some locks that makes rescheduling
585 		// not a good idea at this point.
586 }
587 
588 
589 /*!	Notifies all interested listeners of this transaction about the \a event.
590 	If \a event is a closing event (ie. TRANSACTION_ENDED, and
591 	TRANSACTION_ABORTED), all listeners except those listening to
592 	TRANSACTION_WRITTEN will be removed.
593 */
594 static void
595 notify_transaction_listeners(block_cache* cache, cache_transaction* transaction,
596 	int32 event)
597 {
598 	T(Action("notify", cache, transaction));
599 
600 	bool isClosing = is_closing_event(event);
601 	bool isWritten = is_written_event(event);
602 
603 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
604 	while (iterator.HasNext()) {
605 		cache_listener* listener = iterator.Next();
606 
607 		bool remove = (isClosing && !is_written_event(listener->events))
608 			|| (isWritten && is_written_event(listener->events));
609 		if (remove)
610 			iterator.Remove();
611 
612 		if ((listener->events & event) != 0)
613 			add_notification(cache, listener, event, remove);
614 		else if (remove)
615 			delete_notification(listener);
616 	}
617 }
618 
619 
620 /*!	Removes and deletes all listeners that are still monitoring this
621 	transaction.
622 */
623 static void
624 remove_transaction_listeners(block_cache* cache, cache_transaction* transaction)
625 {
626 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
627 	while (iterator.HasNext()) {
628 		cache_listener* listener = iterator.Next();
629 		iterator.Remove();
630 
631 		delete_notification(listener);
632 	}
633 }
634 
635 
636 static status_t
637 add_transaction_listener(block_cache* cache, cache_transaction* transaction,
638 	int32 events, transaction_notification_hook hookFunction, void* data)
639 {
640 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
641 	while (iterator.HasNext()) {
642 		cache_listener* listener = iterator.Next();
643 
644 		if (listener->data == data && listener->hook == hookFunction) {
645 			// this listener already exists, just update it
646 			listener->events |= events;
647 			return B_OK;
648 		}
649 	}
650 
651 	cache_listener* listener = new(std::nothrow) cache_listener;
652 	if (listener == NULL)
653 		return B_NO_MEMORY;
654 
655 	set_notification(transaction, *listener, events, hookFunction, data);
656 	transaction->listeners.Add(listener);
657 	return B_OK;
658 }
659 
660 
661 //	#pragma mark - private transaction
662 
663 
664 cache_transaction::cache_transaction()
665 {
666 	num_blocks = 0;
667 	main_num_blocks = 0;
668 	sub_num_blocks = 0;
669 	first_block = NULL;
670 	open = true;
671 	last_used = system_time();
672 }
673 
674 
675 static int
676 transaction_compare(void* _transaction, const void* _id)
677 {
678 	cache_transaction* transaction = (cache_transaction*)_transaction;
679 	const int32* id = (const int32*)_id;
680 
681 	return transaction->id - *id;
682 }
683 
684 
685 static uint32
686 transaction_hash(void* _transaction, const void* _id, uint32 range)
687 {
688 	cache_transaction* transaction = (cache_transaction*)_transaction;
689 	const int32* id = (const int32*)_id;
690 
691 	if (transaction != NULL)
692 		return transaction->id % range;
693 
694 	return (uint32)*id % range;
695 }
696 
697 
698 static void
699 delete_transaction(block_cache* cache, cache_transaction* transaction)
700 {
701 	if (cache->last_transaction == transaction)
702 		cache->last_transaction = NULL;
703 
704 	remove_transaction_listeners(cache, transaction);
705 	delete transaction;
706 }
707 
708 
709 static cache_transaction*
710 lookup_transaction(block_cache* cache, int32 id)
711 {
712 	return (cache_transaction*)hash_lookup(cache->transaction_hash, &id);
713 }
714 
715 
716 //	#pragma mark - cached_block
717 
718 
719 int
720 compare_blocks(const void* _blockA, const void* _blockB)
721 {
722 	cached_block* blockA = *(cached_block**)_blockA;
723 	cached_block* blockB = *(cached_block**)_blockB;
724 
725 	off_t diff = blockA->block_number - blockB->block_number;
726 	if (diff > 0)
727 		return 1;
728 
729 	return diff < 0 ? -1 : 0;
730 }
731 
732 
733 /*static*/ int
734 cached_block::Compare(void* _cacheEntry, const void* _block)
735 {
736 	cached_block* cacheEntry = (cached_block*)_cacheEntry;
737 	const off_t* block = (const off_t*)_block;
738 
739 	off_t diff = cacheEntry->block_number - *block;
740 	if (diff > 0)
741 		return 1;
742 
743 	return diff < 0 ? -1 : 0;
744 }
745 
746 
747 
748 /*static*/ uint32
749 cached_block::Hash(void* _cacheEntry, const void* _block, uint32 range)
750 {
751 	cached_block* cacheEntry = (cached_block*)_cacheEntry;
752 	const off_t* block = (const off_t*)_block;
753 
754 	if (cacheEntry != NULL)
755 		return cacheEntry->block_number % range;
756 
757 	return (uint64)*block % range;
758 }
759 
760 
761 //	#pragma mark - block_cache
762 
763 
764 block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
765 		bool readOnly)
766 	:
767 	hash(NULL),
768 	fd(_fd),
769 	max_blocks(numBlocks),
770 	block_size(blockSize),
771 	next_transaction_id(1),
772 	last_transaction(NULL),
773 	transaction_hash(NULL),
774 	buffer_cache(NULL),
775 	num_dirty_blocks(0),
776 	read_only(readOnly)
777 {
778 }
779 
780 
781 /*! Should be called with the cache's lock held. */
782 block_cache::~block_cache()
783 {
784 	unregister_low_resource_handler(&_LowMemoryHandler, this);
785 
786 	hash_uninit(transaction_hash);
787 	hash_uninit(hash);
788 
789 	delete_object_cache(buffer_cache);
790 
791 	mutex_destroy(&lock);
792 }
793 
794 
795 status_t
796 block_cache::Init()
797 {
798 	condition_variable.Init(this, "cache transaction sync");
799 	mutex_init(&lock, "block cache");
800 
801 	buffer_cache = create_object_cache_etc("block cache buffers", block_size,
802 		8, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
803 	if (buffer_cache == NULL)
804 		return B_NO_MEMORY;
805 
806 	cached_block dummyBlock;
807 	hash = hash_init(1024, offset_of_member(dummyBlock, next),
808 		&cached_block::Compare, &cached_block::Hash);
809 	if (hash == NULL)
810 		return B_NO_MEMORY;
811 
812 	cache_transaction dummyTransaction;
813 	transaction_hash = hash_init(16, offset_of_member(dummyTransaction, next),
814 		&transaction_compare, &::transaction_hash);
815 	if (transaction_hash == NULL)
816 		return B_NO_MEMORY;
817 
818 	return register_low_resource_handler(&_LowMemoryHandler, this,
819 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY, 0);
820 }
821 
822 
823 void
824 block_cache::Free(void* buffer)
825 {
826 	if (buffer != NULL)
827 		object_cache_free(buffer_cache, buffer);
828 }
829 
830 
831 void*
832 block_cache::Allocate()
833 {
834 	return object_cache_alloc(buffer_cache, 0);
835 }
836 
837 
838 void
839 block_cache::FreeBlock(cached_block* block)
840 {
841 	Free(block->current_data);
842 
843 	if (block->original_data != NULL || block->parent_data != NULL) {
844 		panic("block_cache::FreeBlock(): %Ld, original %p, parent %p\n",
845 			block->block_number, block->original_data, block->parent_data);
846 	}
847 
848 #if BLOCK_CACHE_DEBUG_CHANGED
849 	Free(block->compare);
850 #endif
851 
852 	object_cache_free(sBlockCache, block);
853 }
854 
855 
856 /*! Allocates a new block for \a blockNumber, ready for use */
857 cached_block*
858 block_cache::NewBlock(off_t blockNumber)
859 {
860 	cached_block* block = (cached_block*)object_cache_alloc(sBlockCache, 0);
861 	if (block == NULL) {
862 		TB(Error(this, blockNumber, "allocation failed"));
863 		dprintf("block allocation failed, unused list is %sempty.\n",
864 			unused_blocks.IsEmpty() ? "" : "not ");
865 
866 		// allocation failed, try to reuse an unused block
867 		block = _GetUnusedBlock();
868 		if (block == NULL) {
869 			TB(Error(this, blockNumber, "get unused failed"));
870 			FATAL(("could not allocate block!\n"));
871 			return NULL;
872 		}
873 	}
874 
875 	block->current_data = Allocate();
876 	if (block->current_data == NULL) {
877 		object_cache_free(sBlockCache, block);
878 		return NULL;
879 	}
880 
881 	block->block_number = blockNumber;
882 	block->ref_count = 0;
883 	block->accessed = 0;
884 	block->transaction_next = NULL;
885 	block->transaction = block->previous_transaction = NULL;
886 	block->original_data = NULL;
887 	block->parent_data = NULL;
888 	block->is_dirty = false;
889 	block->unused = false;
890 	block->discard = false;
891 #if BLOCK_CACHE_DEBUG_CHANGED
892 	block->compare = NULL;
893 #endif
894 
895 	return block;
896 }
897 
898 
899 void
900 block_cache::RemoveUnusedBlocks(int32 maxAccessed, int32 count)
901 {
902 	TRACE(("block_cache: remove up to %ld unused blocks\n", count));
903 
904 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
905 			cached_block* block = iterator.Next();) {
906 		if (maxAccessed < block->accessed)
907 			continue;
908 
909 		TB(Flush(this, block));
910 		TRACE(("  remove block %Ld, accessed %ld times\n",
911 			block->block_number, block->accessed));
912 
913 		// this can only happen if no transactions are used
914 		if (block->is_dirty && !block->discard)
915 			write_cached_block(this, block, false);
916 
917 		// remove block from lists
918 		iterator.Remove();
919 		RemoveBlock(block);
920 
921 		if (--count <= 0)
922 			break;
923 	}
924 }
925 
926 
927 void
928 block_cache::RemoveBlock(cached_block* block)
929 {
930 	hash_remove(hash, block);
931 	FreeBlock(block);
932 }
933 
934 
935 /*!	Discards the block from a transaction (this method must not be called
936 	for blocks not part of a transaction).
937 */
938 void
939 block_cache::DiscardBlock(cached_block* block)
940 {
941 	ASSERT(block->discard);
942 
943 	if (block->parent_data != NULL && block->parent_data != block->current_data)
944 		Free(block->parent_data);
945 
946 	block->parent_data = NULL;
947 
948 	if (block->original_data != NULL) {
949 		Free(block->original_data);
950 		block->original_data = NULL;
951 	}
952 
953 	RemoveBlock(block);
954 }
955 
956 
957 void
958 block_cache::_LowMemoryHandler(void* data, uint32 resources, int32 level)
959 {
960 	block_cache* cache = (block_cache*)data;
961 	MutexLocker locker(&cache->lock);
962 
963 	if (!locker.IsLocked()) {
964 		// If our block_cache were deleted, it could be that we had
965 		// been called before that deletion went through, therefore,
966 		// acquiring its lock might fail.
967 		return;
968 	}
969 
970 	TRACE(("block_cache: low memory handler called with level %ld\n", level));
971 
972 	// free some blocks according to the low memory state
973 	// (if there is enough memory left, we don't free any)
974 
975 	int32 free = 1;
976 	int32 accessed = 1;
977 	switch (low_resource_state(
978 			B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY)) {
979 		case B_NO_LOW_RESOURCE:
980 			return;
981 		case B_LOW_RESOURCE_NOTE:
982 			free = 50;
983 			accessed = 2;
984 			break;
985 		case B_LOW_RESOURCE_WARNING:
986 			free = 200;
987 			accessed = 10;
988 			break;
989 		case B_LOW_RESOURCE_CRITICAL:
990 			free = LONG_MAX;
991 			accessed = LONG_MAX;
992 			break;
993 	}
994 
995 	cache->RemoveUnusedBlocks(accessed, free);
996 }
997 
998 
999 cached_block*
1000 block_cache::_GetUnusedBlock()
1001 {
1002 	TRACE(("block_cache: get unused block\n"));
1003 
1004 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1005 			cached_block* block = iterator.Next();) {
1006 		TB(Flush(this, block, true));
1007 		// this can only happen if no transactions are used
1008 		if (block->is_dirty)
1009 			write_cached_block(this, block, false);
1010 
1011 		// remove block from lists
1012 		iterator.Remove();
1013 		hash_remove(hash, block);
1014 
1015 		// TODO: see if parent/compare data is handled correctly here!
1016 		if (block->parent_data != NULL
1017 			&& block->parent_data != block->original_data)
1018 			Free(block->parent_data);
1019 		if (block->original_data != NULL)
1020 			Free(block->original_data);
1021 
1022 		return block;
1023 	}
1024 
1025 	return NULL;
1026 }
1027 
1028 
1029 //	#pragma mark - private block functions
1030 
1031 
1032 /*!	Removes a reference from the specified \a block. If this was the last
1033 	reference, the block is moved into the unused list.
1034 	In low memory situations, it will also free some blocks from that list,
1035 	but not necessarily the \a block it just released.
1036 */
1037 static void
1038 put_cached_block(block_cache* cache, cached_block* block)
1039 {
1040 #if BLOCK_CACHE_DEBUG_CHANGED
1041 	if (!block->is_dirty && block->compare != NULL
1042 		&& memcmp(block->current_data, block->compare, cache->block_size)) {
1043 		dprintf("new block:\n");
1044 		dump_block((const char*)block->current_data, 256, "  ");
1045 		dprintf("unchanged block:\n");
1046 		dump_block((const char*)block->compare, 256, "  ");
1047 		write_cached_block(cache, block);
1048 		panic("block_cache: supposed to be clean block was changed!\n");
1049 
1050 		cache->Free(block->compare);
1051 		block->compare = NULL;
1052 	}
1053 #endif
1054 	TB(Put(cache, block));
1055 
1056 	if (block->ref_count < 1) {
1057 		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
1058 		return;
1059 	}
1060 
1061 	if (--block->ref_count == 0
1062 		&& block->transaction == NULL && block->previous_transaction == NULL) {
1063 		// This block is not used anymore, and not part of any transaction
1064 		if (block->discard) {
1065 			cache->RemoveBlock(block);
1066 		} else {
1067 			// put this block in the list of unused blocks
1068 			block->unused = true;
1069 			ASSERT(block->original_data == NULL
1070 				&& block->parent_data == NULL);
1071 			cache->unused_blocks.Add(block);
1072 		}
1073 	}
1074 
1075 	// Free some blocks according to the low memory state
1076 	// (if there is enough memory left, we don't free any)
1077 
1078 	int32 free = 1;
1079 	switch (low_resource_state(
1080 			B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY)) {
1081 		case B_NO_LOW_RESOURCE:
1082 			return;
1083 		case B_LOW_RESOURCE_NOTE:
1084 			free = 1;
1085 			break;
1086 		case B_LOW_RESOURCE_WARNING:
1087 			free = 5;
1088 			break;
1089 		case B_LOW_RESOURCE_CRITICAL:
1090 			free = 20;
1091 			break;
1092 	}
1093 
1094 	cache->RemoveUnusedBlocks(LONG_MAX, free);
1095 }
1096 
1097 
1098 static void
1099 put_cached_block(block_cache* cache, off_t blockNumber)
1100 {
1101 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1102 		panic("put_cached_block: invalid block number %lld (max %lld)",
1103 			blockNumber, cache->max_blocks - 1);
1104 	}
1105 
1106 	cached_block* block = (cached_block*)hash_lookup(cache->hash, &blockNumber);
1107 	if (block != NULL)
1108 		put_cached_block(cache, block);
1109 	else {
1110 		TB(Error(cache, blockNumber, "put unknown"));
1111 	}
1112 }
1113 
1114 
1115 /*!	Retrieves the block \a blockNumber from the hash table, if it's already
1116 	there, or reads it from the disk.
1117 
1118 	\param _allocated tells you wether or not a new block has been allocated
1119 		to satisfy your request.
1120 	\param readBlock if \c false, the block will not be read in case it was
1121 		not already in the cache. The block you retrieve may contain random
1122 		data.
1123 */
1124 static cached_block*
1125 get_cached_block(block_cache* cache, off_t blockNumber, bool* _allocated,
1126 	bool readBlock = true)
1127 {
1128 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1129 		panic("get_cached_block: invalid block number %lld (max %lld)",
1130 			blockNumber, cache->max_blocks - 1);
1131 		return NULL;
1132 	}
1133 
1134 	cached_block* block = (cached_block*)hash_lookup(cache->hash,
1135 		&blockNumber);
1136 	*_allocated = false;
1137 
1138 	if (block == NULL) {
1139 		// read block into cache
1140 		block = cache->NewBlock(blockNumber);
1141 		if (block == NULL)
1142 			return NULL;
1143 
1144 		hash_insert_grow(cache->hash, block);
1145 		*_allocated = true;
1146 	}
1147 
1148 	if (*_allocated && readBlock) {
1149 		int32 blockSize = cache->block_size;
1150 
1151 		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
1152 			block->current_data, blockSize);
1153 		if (bytesRead < blockSize) {
1154 			cache->RemoveBlock(block);
1155 			TB(Error(cache, blockNumber, "read failed", bytesRead));
1156 
1157 			FATAL(("could not read block %Ld: bytesRead: %ld, error: %s\n",
1158 				blockNumber, bytesRead, strerror(errno)));
1159 			return NULL;
1160 		}
1161 		TB(Read(cache, block));
1162 	}
1163 
1164 	if (block->unused) {
1165 		//TRACE(("remove block %Ld from unused\n", blockNumber));
1166 		block->unused = false;
1167 		cache->unused_blocks.Remove(block);
1168 	}
1169 
1170 	block->ref_count++;
1171 	block->accessed++;
1172 
1173 	return block;
1174 }
1175 
1176 
1177 /*!	Returns the writable block data for the requested blockNumber.
1178 	If \a cleared is true, the block is not read from disk; an empty block
1179 	is returned.
1180 
1181 	This is the only method to insert a block into a transaction. It makes
1182 	sure that the previous block contents are preserved in that case.
1183 */
1184 static void*
1185 get_writable_cached_block(block_cache* cache, off_t blockNumber, off_t base,
1186 	off_t length, int32 transactionID, bool cleared)
1187 {
1188 	TRACE(("get_writable_cached_block(blockNumber = %Ld, transaction = %ld)\n",
1189 		blockNumber, transactionID));
1190 
1191 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1192 		panic("get_writable_cached_block: invalid block number %lld (max %lld)",
1193 			blockNumber, cache->max_blocks - 1);
1194 	}
1195 
1196 	bool allocated;
1197 	cached_block* block = get_cached_block(cache, blockNumber, &allocated,
1198 		!cleared);
1199 	if (block == NULL)
1200 		return NULL;
1201 
1202 	block->discard = false;
1203 
1204 	// if there is no transaction support, we just return the current block
1205 	if (transactionID == -1) {
1206 		if (cleared)
1207 			memset(block->current_data, 0, cache->block_size);
1208 
1209 		if (!block->is_dirty) {
1210 			cache->num_dirty_blocks++;
1211 			block->is_dirty = true;
1212 				// mark the block as dirty
1213 		}
1214 
1215 		TB(Get(cache, block));
1216 		return block->current_data;
1217 	}
1218 
1219 	cache_transaction* transaction = block->transaction;
1220 
1221 	if (transaction != NULL && transaction->id != transactionID) {
1222 		// TODO: we have to wait here until the other transaction is done.
1223 		//	Maybe we should even panic, since we can't prevent any deadlocks.
1224 		panic("get_writable_cached_block(): asked to get busy writable block (transaction %ld)\n", block->transaction->id);
1225 		put_cached_block(cache, block);
1226 		return NULL;
1227 	}
1228 	if (transaction == NULL && transactionID != -1) {
1229 		// get new transaction
1230 		transaction = lookup_transaction(cache, transactionID);
1231 		if (transaction == NULL) {
1232 			panic("get_writable_cached_block(): invalid transaction %ld!\n",
1233 				transactionID);
1234 			put_cached_block(cache, block);
1235 			return NULL;
1236 		}
1237 		if (!transaction->open) {
1238 			panic("get_writable_cached_block(): transaction already done!\n");
1239 			put_cached_block(cache, block);
1240 			return NULL;
1241 		}
1242 
1243 		block->transaction = transaction;
1244 
1245 		// attach the block to the transaction block list
1246 		block->transaction_next = transaction->first_block;
1247 		transaction->first_block = block;
1248 		transaction->num_blocks++;
1249 	}
1250 	if (transaction != NULL)
1251 		transaction->last_used = system_time();
1252 
1253 	bool wasUnchanged = block->original_data == NULL
1254 		|| block->previous_transaction != NULL;
1255 
1256 	if (!(allocated && cleared) && block->original_data == NULL) {
1257 		// we already have data, so we need to preserve it
1258 		block->original_data = cache->Allocate();
1259 		if (block->original_data == NULL) {
1260 			TB(Error(cache, blockNumber, "allocate original failed"));
1261 			FATAL(("could not allocate original_data\n"));
1262 			put_cached_block(cache, block);
1263 			return NULL;
1264 		}
1265 
1266 		memcpy(block->original_data, block->current_data, cache->block_size);
1267 	}
1268 	if (block->parent_data == block->current_data) {
1269 		// remember any previous contents for the parent transaction
1270 		block->parent_data = cache->Allocate();
1271 		if (block->parent_data == NULL) {
1272 			// TODO: maybe we should just continue the current transaction in this case...
1273 			TB(Error(cache, blockNumber, "allocate parent failed"));
1274 			FATAL(("could not allocate parent\n"));
1275 			put_cached_block(cache, block);
1276 			return NULL;
1277 		}
1278 
1279 		memcpy(block->parent_data, block->current_data, cache->block_size);
1280 		transaction->sub_num_blocks++;
1281 	} else if (transaction != NULL && transaction->has_sub_transaction
1282 		&& block->parent_data == NULL && wasUnchanged)
1283 		transaction->sub_num_blocks++;
1284 
1285 	if (cleared)
1286 		memset(block->current_data, 0, cache->block_size);
1287 
1288 	block->is_dirty = true;
1289 	TB(Get(cache, block));
1290 
1291 	return block->current_data;
1292 }
1293 
1294 
1295 /*!	Writes the specified \a block back to disk. It will always only write back
1296 	the oldest change of the block if it is part of more than one transaction.
1297 	It will automatically send out TRANSACTION_WRITTEN notices, as well as
1298 	delete transactions when they are no longer used, and \a deleteTransaction
1299 	is \c true.
1300 */
1301 static status_t
1302 write_cached_block(block_cache* cache, cached_block* block,
1303 	bool deleteTransaction)
1304 {
1305 	cache_transaction* previous = block->previous_transaction;
1306 	int32 blockSize = cache->block_size;
1307 
1308 	void* data = previous && block->original_data
1309 		? block->original_data : block->current_data;
1310 		// we first need to write back changes from previous transactions
1311 
1312 	TRACE(("write_cached_block(block %Ld)\n", block->block_number));
1313 	TB(Write(cache, block));
1314 
1315 	ssize_t written = write_pos(cache->fd, block->block_number * blockSize,
1316 		data, blockSize);
1317 
1318 	if (written < blockSize) {
1319 		TB(Error(cache, block->block_number, "write failed", written));
1320 		FATAL(("could not write back block %Ld (%s)\n", block->block_number,
1321 			strerror(errno)));
1322 		return B_IO_ERROR;
1323 	}
1324 
1325 	if (cache->num_dirty_blocks > 0)
1326 		cache->num_dirty_blocks--;
1327 
1328 	if (data == block->current_data)
1329 		block->is_dirty = false;
1330 
1331 	if (previous != NULL) {
1332 		previous->blocks.Remove(block);
1333 		block->previous_transaction = NULL;
1334 
1335 		if (block->original_data != NULL && block->transaction == NULL) {
1336 			// This block is not part of a transaction, so it does not need
1337 			// its original pointer anymore.
1338 			cache->Free(block->original_data);
1339 			block->original_data = NULL;
1340 		}
1341 
1342 		// Has the previous transation been finished with that write?
1343 		if (--previous->num_blocks == 0) {
1344 			TRACE(("cache transaction %ld finished!\n", previous->id));
1345 			T(Action("written", cache, previous));
1346 
1347 			notify_transaction_listeners(cache, previous, TRANSACTION_WRITTEN);
1348 
1349 			if (deleteTransaction) {
1350 				hash_remove(cache->transaction_hash, previous);
1351 				delete_transaction(cache, previous);
1352 			}
1353 		}
1354 	}
1355 	if (block->transaction == NULL && block->ref_count == 0) {
1356 		// the block is no longer used
1357 		block->unused = true;
1358 		cache->unused_blocks.Add(block);
1359 	}
1360 
1361 	return B_OK;
1362 }
1363 
1364 
1365 #if DEBUG_BLOCK_CACHE
1366 
1367 static void
1368 dump_block(cached_block* block)
1369 {
1370 	kprintf("%08lx %9Ld %08lx %08lx %08lx %5ld %6ld %c%c%c%c%c %08lx %08lx\n",
1371 		(addr_t)block, block->block_number,
1372 		(addr_t)block->current_data, (addr_t)block->original_data,
1373 		(addr_t)block->parent_data, block->ref_count, block->accessed,
1374 		block->busy ? 'B' : '-', block->is_writing ? 'W' : '-',
1375 		block->is_dirty ? 'D' : '-', block->unused ? 'U' : '-',
1376 		block->discard ? 'D' : '-',
1377 		(addr_t)block->transaction,
1378 		(addr_t)block->previous_transaction);
1379 }
1380 
1381 
1382 static int
1383 dump_cache(int argc, char** argv)
1384 {
1385 	bool showTransactions = false;
1386 	bool showBlocks = false;
1387 	int32 i = 1;
1388 	while (argv[i] != NULL && argv[i][0] == '-') {
1389 		for (char* arg = &argv[i][1]; arg[0]; arg++) {
1390 			switch (arg[0]) {
1391 				case 'b':
1392 					showBlocks = true;
1393 					break;
1394 				case 't':
1395 					showTransactions = true;
1396 					break;
1397 				default:
1398 					print_debugger_command_usage(argv[0]);
1399 					return 0;
1400 			}
1401 		}
1402 		i++;
1403 	}
1404 
1405 	if (i >= argc) {
1406 		print_debugger_command_usage(argv[0]);
1407 		return 0;
1408 	}
1409 
1410 	block_cache* cache = (struct block_cache*)parse_expression(argv[i]);
1411 	if (cache == NULL) {
1412 		kprintf("invalid cache address\n");
1413 		return 0;
1414 	}
1415 
1416 	off_t blockNumber = -1;
1417 	if (i + 1 < argc) {
1418 		blockNumber = parse_expression(argv[i + 1]);
1419 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
1420 			&blockNumber);
1421 		if (block != NULL) {
1422 			kprintf("BLOCK %p\n", block);
1423 			kprintf(" current data:  %p\n", block->current_data);
1424 			kprintf(" original data: %p\n", block->original_data);
1425 			kprintf(" parent data:   %p\n", block->parent_data);
1426 			kprintf(" ref_count:     %ld\n", block->ref_count);
1427 			kprintf(" accessed:      %ld\n", block->accessed);
1428 			kprintf(" flags:        ");
1429 			if (block->is_writing)
1430 				kprintf(" is-writing");
1431 			if (block->is_dirty)
1432 				kprintf(" is-dirty");
1433 			if (block->unused)
1434 				kprintf(" unused");
1435 			if (block->discard)
1436 				kprintf(" discard");
1437 			kprintf("\n");
1438 			if (block->transaction != NULL) {
1439 				kprintf(" transaction:   %p (%ld)\n", block->transaction,
1440 					block->transaction->id);
1441 				if (block->transaction_next != NULL) {
1442 					kprintf(" next in transaction: %Ld\n",
1443 						block->transaction_next->block_number);
1444 				}
1445 			}
1446 			if (block->previous_transaction != NULL) {
1447 				kprintf(" previous transaction: %p (%ld)\n",
1448 					block->previous_transaction,
1449 					block->previous_transaction->id);
1450 			}
1451 
1452 			set_debug_variable("_current", (addr_t)block->current_data);
1453 			set_debug_variable("_original", (addr_t)block->original_data);
1454 			set_debug_variable("_parent", (addr_t)block->parent_data);
1455 		} else
1456 			kprintf("block %Ld not found\n", blockNumber);
1457 		return 0;
1458 	}
1459 
1460 	kprintf("BLOCK CACHE: %p\n", cache);
1461 
1462 	kprintf(" fd:         %d\n", cache->fd);
1463 	kprintf(" max_blocks: %Ld\n", cache->max_blocks);
1464 	kprintf(" block_size: %lu\n", cache->block_size);
1465 	kprintf(" next_transaction_id: %ld\n", cache->next_transaction_id);
1466 
1467 	if (!cache->pending_notifications.IsEmpty()) {
1468 		kprintf(" pending notifications:\n");
1469 
1470 		NotificationList::Iterator iterator
1471 			= cache->pending_notifications.GetIterator();
1472 		while (iterator.HasNext()) {
1473 			cache_notification* notification = iterator.Next();
1474 
1475 			kprintf("  %p %5lx %p - %p\n", notification,
1476 				notification->events_pending, notification->hook,
1477 				notification->data);
1478 		}
1479 	}
1480 
1481 	if (showTransactions) {
1482 		kprintf(" transactions:\n");
1483 		kprintf("address       id state  blocks  main   sub\n");
1484 
1485 		hash_iterator iterator;
1486 		hash_open(cache->transaction_hash, &iterator);
1487 
1488 		cache_transaction* transaction;
1489 		while ((transaction = (cache_transaction*)hash_next(
1490 				cache->transaction_hash, &iterator)) != NULL) {
1491 			kprintf("%p %5ld %-7s %5ld %5ld %5ld\n", transaction,
1492 				transaction->id, transaction->open ? "open" : "closed",
1493 				transaction->num_blocks, transaction->main_num_blocks,
1494 				transaction->sub_num_blocks);
1495 		}
1496 	}
1497 
1498 	if (showBlocks) {
1499 		kprintf(" blocks:\n");
1500 		kprintf("address  block no. current  original parent    refs access "
1501 			"flags transact prev. trans\n");
1502 	}
1503 
1504 	uint32 referenced = 0;
1505 	uint32 count = 0;
1506 	uint32 dirty = 0;
1507 	uint32 discarded = 0;
1508 	hash_iterator iterator;
1509 	hash_open(cache->hash, &iterator);
1510 	cached_block* block;
1511 	while ((block = (cached_block*)hash_next(cache->hash, &iterator)) != NULL) {
1512 		if (showBlocks)
1513 			dump_block(block);
1514 
1515 		if (block->is_dirty)
1516 			dirty++;
1517 		if (block->discard)
1518 			discarded++;
1519 		if (block->ref_count)
1520 			referenced++;
1521 		count++;
1522 	}
1523 
1524 	kprintf(" %ld blocks total, %ld dirty, %ld discarded, %ld referenced, %ld "
1525 		"in unused.\n", count, dirty, discarded, referenced,
1526 		cache->unused_blocks.Size());
1527 
1528 	hash_close(cache->hash, &iterator, false);
1529 	return 0;
1530 }
1531 
1532 
1533 static int
1534 dump_transaction(int argc, char** argv)
1535 {
1536 	bool showBlocks = false;
1537 	int i = 1;
1538 	if (argc > 1 && !strcmp(argv[1], "-b")) {
1539 		showBlocks = true;
1540 		i++;
1541 	}
1542 
1543 	if (argc - i < 1 || argc - i > 2) {
1544 		print_debugger_command_usage(argv[0]);
1545 		return 0;
1546 	}
1547 
1548 	cache_transaction* transaction = NULL;
1549 
1550 	if (argc - i == 1) {
1551 		transaction = (cache_transaction*)parse_expression(argv[i]);
1552 	} else {
1553 		block_cache* cache = (block_cache*)parse_expression(argv[i]);
1554 		int32 id = parse_expression(argv[i + 1]);
1555 		transaction = lookup_transaction(cache, id);
1556 		if (transaction == NULL) {
1557 			kprintf("No transaction with ID %ld found.\n", id);
1558 			return 0;
1559 		}
1560 	}
1561 
1562 	kprintf("TRANSACTION %p\n", transaction);
1563 
1564 	kprintf(" id:             %ld\n", transaction->id);
1565 	kprintf(" num block:      %ld\n", transaction->num_blocks);
1566 	kprintf(" main num block: %ld\n", transaction->main_num_blocks);
1567 	kprintf(" sub num block:  %ld\n", transaction->sub_num_blocks);
1568 	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
1569 	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
1570 	kprintf(" idle:           %Ld secs\n",
1571 		(system_time() - transaction->last_used) / 1000000);
1572 
1573 	kprintf(" listeners:\n");
1574 
1575 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
1576 	while (iterator.HasNext()) {
1577 		cache_listener* listener = iterator.Next();
1578 
1579 		kprintf("  %p %5lx %p - %p\n", listener, listener->events_pending,
1580 			listener->hook, listener->data);
1581 	}
1582 
1583 	if (!showBlocks)
1584 		return 0;
1585 
1586 	kprintf(" blocks:\n");
1587 	kprintf("address  block no. current  original parent    refs access "
1588 		"flags transact prev. trans\n");
1589 
1590 	cached_block* block = transaction->first_block;
1591 	while (block != NULL) {
1592 		dump_block(block);
1593 		block = block->transaction_next;
1594 	}
1595 
1596 	kprintf("--\n");
1597 
1598 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
1599 	while (blockIterator.HasNext()) {
1600 		block = blockIterator.Next();
1601 		dump_block(block);
1602 	}
1603 
1604 	return 0;
1605 }
1606 
1607 
1608 static int
1609 dump_caches(int argc, char** argv)
1610 {
1611 	kprintf("Block caches:\n");
1612 	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
1613 	while (i.HasNext()) {
1614 		block_cache* cache = i.Next();
1615 		if (cache == (block_cache*)&sMarkCache)
1616 			continue;
1617 
1618 		kprintf("  %p\n", cache);
1619 	}
1620 
1621 	return 0;
1622 }
1623 
1624 #endif	// DEBUG_BLOCK_CACHE
1625 
1626 
1627 /*!	Traverses through the block_cache list, and returns one cache after the
1628 	other. The cache returned is automatically locked when you get it, and
1629 	unlocked with the next call to this function. Ignores caches that are in
1630 	deletion state.
1631 	Returns \c NULL when the end of the list is reached.
1632 */
1633 static block_cache*
1634 get_next_locked_block_cache(block_cache* last)
1635 {
1636 	MutexLocker _(sCachesLock);
1637 
1638 	block_cache* cache;
1639 	if (last != NULL) {
1640 		mutex_unlock(&last->lock);
1641 
1642 		cache = sCaches.GetNext((block_cache*)&sMarkCache);
1643 		sCaches.Remove((block_cache*)&sMarkCache);
1644 	} else
1645 		cache = sCaches.Head();
1646 
1647 	if (cache != NULL) {
1648 		mutex_lock(&cache->lock);
1649 		sCaches.Insert(sCaches.GetNext(cache), (block_cache*)&sMarkCache);
1650 	}
1651 
1652 	return cache;
1653 }
1654 
1655 
1656 /*!	Background thread that continuously checks for pending notifications of
1657 	all caches.
1658 	Every two seconds, it will also write back up to 64 blocks per cache.
1659 */
1660 static status_t
1661 block_notifier_and_writer(void* /*data*/)
1662 {
1663 	const bigtime_t kTimeout = 2000000LL;
1664 	bigtime_t timeout = kTimeout;
1665 
1666 	while (true) {
1667 		bigtime_t start = system_time();
1668 
1669 		status_t status = acquire_sem_etc(sEventSemaphore, 1,
1670 			B_RELATIVE_TIMEOUT, timeout);
1671 		if (status == B_OK) {
1672 			flush_pending_notifications();
1673 			timeout -= system_time() - start;
1674 			continue;
1675 		}
1676 
1677 		// write 64 blocks of each block_cache every two seconds
1678 		// TODO: change this once we have an I/O scheduler
1679 		timeout = kTimeout;
1680 
1681 		block_cache* cache = NULL;
1682 		while ((cache = get_next_locked_block_cache(cache)) != NULL) {
1683 			const uint32 kMaxCount = 64;
1684 			cached_block* blocks[kMaxCount];
1685 			uint32 count = 0;
1686 
1687 			if (cache->num_dirty_blocks) {
1688 				// This cache is not using transactions, we'll scan the blocks
1689 				// directly
1690 				hash_iterator iterator;
1691 				hash_open(cache->hash, &iterator);
1692 
1693 				cached_block* block;
1694 				while (count < kMaxCount
1695 					&& (block = (cached_block*)hash_next(cache->hash,
1696 							&iterator)) != NULL) {
1697 					if (block->is_dirty)
1698 						blocks[count++] = block;
1699 				}
1700 
1701 				hash_close(cache->hash, &iterator, false);
1702 			} else {
1703 				hash_iterator iterator;
1704 				hash_open(cache->transaction_hash, &iterator);
1705 
1706 				cache_transaction* transaction;
1707 				while ((transaction = (cache_transaction*)hash_next(
1708 						cache->transaction_hash, &iterator)) != NULL
1709 					&& count < kMaxCount) {
1710 					if (transaction->open) {
1711 						if (system_time() > transaction->last_used
1712 								+ kTransactionIdleTime) {
1713 							// Transaction is open but idle
1714 							notify_transaction_listeners(cache, transaction,
1715 								TRANSACTION_IDLE);
1716 						}
1717 						continue;
1718 					}
1719 
1720 					// sort blocks to speed up writing them back
1721 					// TODO: ideally, this should be handled by the I/O scheduler
1722 					block_list::Iterator iterator
1723 						= transaction->blocks.GetIterator();
1724 
1725 					for (; count < kMaxCount && iterator.HasNext(); count++) {
1726 						blocks[count] = iterator.Next();
1727 					}
1728 				}
1729 
1730 				hash_close(cache->transaction_hash, &iterator, false);
1731 			}
1732 
1733 			qsort(blocks, count, sizeof(void*), &compare_blocks);
1734 
1735 			for (uint32 i = 0; i < count; i++) {
1736 				if (write_cached_block(cache, blocks[i], true) != B_OK)
1737 					break;
1738 			}
1739 		}
1740 	}
1741 }
1742 
1743 
1744 /*!	Notify function for wait_for_notifications(). */
1745 static void
1746 notify_sync(int32 transactionID, int32 event, void* _cache)
1747 {
1748 	block_cache* cache = (block_cache*)_cache;
1749 
1750 	cache->condition_variable.NotifyOne();
1751 }
1752 
1753 
1754 /*!	Must be called with the sCachesLock held. */
1755 static bool
1756 is_valid_cache(block_cache* cache)
1757 {
1758 	ASSERT_LOCKED_MUTEX(&sCachesLock);
1759 
1760 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
1761 	while (iterator.HasNext()) {
1762 		if (cache == iterator.Next())
1763 			return true;
1764 	}
1765 
1766 	return false;
1767 }
1768 
1769 
1770 /*!	Waits until all pending notifications are carried out.
1771 	Safe to be called from the block writer/notifier thread.
1772 	You must not hold the \a cache lock when calling this function.
1773 */
1774 static void
1775 wait_for_notifications(block_cache* cache)
1776 {
1777 	MutexLocker locker(sCachesLock);
1778 
1779 	if (find_thread(NULL) == sNotifierWriterThread) {
1780 		// We're the notifier thread, don't wait, but flush all pending
1781 		// notifications directly.
1782 		if (is_valid_cache(cache))
1783 			flush_pending_notifications(cache);
1784 		return;
1785 	}
1786 
1787 	// add sync notification
1788 	cache_notification notification;
1789 	set_notification(NULL, notification, TRANSACTION_WRITTEN, notify_sync,
1790 		cache);
1791 
1792 	ConditionVariableEntry entry;
1793 	cache->condition_variable.Add(&entry);
1794 
1795 	add_notification(cache, &notification, TRANSACTION_WRITTEN, false);
1796 	locker.Unlock();
1797 
1798 	// wait for notification hook to be called
1799 	entry.Wait();
1800 
1801 	ASSERT(notification.GetDoublyLinkedListLink()->next == NULL
1802 		&& notification.GetDoublyLinkedListLink()->previous == NULL
1803 		&& cache->pending_notifications.Head() != &notification);
1804 }
1805 
1806 
1807 status_t
1808 block_cache_init(void)
1809 {
1810 	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
1811 		8, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1812 	if (sBlockCache == NULL)
1813 		return B_NO_MEMORY;
1814 
1815 	new (&sCaches) DoublyLinkedList<block_cache>;
1816 		// manually call constructor
1817 
1818 	sEventSemaphore = create_sem(0, "block cache event");
1819 	if (sEventSemaphore < B_OK)
1820 		return sEventSemaphore;
1821 
1822 	sNotifierWriterThread = spawn_kernel_thread(&block_notifier_and_writer,
1823 		"block notifier/writer", B_LOW_PRIORITY, NULL);
1824 	if (sNotifierWriterThread >= B_OK)
1825 		send_signal_etc(sNotifierWriterThread, SIGCONT, B_DO_NOT_RESCHEDULE);
1826 
1827 #if DEBUG_BLOCK_CACHE
1828 	add_debugger_command_etc("block_caches", &dump_caches,
1829 		"dumps all block caches", "\n", 0);
1830 	add_debugger_command_etc("block_cache", &dump_cache,
1831 		"dumps a specific block cache",
1832 		"[-bt] <cache-address> [block-number]\n"
1833 		"  -t lists the transactions\n"
1834 		"  -b lists all blocks\n", 0);
1835 	add_debugger_command_etc("transaction", &dump_transaction,
1836 		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
1837 		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
1838 		"  -b lists all blocks that are part of this transaction\n", 0);
1839 #endif
1840 
1841 	return B_OK;
1842 }
1843 
1844 
1845 size_t
1846 block_cache_used_memory()
1847 {
1848 	size_t usedMemory = 0;
1849 
1850 	MutexLocker _(sCachesLock);
1851 
1852 	DoublyLinkedList<block_cache>::Iterator it = sCaches.GetIterator();
1853 	while (block_cache* cache = it.Next()) {
1854 		if (cache == (block_cache*)&sMarkCache)
1855 			continue;
1856 
1857 		size_t cacheUsedMemory;
1858 		object_cache_get_usage(cache->buffer_cache, &cacheUsedMemory);
1859 		usedMemory += cacheUsedMemory;
1860 	}
1861 
1862 	return usedMemory;
1863 }
1864 
1865 
1866 //	#pragma mark - public transaction API
1867 
1868 
1869 int32
1870 cache_start_transaction(void* _cache)
1871 {
1872 	block_cache* cache = (block_cache*)_cache;
1873 	MutexLocker locker(&cache->lock);
1874 
1875 	if (cache->last_transaction && cache->last_transaction->open) {
1876 		panic("last transaction (%ld) still open!\n",
1877 			cache->last_transaction->id);
1878 	}
1879 
1880 	cache_transaction* transaction = new(std::nothrow) cache_transaction;
1881 	if (transaction == NULL)
1882 		return B_NO_MEMORY;
1883 
1884 	transaction->id = atomic_add(&cache->next_transaction_id, 1);
1885 	cache->last_transaction = transaction;
1886 
1887 	TRACE(("cache_start_transaction(): id %ld started\n", transaction->id));
1888 	T(Action("start", cache, transaction));
1889 
1890 	hash_insert_grow(cache->transaction_hash, transaction);
1891 
1892 	return transaction->id;
1893 }
1894 
1895 
1896 status_t
1897 cache_sync_transaction(void* _cache, int32 id)
1898 {
1899 	block_cache* cache = (block_cache*)_cache;
1900 	MutexLocker locker(&cache->lock);
1901 	status_t status = B_ENTRY_NOT_FOUND;
1902 
1903 	TRACE(("cache_sync_transaction(id %ld)\n", id));
1904 
1905 	hash_iterator iterator;
1906 	hash_open(cache->transaction_hash, &iterator);
1907 
1908 	cache_transaction* transaction;
1909 	while ((transaction = (cache_transaction*)hash_next(
1910 			cache->transaction_hash, &iterator)) != NULL) {
1911 		// close all earlier transactions which haven't been closed yet
1912 
1913 		if (transaction->id <= id && !transaction->open) {
1914 			// write back all of their remaining dirty blocks
1915 			T(Action("sync", cache, transaction));
1916 			while (transaction->num_blocks > 0) {
1917 				// sort blocks to speed up writing them back
1918 				// TODO: this should be handled by the I/O scheduler
1919 				block_list::Iterator iterator
1920 					= transaction->blocks.GetIterator();
1921 				uint32 maxCount = transaction->num_blocks;
1922 				cached_block* buffer[16];
1923 				cached_block** blocks = (cached_block**)malloc(maxCount
1924 					* sizeof(void*));
1925 				if (blocks == NULL) {
1926 					maxCount = 16;
1927 					blocks = buffer;
1928 				}
1929 
1930 				uint32 count = 0;
1931 				for (; count < maxCount && iterator.HasNext(); count++) {
1932 					blocks[count] = iterator.Next();
1933 				}
1934 				qsort(blocks, count, sizeof(void*), &compare_blocks);
1935 
1936 				for (uint32 i = 0; i < count; i++) {
1937 					status = write_cached_block(cache, blocks[i], false);
1938 					if (status != B_OK)
1939 						break;
1940 				}
1941 
1942 				if (blocks != buffer)
1943 					free(blocks);
1944 
1945 				if (status != B_OK)
1946 					return status;
1947 			}
1948 
1949 			hash_remove_current(cache->transaction_hash, &iterator);
1950 			delete_transaction(cache, transaction);
1951 		}
1952 	}
1953 
1954 	hash_close(cache->transaction_hash, &iterator, false);
1955 	locker.Unlock();
1956 
1957 	wait_for_notifications(cache);
1958 		// make sure that all pending TRANSACTION_WRITTEN notifications
1959 		// are handled after we return
1960 	return B_OK;
1961 }
1962 
1963 
1964 status_t
1965 cache_end_transaction(void* _cache, int32 id,
1966 	transaction_notification_hook hook, void* data)
1967 {
1968 	block_cache* cache = (block_cache*)_cache;
1969 	MutexLocker locker(&cache->lock);
1970 
1971 	TRACE(("cache_end_transaction(id = %ld)\n", id));
1972 
1973 	cache_transaction* transaction = lookup_transaction(cache, id);
1974 	if (transaction == NULL) {
1975 		panic("cache_end_transaction(): invalid transaction ID\n");
1976 		return B_BAD_VALUE;
1977 	}
1978 
1979 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
1980 
1981 	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
1982 			data) != B_OK) {
1983 		return B_NO_MEMORY;
1984 	}
1985 
1986 	T(Action("end", cache, transaction));
1987 
1988 	// iterate through all blocks and free the unchanged original contents
1989 
1990 	cached_block* block = transaction->first_block;
1991 	cached_block* next;
1992 	for (; block != NULL; block = next) {
1993 		next = block->transaction_next;
1994 
1995 		if (block->previous_transaction != NULL) {
1996 			// need to write back pending changes
1997 			write_cached_block(cache, block);
1998 		}
1999 		if (block->discard) {
2000 			// This block has been discarded in the transaction
2001 			cache->DiscardBlock(block);
2002 			transaction->num_blocks--;
2003 			continue;
2004 		}
2005 
2006 		if (block->original_data != NULL) {
2007 			cache->Free(block->original_data);
2008 			block->original_data = NULL;
2009 		}
2010 		if (transaction->has_sub_transaction) {
2011 			if (block->parent_data != block->current_data)
2012 				cache->Free(block->parent_data);
2013 			block->parent_data = NULL;
2014 		}
2015 
2016 		// move the block to the previous transaction list
2017 		transaction->blocks.Add(block);
2018 
2019 		block->previous_transaction = transaction;
2020 		block->transaction_next = NULL;
2021 		block->transaction = NULL;
2022 	}
2023 
2024 	transaction->open = false;
2025 	return B_OK;
2026 }
2027 
2028 
2029 status_t
2030 cache_abort_transaction(void* _cache, int32 id)
2031 {
2032 	block_cache* cache = (block_cache*)_cache;
2033 	MutexLocker locker(&cache->lock);
2034 
2035 	TRACE(("cache_abort_transaction(id = %ld)\n", id));
2036 
2037 	cache_transaction* transaction = lookup_transaction(cache, id);
2038 	if (transaction == NULL) {
2039 		panic("cache_abort_transaction(): invalid transaction ID\n");
2040 		return B_BAD_VALUE;
2041 	}
2042 
2043 	T(Abort(cache, transaction));
2044 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
2045 
2046 	// iterate through all blocks and restore their original contents
2047 
2048 	cached_block* block = transaction->first_block;
2049 	cached_block* next;
2050 	for (; block != NULL; block = next) {
2051 		next = block->transaction_next;
2052 
2053 		if (block->original_data != NULL) {
2054 			TRACE(("cache_abort_transaction(id = %ld): restored contents of block %Ld\n",
2055 				transaction->id, block->block_number));
2056 			memcpy(block->current_data, block->original_data, cache->block_size);
2057 			cache->Free(block->original_data);
2058 			block->original_data = NULL;
2059 		}
2060 		if (transaction->has_sub_transaction) {
2061 			if (block->parent_data != block->current_data)
2062 				cache->Free(block->parent_data);
2063 			block->parent_data = NULL;
2064 		}
2065 
2066 		block->transaction_next = NULL;
2067 		block->transaction = NULL;
2068 		block->discard = false;
2069 	}
2070 
2071 	hash_remove(cache->transaction_hash, transaction);
2072 	delete_transaction(cache, transaction);
2073 	return B_OK;
2074 }
2075 
2076 
2077 /*!	Acknowledges the current parent transaction, and starts a new transaction
2078 	from its sub transaction.
2079 	The new transaction also gets a new transaction ID.
2080 */
2081 int32
2082 cache_detach_sub_transaction(void* _cache, int32 id,
2083 	transaction_notification_hook hook, void* data)
2084 {
2085 	block_cache* cache = (block_cache*)_cache;
2086 	MutexLocker locker(&cache->lock);
2087 
2088 	TRACE(("cache_detach_sub_transaction(id = %ld)\n", id));
2089 
2090 	cache_transaction* transaction = lookup_transaction(cache, id);
2091 	if (transaction == NULL) {
2092 		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
2093 		return B_BAD_VALUE;
2094 	}
2095 	if (!transaction->has_sub_transaction)
2096 		return B_BAD_VALUE;
2097 
2098 	// create a new transaction for the sub transaction
2099 	cache_transaction* newTransaction = new(std::nothrow) cache_transaction;
2100 	if (transaction == NULL)
2101 		return B_NO_MEMORY;
2102 
2103 	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
2104 	T(Detach(cache, transaction, newTransaction));
2105 
2106 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2107 
2108 	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
2109 			data) != B_OK) {
2110 		delete newTransaction;
2111 		return B_NO_MEMORY;
2112 	}
2113 
2114 	// iterate through all blocks and free the unchanged original contents
2115 
2116 	cached_block* block = transaction->first_block;
2117 	cached_block* last = NULL;
2118 	cached_block* next;
2119 	for (; block != NULL; block = next) {
2120 		next = block->transaction_next;
2121 
2122 		if (block->previous_transaction != NULL) {
2123 			// need to write back pending changes
2124 			write_cached_block(cache, block);
2125 		}
2126 		if (block->discard) {
2127 			cache->DiscardBlock(block);
2128 			transaction->main_num_blocks--;
2129 			continue;
2130 		}
2131 
2132 		if (block->original_data != NULL && block->parent_data != NULL) {
2133 			// free the original data if the parent data of the transaction
2134 			// will be made current - but keep them otherwise
2135 			cache->Free(block->original_data);
2136 			block->original_data = NULL;
2137 		}
2138 		if (block->parent_data == NULL
2139 			|| block->parent_data != block->current_data) {
2140 			// we need to move this block over to the new transaction
2141 			block->original_data = block->parent_data;
2142 			if (last == NULL)
2143 				newTransaction->first_block = block;
2144 			else
2145 				last->transaction_next = block;
2146 
2147 			block->transaction = newTransaction;
2148 			last = block;
2149 		} else
2150 			block->transaction = NULL;
2151 
2152 		if (block->parent_data != NULL) {
2153 			// move the block to the previous transaction list
2154 			transaction->blocks.Add(block);
2155 			block->previous_transaction = transaction;
2156 			block->parent_data = NULL;
2157 		}
2158 
2159 		block->transaction_next = NULL;
2160 	}
2161 
2162 	newTransaction->num_blocks = transaction->sub_num_blocks;
2163 
2164 	transaction->open = false;
2165 	transaction->has_sub_transaction = false;
2166 	transaction->num_blocks = transaction->main_num_blocks;
2167 	transaction->sub_num_blocks = 0;
2168 
2169 	hash_insert_grow(cache->transaction_hash, newTransaction);
2170 	cache->last_transaction = newTransaction;
2171 
2172 	return newTransaction->id;
2173 }
2174 
2175 
2176 status_t
2177 cache_abort_sub_transaction(void* _cache, int32 id)
2178 {
2179 	block_cache* cache = (block_cache*)_cache;
2180 	MutexLocker locker(&cache->lock);
2181 
2182 	TRACE(("cache_abort_sub_transaction(id = %ld)\n", id));
2183 
2184 	cache_transaction* transaction = lookup_transaction(cache, id);
2185 	if (transaction == NULL) {
2186 		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
2187 		return B_BAD_VALUE;
2188 	}
2189 	if (!transaction->has_sub_transaction)
2190 		return B_BAD_VALUE;
2191 
2192 	T(Abort(cache, transaction));
2193 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
2194 
2195 	// revert all changes back to the version of the parent
2196 
2197 	cached_block* block = transaction->first_block;
2198 	cached_block* next;
2199 	for (; block != NULL; block = next) {
2200 		next = block->transaction_next;
2201 
2202 		if (block->parent_data == NULL) {
2203 			if (block->original_data != NULL) {
2204 				// the parent transaction didn't change the block, but the sub
2205 				// transaction did - we need to revert from the original data
2206 				memcpy(block->current_data, block->original_data,
2207 					cache->block_size);
2208 			}
2209 		} else if (block->parent_data != block->current_data) {
2210 			// the block has been changed and must be restored
2211 			TRACE(("cache_abort_sub_transaction(id = %ld): restored contents of block %Ld\n",
2212 				transaction->id, block->block_number));
2213 			memcpy(block->current_data, block->parent_data, cache->block_size);
2214 			cache->Free(block->parent_data);
2215 		}
2216 
2217 		block->parent_data = NULL;
2218 		block->discard = false;
2219 	}
2220 
2221 	// all subsequent changes will go into the main transaction
2222 	transaction->has_sub_transaction = false;
2223 	transaction->sub_num_blocks = 0;
2224 
2225 	return B_OK;
2226 }
2227 
2228 
2229 status_t
2230 cache_start_sub_transaction(void* _cache, int32 id)
2231 {
2232 	block_cache* cache = (block_cache*)_cache;
2233 	MutexLocker locker(&cache->lock);
2234 
2235 	TRACE(("cache_start_sub_transaction(id = %ld)\n", id));
2236 
2237 	cache_transaction* transaction = lookup_transaction(cache, id);
2238 	if (transaction == NULL) {
2239 		panic("cache_start_sub_transaction(): invalid transaction ID %ld\n", id);
2240 		return B_BAD_VALUE;
2241 	}
2242 
2243 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2244 
2245 	// move all changed blocks up to the parent
2246 
2247 	cached_block* block = transaction->first_block;
2248 	cached_block* last = NULL;
2249 	cached_block* next;
2250 	for (; block != NULL; block = next) {
2251 		next = block->transaction_next;
2252 
2253 		if (block->discard) {
2254 			// This block has been discarded in the parent transaction
2255 			if (last != NULL)
2256 				last->transaction_next = next;
2257 			else
2258 				transaction->first_block = next;
2259 
2260 			cache->DiscardBlock(block);
2261 			transaction->num_blocks--;
2262 			continue;
2263 		}
2264 		if (transaction->has_sub_transaction
2265 			&& block->parent_data != NULL
2266 			&& block->parent_data != block->current_data) {
2267 			// there already is an older sub transaction - we acknowledge
2268 			// its changes and move its blocks up to the parent
2269 			cache->Free(block->parent_data);
2270 		}
2271 
2272 		// we "allocate" the parent data lazily, that means, we don't copy
2273 		// the data (and allocate memory for it) until we need to
2274 		block->parent_data = block->current_data;
2275 		last = block;
2276 	}
2277 
2278 	// all subsequent changes will go into the sub transaction
2279 	transaction->has_sub_transaction = true;
2280 	transaction->main_num_blocks = transaction->num_blocks;
2281 	transaction->sub_num_blocks = 0;
2282 	T(Action("start-sub", cache, transaction));
2283 
2284 	return B_OK;
2285 }
2286 
2287 
2288 /*!	Adds a transaction listener that gets notified when the transaction
2289 	is ended, aborted, written, or idle as specified by \a events.
2290 	The listener gets automatically removed when the transaction ends.
2291 */
2292 status_t
2293 cache_add_transaction_listener(void* _cache, int32 id, int32 events,
2294 	transaction_notification_hook hook, void* data)
2295 {
2296 	block_cache* cache = (block_cache*)_cache;
2297 
2298 	MutexLocker locker(&cache->lock);
2299 
2300 	cache_transaction* transaction = lookup_transaction(cache, id);
2301 	if (transaction == NULL)
2302 		return B_BAD_VALUE;
2303 
2304 	return add_transaction_listener(cache, transaction, events, hook, data);
2305 }
2306 
2307 
2308 status_t
2309 cache_remove_transaction_listener(void* _cache, int32 id,
2310 	transaction_notification_hook hookFunction, void* data)
2311 {
2312 	block_cache* cache = (block_cache*)_cache;
2313 
2314 	MutexLocker locker(&cache->lock);
2315 
2316 	cache_transaction* transaction = lookup_transaction(cache, id);
2317 	if (transaction == NULL)
2318 		return B_BAD_VALUE;
2319 
2320 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
2321 	while (iterator.HasNext()) {
2322 		cache_listener* listener = iterator.Next();
2323 		if (listener->data == data && listener->hook == hookFunction) {
2324 			iterator.Remove();
2325 
2326 			if (listener->events_pending != 0) {
2327 				MutexLocker _(sNotificationsLock);
2328 				if (listener->events_pending != 0)
2329 					cache->pending_notifications.Remove(listener);
2330 			}
2331 			delete listener;
2332 			return B_OK;
2333 		}
2334 	}
2335 
2336 	return B_ENTRY_NOT_FOUND;
2337 }
2338 
2339 
2340 status_t
2341 cache_next_block_in_transaction(void* _cache, int32 id, bool mainOnly,
2342 	long* _cookie, off_t* _blockNumber, void** _data, void** _unchangedData)
2343 {
2344 	cached_block* block = (cached_block*)*_cookie;
2345 	block_cache* cache = (block_cache*)_cache;
2346 
2347 	MutexLocker locker(&cache->lock);
2348 
2349 	cache_transaction* transaction = lookup_transaction(cache, id);
2350 	if (transaction == NULL || !transaction->open)
2351 		return B_BAD_VALUE;
2352 
2353 	if (block == NULL)
2354 		block = transaction->first_block;
2355 	else
2356 		block = block->transaction_next;
2357 
2358 	if (transaction->has_sub_transaction) {
2359 		if (mainOnly) {
2360 			// find next block that the parent changed
2361 			while (block != NULL && block->parent_data == NULL)
2362 				block = block->transaction_next;
2363 		} else {
2364 			// find next non-discarded block
2365 			while (block != NULL && block->discard)
2366 				block = block->transaction_next;
2367 		}
2368 	}
2369 
2370 	if (block == NULL)
2371 		return B_ENTRY_NOT_FOUND;
2372 
2373 	if (_blockNumber)
2374 		*_blockNumber = block->block_number;
2375 	if (_data)
2376 		*_data = mainOnly ? block->parent_data : block->current_data;
2377 	if (_unchangedData)
2378 		*_unchangedData = block->original_data;
2379 
2380 	*_cookie = (addr_t)block;
2381 	return B_OK;
2382 }
2383 
2384 
2385 int32
2386 cache_blocks_in_transaction(void* _cache, int32 id)
2387 {
2388 	block_cache* cache = (block_cache*)_cache;
2389 	MutexLocker locker(&cache->lock);
2390 
2391 	cache_transaction* transaction = lookup_transaction(cache, id);
2392 	if (transaction == NULL)
2393 		return B_BAD_VALUE;
2394 
2395 	return transaction->num_blocks;
2396 }
2397 
2398 
2399 int32
2400 cache_blocks_in_main_transaction(void* _cache, int32 id)
2401 {
2402 	block_cache* cache = (block_cache*)_cache;
2403 	MutexLocker locker(&cache->lock);
2404 
2405 	cache_transaction* transaction = lookup_transaction(cache, id);
2406 	if (transaction == NULL)
2407 		return B_BAD_VALUE;
2408 
2409 	return transaction->main_num_blocks;
2410 }
2411 
2412 
2413 int32
2414 cache_blocks_in_sub_transaction(void* _cache, int32 id)
2415 {
2416 	block_cache* cache = (block_cache*)_cache;
2417 	MutexLocker locker(&cache->lock);
2418 
2419 	cache_transaction* transaction = lookup_transaction(cache, id);
2420 	if (transaction == NULL)
2421 		return B_BAD_VALUE;
2422 
2423 	return transaction->sub_num_blocks;
2424 }
2425 
2426 
2427 //	#pragma mark - public block cache API
2428 
2429 
2430 void
2431 block_cache_delete(void* _cache, bool allowWrites)
2432 {
2433 	block_cache* cache = (block_cache*)_cache;
2434 
2435 	if (allowWrites)
2436 		block_cache_sync(cache);
2437 
2438 	mutex_lock(&sCachesLock);
2439 	sCaches.Remove(cache);
2440 	mutex_unlock(&sCachesLock);
2441 
2442 	mutex_lock(&cache->lock);
2443 
2444 	// free all blocks
2445 
2446 	uint32 cookie = 0;
2447 	cached_block* block;
2448 	while ((block = (cached_block*)hash_remove_first(cache->hash,
2449 			&cookie)) != NULL) {
2450 		cache->FreeBlock(block);
2451 	}
2452 
2453 	// free all transactions (they will all be aborted)
2454 
2455 	cookie = 0;
2456 	cache_transaction* transaction;
2457 	while ((transaction = (cache_transaction*)hash_remove_first(
2458 			cache->transaction_hash, &cookie)) != NULL) {
2459 		delete transaction;
2460 	}
2461 
2462 	delete cache;
2463 }
2464 
2465 
2466 void*
2467 block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
2468 {
2469 	block_cache* cache = new(std::nothrow) block_cache(fd, numBlocks, blockSize,
2470 		readOnly);
2471 	if (cache == NULL)
2472 		return NULL;
2473 
2474 	if (cache->Init() != B_OK) {
2475 		delete cache;
2476 		return NULL;
2477 	}
2478 
2479 	MutexLocker _(sCachesLock);
2480 	sCaches.Add(cache);
2481 
2482 	return cache;
2483 }
2484 
2485 
2486 status_t
2487 block_cache_sync(void* _cache)
2488 {
2489 	block_cache* cache = (block_cache*)_cache;
2490 
2491 	// we will sync all dirty blocks to disk that have a completed
2492 	// transaction or no transaction only
2493 
2494 	MutexLocker locker(&cache->lock);
2495 	hash_iterator iterator;
2496 	hash_open(cache->hash, &iterator);
2497 
2498 	cached_block* block;
2499 	while ((block = (cached_block*)hash_next(cache->hash, &iterator)) != NULL) {
2500 		if (block->previous_transaction != NULL
2501 			|| (block->transaction == NULL && block->is_dirty)) {
2502 			status_t status = write_cached_block(cache, block);
2503 			if (status != B_OK)
2504 				return status;
2505 		}
2506 	}
2507 
2508 	hash_close(cache->hash, &iterator, false);
2509 	locker.Unlock();
2510 
2511 	wait_for_notifications(cache);
2512 		// make sure that all pending TRANSACTION_WRITTEN notifications
2513 		// are handled after we return
2514 	return B_OK;
2515 }
2516 
2517 
2518 status_t
2519 block_cache_sync_etc(void* _cache, off_t blockNumber, size_t numBlocks)
2520 {
2521 	block_cache* cache = (block_cache*)_cache;
2522 
2523 	// we will sync all dirty blocks to disk that have a completed
2524 	// transaction or no transaction only
2525 
2526 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
2527 		panic("block_cache_sync_etc: invalid block number %Ld (max %Ld)",
2528 			blockNumber, cache->max_blocks - 1);
2529 		return B_BAD_VALUE;
2530 	}
2531 
2532 	MutexLocker locker(&cache->lock);
2533 
2534 	for (; numBlocks > 0; numBlocks--, blockNumber++) {
2535 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
2536 			&blockNumber);
2537 		if (block == NULL)
2538 			continue;
2539 
2540 		if (block->previous_transaction != NULL
2541 			|| (block->transaction == NULL && block->is_dirty)) {
2542 			status_t status = write_cached_block(cache, block);
2543 			if (status != B_OK)
2544 				return status;
2545 		}
2546 	}
2547 
2548 	locker.Unlock();
2549 
2550 	wait_for_notifications(cache);
2551 		// make sure that all pending TRANSACTION_WRITTEN notifications
2552 		// are handled after we return
2553 	return B_OK;
2554 }
2555 
2556 
2557 void
2558 block_cache_discard(void* _cache, off_t blockNumber, size_t numBlocks)
2559 {
2560 	block_cache* cache = (block_cache*)_cache;
2561 	MutexLocker locker(&cache->lock);
2562 
2563 	for (; numBlocks > 0; numBlocks--, blockNumber++) {
2564 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
2565 			&blockNumber);
2566 		if (block == NULL)
2567 			continue;
2568 
2569 		if (block->previous_transaction != NULL)
2570 			write_cached_block(cache, block);
2571 
2572 		if (block->unused) {
2573 			cache->unused_blocks.Remove(block);
2574 			cache->RemoveBlock(block);
2575 		} else {
2576 			if (block->transaction != NULL && block->parent_data != NULL
2577 				&& block->parent_data != block->current_data) {
2578 				panic("Discarded block %Ld has already been changed in this "
2579 					"transaction!", blockNumber);
2580 			}
2581 
2582 			// mark it as discarded (in the current transaction only, if any)
2583 			block->discard = true;
2584 		}
2585 	}
2586 }
2587 
2588 
2589 status_t
2590 block_cache_make_writable(void* _cache, off_t blockNumber, int32 transaction)
2591 {
2592 	block_cache* cache = (block_cache*)_cache;
2593 	MutexLocker locker(&cache->lock);
2594 
2595 	if (cache->read_only)
2596 		panic("tried to make block writable on a read-only cache!");
2597 
2598 	// TODO: this can be done better!
2599 	void* block = get_writable_cached_block(cache, blockNumber,
2600 		blockNumber, 1, transaction, false);
2601 	if (block != NULL) {
2602 		put_cached_block((block_cache*)_cache, blockNumber);
2603 		return B_OK;
2604 	}
2605 
2606 	return B_ERROR;
2607 }
2608 
2609 
2610 void*
2611 block_cache_get_writable_etc(void* _cache, off_t blockNumber, off_t base,
2612 	off_t length, int32 transaction)
2613 {
2614 	block_cache* cache = (block_cache*)_cache;
2615 	MutexLocker locker(&cache->lock);
2616 
2617 	TRACE(("block_cache_get_writable_etc(block = %Ld, transaction = %ld)\n",
2618 		blockNumber, transaction));
2619 	if (cache->read_only)
2620 		panic("tried to get writable block on a read-only cache!");
2621 
2622 	return get_writable_cached_block(cache, blockNumber, base, length,
2623 		transaction, false);
2624 }
2625 
2626 
2627 void*
2628 block_cache_get_writable(void* _cache, off_t blockNumber, int32 transaction)
2629 {
2630 	return block_cache_get_writable_etc(_cache, blockNumber,
2631 		blockNumber, 1, transaction);
2632 }
2633 
2634 
2635 void*
2636 block_cache_get_empty(void* _cache, off_t blockNumber, int32 transaction)
2637 {
2638 	block_cache* cache = (block_cache*)_cache;
2639 	MutexLocker locker(&cache->lock);
2640 
2641 	TRACE(("block_cache_get_empty(block = %Ld, transaction = %ld)\n",
2642 		blockNumber, transaction));
2643 	if (cache->read_only)
2644 		panic("tried to get empty writable block on a read-only cache!");
2645 
2646 	return get_writable_cached_block((block_cache*)_cache, blockNumber,
2647 		blockNumber, 1, transaction, true);
2648 }
2649 
2650 
2651 const void*
2652 block_cache_get_etc(void* _cache, off_t blockNumber, off_t base, off_t length)
2653 {
2654 	block_cache* cache = (block_cache*)_cache;
2655 	MutexLocker locker(&cache->lock);
2656 	bool allocated;
2657 
2658 	cached_block* block = get_cached_block(cache, blockNumber, &allocated);
2659 	if (block == NULL)
2660 		return NULL;
2661 
2662 #if BLOCK_CACHE_DEBUG_CHANGED
2663 	if (block->compare == NULL)
2664 		block->compare = cache->Allocate();
2665 	if (block->compare != NULL)
2666 		memcpy(block->compare, block->current_data, cache->block_size);
2667 #endif
2668 	TB(Get(cache, block));
2669 
2670 	return block->current_data;
2671 }
2672 
2673 
2674 const void*
2675 block_cache_get(void* _cache, off_t blockNumber)
2676 {
2677 	return block_cache_get_etc(_cache, blockNumber, blockNumber, 1);
2678 }
2679 
2680 
2681 /*!	Changes the internal status of a writable block to \a dirty. This can be
2682 	helpful in case you realize you don't need to change that block anymore
2683 	for whatever reason.
2684 
2685 	Note, you must only use this function on blocks that were acquired
2686 	writable!
2687 */
2688 status_t
2689 block_cache_set_dirty(void* _cache, off_t blockNumber, bool dirty,
2690 	int32 transaction)
2691 {
2692 	block_cache* cache = (block_cache*)_cache;
2693 	MutexLocker locker(&cache->lock);
2694 
2695 	cached_block* block = (cached_block*)hash_lookup(cache->hash,
2696 		&blockNumber);
2697 	if (block == NULL)
2698 		return B_BAD_VALUE;
2699 	if (block->is_dirty == dirty) {
2700 		// there is nothing to do for us
2701 		return B_OK;
2702 	}
2703 
2704 	// TODO: not yet implemented
2705 	if (dirty)
2706 		panic("block_cache_set_dirty(): not yet implemented that way!\n");
2707 
2708 	return B_OK;
2709 }
2710 
2711 
2712 void
2713 block_cache_put(void* _cache, off_t blockNumber)
2714 {
2715 	block_cache* cache = (block_cache*)_cache;
2716 	MutexLocker locker(&cache->lock);
2717 
2718 	put_cached_block(cache, blockNumber);
2719 }
2720 
2721