xref: /haiku/src/system/kernel/cache/block_cache.cpp (revision 7a74a5df454197933bc6e80a542102362ee98703)
1 /*
2  * Copyright 2004-2012, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include <block_cache.h>
8 
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <errno.h>
13 
14 #include <KernelExport.h>
15 #include <fs_cache.h>
16 
17 #include <condition_variable.h>
18 #include <lock.h>
19 #include <low_resource_manager.h>
20 #include <slab/Slab.h>
21 #include <tracing.h>
22 #include <util/kernel_cpp.h>
23 #include <util/DoublyLinkedList.h>
24 #include <util/AutoLock.h>
25 #include <util/khash.h>
26 #include <vm/vm_page.h>
27 
28 #include "kernel_debug_config.h"
29 
30 
31 // TODO: this is a naive but growing implementation to test the API:
32 //	block reading/writing is not at all optimized for speed, it will
33 //	just read and write single blocks.
34 // TODO: the retrieval/copy of the original data could be delayed until the
35 //		new data must be written, ie. in low memory situations.
36 
37 //#define TRACE_BLOCK_CACHE
38 #ifdef TRACE_BLOCK_CACHE
39 #	define TRACE(x)	dprintf x
40 #else
41 #	define TRACE(x) ;
42 #endif
43 
44 // This macro is used for fatal situations that are acceptable in a running
45 // system, like out of memory situations - should only panic for debugging.
46 #define FATAL(x) panic x
47 
48 static const bigtime_t kTransactionIdleTime = 2000000LL;
49 	// a transaction is considered idle after 2 seconds of inactivity
50 
51 
52 struct cache_transaction;
53 struct cached_block;
54 struct block_cache;
55 typedef DoublyLinkedListLink<cached_block> block_link;
56 
57 struct cached_block {
58 	cached_block*	next;			// next in hash
59 	cached_block*	transaction_next;
60 	block_link		link;
61 	off_t			block_number;
62 	void*			current_data;
63 		// The data that is seen by everyone using the API; this one is always
64 		// present.
65 	void*			original_data;
66 		// When in a transaction, this contains the original data from before
67 		// the transaction.
68 	void*			parent_data;
69 		// This is a lazily alloced buffer that represents the contents of the
70 		// block in the parent transaction. It may point to current_data if the
71 		// contents have been changed only in the parent transaction, or, if the
72 		// block has been changed in the current sub transaction already, to a
73 		// new block containing the contents changed in the parent transaction.
74 		// If this is NULL, the block has not been changed in the parent
75 		// transaction at all.
76 #if BLOCK_CACHE_DEBUG_CHANGED
77 	void*			compare;
78 #endif
79 	int32			ref_count;
80 	int32			last_accessed;
81 	bool			busy_reading : 1;
82 	bool			busy_writing : 1;
83 	bool			is_writing : 1;
84 		// Block has been checked out for writing without transactions, and
85 		// cannot be written back if set
86 	bool			is_dirty : 1;
87 	bool			unused : 1;
88 	bool			discard : 1;
89 	bool			busy_reading_waiters : 1;
90 	bool			busy_writing_waiters : 1;
91 	cache_transaction* transaction;
92 		// This is the current active transaction, if any, the block is
93 		// currently in (meaning was changed as a part of it).
94 	cache_transaction* previous_transaction;
95 		// This is set to the last transaction that was ended containing this
96 		// block. In this case, the block has not yet written back yet, and
97 		// the changed data is either in current_data, or original_data -- the
98 		// latter if the block is already being part of another transaction.
99 		// There can only be one previous transaction, so when the active
100 		// transaction ends, the changes of the previous transaction have to
101 		// be written back before that transaction becomes the next previous
102 		// transaction.
103 
104 	bool CanBeWritten() const;
105 	int32 LastAccess() const
106 		{ return system_time() / 1000000L - last_accessed; }
107 
108 	static int Compare(void* _cacheEntry, const void* _block);
109 	static uint32 Hash(void* _cacheEntry, const void* _block, uint32 range);
110 };
111 
112 typedef DoublyLinkedList<cached_block,
113 	DoublyLinkedListMemberGetLink<cached_block,
114 		&cached_block::link> > block_list;
115 
116 struct cache_notification : DoublyLinkedListLinkImpl<cache_notification> {
117 	int32			transaction_id;
118 	int32			events_pending;
119 	int32			events;
120 	transaction_notification_hook hook;
121 	void*			data;
122 	bool			delete_after_event;
123 };
124 
125 typedef DoublyLinkedList<cache_notification> NotificationList;
126 
127 struct block_cache : DoublyLinkedListLinkImpl<block_cache> {
128 	hash_table*		hash;
129 	mutex			lock;
130 	int				fd;
131 	off_t			max_blocks;
132 	size_t			block_size;
133 	int32			next_transaction_id;
134 	cache_transaction* last_transaction;
135 	hash_table*		transaction_hash;
136 
137 	object_cache*	buffer_cache;
138 	block_list		unused_blocks;
139 	uint32			unused_block_count;
140 
141 	ConditionVariable busy_reading_condition;
142 	uint32			busy_reading_count;
143 	bool			busy_reading_waiters;
144 
145 	ConditionVariable busy_writing_condition;
146 	uint32			busy_writing_count;
147 	bool			busy_writing_waiters;
148 
149 	uint32			num_dirty_blocks;
150 	bool			read_only;
151 
152 	NotificationList pending_notifications;
153 	ConditionVariable condition_variable;
154 
155 					block_cache(int fd, off_t numBlocks, size_t blockSize,
156 						bool readOnly);
157 					~block_cache();
158 
159 	status_t		Init();
160 
161 	void			Free(void* buffer);
162 	void*			Allocate();
163 	void			FreeBlock(cached_block* block);
164 	cached_block*	NewBlock(off_t blockNumber);
165 	void			FreeBlockParentData(cached_block* block);
166 
167 	void			RemoveUnusedBlocks(int32 count, int32 minSecondsOld = 0);
168 	void			RemoveBlock(cached_block* block);
169 	void			DiscardBlock(cached_block* block);
170 
171 private:
172 	static void		_LowMemoryHandler(void* data, uint32 resources,
173 						int32 level);
174 	cached_block*	_GetUnusedBlock();
175 };
176 
177 struct cache_listener;
178 typedef DoublyLinkedListLink<cache_listener> listener_link;
179 
180 struct cache_listener : cache_notification {
181 	listener_link	link;
182 };
183 
184 typedef DoublyLinkedList<cache_listener,
185 	DoublyLinkedListMemberGetLink<cache_listener,
186 		&cache_listener::link> > ListenerList;
187 
188 
189 struct cache_transaction {
190 	cache_transaction();
191 
192 	cache_transaction* next;
193 	int32			id;
194 	int32			num_blocks;
195 	int32			main_num_blocks;
196 	int32			sub_num_blocks;
197 	cached_block*	first_block;
198 	block_list		blocks;
199 	ListenerList	listeners;
200 	bool			open;
201 	bool			has_sub_transaction;
202 	bigtime_t		last_used;
203 	int32			busy_writing_count;
204 };
205 
206 
207 class BlockWriter {
208 public:
209 								BlockWriter(block_cache* cache,
210 									size_t max = SIZE_MAX);
211 								~BlockWriter();
212 
213 			bool				Add(cached_block* block,
214 									hash_iterator* iterator = NULL);
215 			bool				Add(cache_transaction* transaction,
216 									hash_iterator* iterator,
217 									bool& hasLeftOvers);
218 
219 			status_t			Write(hash_iterator* iterator = NULL,
220 									bool canUnlock = true);
221 
222 			bool				DeletedTransaction() const
223 									{ return fDeletedTransaction; }
224 
225 	static	status_t			WriteBlock(block_cache* cache,
226 									cached_block* block);
227 
228 private:
229 			void*				_Data(cached_block* block) const;
230 			status_t			_WriteBlock(cached_block* block);
231 			void				_BlockDone(cached_block* block,
232 									hash_iterator* iterator);
233 			void				_UnmarkWriting(cached_block* block);
234 
235 	static	int					_CompareBlocks(const void* _blockA,
236 									const void* _blockB);
237 
238 private:
239 	static	const size_t		kBufferSize = 64;
240 
241 			block_cache*		fCache;
242 			cached_block*		fBuffer[kBufferSize];
243 			cached_block**		fBlocks;
244 			size_t				fCount;
245 			size_t				fTotal;
246 			size_t				fCapacity;
247 			size_t				fMax;
248 			status_t			fStatus;
249 			bool				fDeletedTransaction;
250 };
251 
252 
253 class TransactionLocking {
254 public:
255 	inline bool Lock(block_cache* cache)
256 	{
257 		mutex_lock(&cache->lock);
258 
259 		while (cache->busy_writing_count != 0) {
260 			// wait for all blocks to be written
261 			ConditionVariableEntry entry;
262 			cache->busy_writing_condition.Add(&entry);
263 			cache->busy_writing_waiters = true;
264 
265 			mutex_unlock(&cache->lock);
266 
267 			entry.Wait();
268 
269 			mutex_lock(&cache->lock);
270 		}
271 
272 		return true;
273 	}
274 
275 	inline void Unlock(block_cache* cache)
276 	{
277 		mutex_unlock(&cache->lock);
278 	}
279 };
280 
281 typedef AutoLocker<block_cache, TransactionLocking> TransactionLocker;
282 
283 
284 #if BLOCK_CACHE_BLOCK_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
285 namespace BlockTracing {
286 
287 class Action : public AbstractTraceEntry {
288 public:
289 	Action(block_cache* cache, cached_block* block)
290 		:
291 		fCache(cache),
292 		fBlockNumber(block->block_number),
293 		fIsDirty(block->is_dirty),
294 		fHasOriginal(block->original_data != NULL),
295 		fHasParent(block->parent_data != NULL),
296 		fTransactionID(-1),
297 		fPreviousID(-1)
298 	{
299 		if (block->transaction != NULL)
300 			fTransactionID = block->transaction->id;
301 		if (block->previous_transaction != NULL)
302 			fPreviousID = block->previous_transaction->id;
303 	}
304 
305 	virtual void AddDump(TraceOutput& out)
306 	{
307 		out.Print("block cache %p, %s %Ld, %c%c%c transaction %ld "
308 			"(previous id %ld)\n", fCache, _Action(), fBlockNumber,
309 			fIsDirty ? 'd' : '-', fHasOriginal ? 'o' : '-',
310 			fHasParent ? 'p' : '-', fTransactionID, fPreviousID);
311 	}
312 
313 	virtual const char* _Action() const = 0;
314 
315 private:
316 	block_cache*		fCache;
317 	uint64				fBlockNumber;
318 	bool				fIsDirty;
319 	bool				fHasOriginal;
320 	bool				fHasParent;
321 	int32				fTransactionID;
322 	int32				fPreviousID;
323 };
324 
325 class Get : public Action {
326 public:
327 	Get(block_cache* cache, cached_block* block)
328 		:
329 		Action(cache, block)
330 	{
331 		Initialized();
332 	}
333 
334 	virtual const char* _Action() const { return "get"; }
335 };
336 
337 class Put : public Action {
338 public:
339 	Put(block_cache* cache, cached_block* block)
340 		:
341 		Action(cache, block)
342 	{
343 		Initialized();
344 	}
345 
346 	virtual const char* _Action() const { return "put"; }
347 };
348 
349 class Read : public Action {
350 public:
351 	Read(block_cache* cache, cached_block* block)
352 		:
353 		Action(cache, block)
354 	{
355 		Initialized();
356 	}
357 
358 	virtual const char* _Action() const { return "read"; }
359 };
360 
361 class Write : public Action {
362 public:
363 	Write(block_cache* cache, cached_block* block)
364 		:
365 		Action(cache, block)
366 	{
367 		Initialized();
368 	}
369 
370 	virtual const char* _Action() const { return "write"; }
371 };
372 
373 class Flush : public Action {
374 public:
375 	Flush(block_cache* cache, cached_block* block, bool getUnused = false)
376 		:
377 		Action(cache, block),
378 		fGetUnused(getUnused)
379 	{
380 		Initialized();
381 	}
382 
383 	virtual const char* _Action() const
384 		{ return fGetUnused ? "get-unused" : "flush"; }
385 
386 private:
387 	bool	fGetUnused;
388 };
389 
390 class Error : public AbstractTraceEntry {
391 public:
392 	Error(block_cache* cache, uint64 blockNumber, const char* message,
393 			status_t status = B_OK)
394 		:
395 		fCache(cache),
396 		fBlockNumber(blockNumber),
397 		fMessage(message),
398 		fStatus(status)
399 	{
400 		Initialized();
401 	}
402 
403 	virtual void AddDump(TraceOutput& out)
404 	{
405 		out.Print("block cache %p, error %Ld, %s%s%s",
406 			fCache, fBlockNumber, fMessage, fStatus != B_OK ? ": " : "",
407 			fStatus != B_OK ? strerror(fStatus) : "");
408 	}
409 
410 private:
411 	block_cache*	fCache;
412 	uint64			fBlockNumber;
413 	const char*		fMessage;
414 	status_t		fStatus;
415 };
416 
417 #if BLOCK_CACHE_BLOCK_TRACING >= 2
418 class BlockData : public AbstractTraceEntry {
419 public:
420 	enum {
421 		kCurrent	= 0x01,
422 		kParent		= 0x02,
423 		kOriginal	= 0x04
424 	};
425 
426 	BlockData(block_cache* cache, cached_block* block, const char* message)
427 		:
428 		fCache(cache),
429 		fSize(cache->block_size),
430 		fBlockNumber(block->block_number),
431 		fMessage(message)
432 	{
433 		_Allocate(fCurrent, block->current_data);
434 		_Allocate(fParent, block->parent_data);
435 		_Allocate(fOriginal, block->original_data);
436 
437 #if KTRACE_PRINTF_STACK_TRACE
438 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
439 			false);
440 #endif
441 
442 		Initialized();
443 	}
444 
445 	virtual void AddDump(TraceOutput& out)
446 	{
447 		out.Print("block cache %p, block %Ld, data %c%c%c: %s",
448 			fCache, fBlockNumber, fCurrent != NULL ? 'c' : '-',
449 			fParent != NULL ? 'p' : '-', fOriginal != NULL ? 'o' : '-',
450 			fMessage);
451 	}
452 
453 #if KTRACE_PRINTF_STACK_TRACE
454 	virtual void DumpStackTrace(TraceOutput& out)
455 	{
456 		out.PrintStackTrace(fStackTrace);
457 	}
458 #endif
459 
460 	void DumpBlocks(uint32 which, uint32 offset, uint32 size)
461 	{
462 		if ((which & kCurrent) != 0)
463 			DumpBlock(kCurrent, offset, size);
464 		if ((which & kParent) != 0)
465 			DumpBlock(kParent, offset, size);
466 		if ((which & kOriginal) != 0)
467 			DumpBlock(kOriginal, offset, size);
468 	}
469 
470 	void DumpBlock(uint32 which, uint32 offset, uint32 size)
471 	{
472 		if (offset > fSize) {
473 			kprintf("invalid offset (block size %lu)\n", fSize);
474 			return;
475 		}
476 		if (offset + size > fSize)
477 			size = fSize - offset;
478 
479 		const char* label;
480 		uint8* data;
481 
482 		if ((which & kCurrent) != 0) {
483 			label = "current";
484 			data = fCurrent;
485 		} else if ((which & kParent) != 0) {
486 			label = "parent";
487 			data = fParent;
488 		} else if ((which & kOriginal) != 0) {
489 			label = "original";
490 			data = fOriginal;
491 		} else
492 			return;
493 
494 		kprintf("%s: offset %lu, %lu bytes\n", label, offset, size);
495 
496 		static const uint32 kBlockSize = 16;
497 		data += offset;
498 
499 		for (uint32 i = 0; i < size;) {
500 			int start = i;
501 
502 			kprintf("  %04lx ", i);
503 			for (; i < start + kBlockSize; i++) {
504 				if (!(i % 4))
505 					kprintf(" ");
506 
507 				if (i >= size)
508 					kprintf("  ");
509 				else
510 					kprintf("%02x", data[i]);
511 			}
512 
513 			kprintf("\n");
514 		}
515 	}
516 
517 private:
518 	void _Allocate(uint8*& target, void* source)
519 	{
520 		if (source == NULL) {
521 			target = NULL;
522 			return;
523 		}
524 
525 		target = alloc_tracing_buffer_memcpy(source, fSize, false);
526 	}
527 
528 	block_cache*	fCache;
529 	uint32			fSize;
530 	uint64			fBlockNumber;
531 	const char*		fMessage;
532 	uint8*			fCurrent;
533 	uint8*			fParent;
534 	uint8*			fOriginal;
535 #if KTRACE_PRINTF_STACK_TRACE
536 	tracing_stack_trace* fStackTrace;
537 #endif
538 };
539 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
540 
541 }	// namespace BlockTracing
542 
543 #	define TB(x) new(std::nothrow) BlockTracing::x;
544 #else
545 #	define TB(x) ;
546 #endif
547 
548 #if BLOCK_CACHE_BLOCK_TRACING >= 2
549 #	define TB2(x) new(std::nothrow) BlockTracing::x;
550 #else
551 #	define TB2(x) ;
552 #endif
553 
554 
555 #if BLOCK_CACHE_TRANSACTION_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
556 namespace TransactionTracing {
557 
558 class Action : public AbstractTraceEntry {
559 public:
560 	Action(const char* label, block_cache* cache,
561 			cache_transaction* transaction)
562 		:
563 		fCache(cache),
564 		fTransaction(transaction),
565 		fID(transaction->id),
566 		fSub(transaction->has_sub_transaction),
567 		fNumBlocks(transaction->num_blocks),
568 		fSubNumBlocks(transaction->sub_num_blocks)
569 	{
570 		strlcpy(fLabel, label, sizeof(fLabel));
571 		Initialized();
572 	}
573 
574 	virtual void AddDump(TraceOutput& out)
575 	{
576 		out.Print("block cache %p, %s transaction %p (id %ld)%s"
577 			", %ld/%ld blocks", fCache, fLabel, fTransaction, fID,
578 			fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
579 	}
580 
581 private:
582 	char				fLabel[12];
583 	block_cache*		fCache;
584 	cache_transaction*	fTransaction;
585 	int32				fID;
586 	bool				fSub;
587 	int32				fNumBlocks;
588 	int32				fSubNumBlocks;
589 };
590 
591 class Detach : public AbstractTraceEntry {
592 public:
593 	Detach(block_cache* cache, cache_transaction* transaction,
594 			cache_transaction* newTransaction)
595 		:
596 		fCache(cache),
597 		fTransaction(transaction),
598 		fID(transaction->id),
599 		fSub(transaction->has_sub_transaction),
600 		fNewTransaction(newTransaction),
601 		fNewID(newTransaction->id)
602 	{
603 		Initialized();
604 	}
605 
606 	virtual void AddDump(TraceOutput& out)
607 	{
608 		out.Print("block cache %p, detach transaction %p (id %ld)"
609 			"from transaction %p (id %ld)%s",
610 			fCache, fNewTransaction, fNewID, fTransaction, fID,
611 			fSub ? " sub" : "");
612 	}
613 
614 private:
615 	block_cache*		fCache;
616 	cache_transaction*	fTransaction;
617 	int32				fID;
618 	bool				fSub;
619 	cache_transaction*	fNewTransaction;
620 	int32				fNewID;
621 };
622 
623 class Abort : public AbstractTraceEntry {
624 public:
625 	Abort(block_cache* cache, cache_transaction* transaction)
626 		:
627 		fCache(cache),
628 		fTransaction(transaction),
629 		fID(transaction->id),
630 		fNumBlocks(0)
631 	{
632 		bool isSub = transaction->has_sub_transaction;
633 		fNumBlocks = isSub ? transaction->sub_num_blocks
634 			: transaction->num_blocks;
635 		fBlocks = (off_t*)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
636 		if (fBlocks != NULL) {
637 			cached_block* block = transaction->first_block;
638 			for (int32 i = 0; block != NULL && i < fNumBlocks;
639 					block = block->transaction_next) {
640 				fBlocks[i++] = block->block_number;
641 			}
642 		} else
643 			fNumBlocks = 0;
644 
645 #if KTRACE_PRINTF_STACK_TRACE
646 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
647 			false);
648 #endif
649 
650 		Initialized();
651 	}
652 
653 	virtual void AddDump(TraceOutput& out)
654 	{
655 		out.Print("block cache %p, abort transaction "
656 			"%p (id %ld), blocks", fCache, fTransaction, fID);
657 		for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
658 			out.Print(" %Ld", fBlocks[i]);
659 	}
660 
661 #if KTRACE_PRINTF_STACK_TRACE
662 	virtual void DumpStackTrace(TraceOutput& out)
663 	{
664 		out.PrintStackTrace(fStackTrace);
665 	}
666 #endif
667 
668 private:
669 	block_cache*		fCache;
670 	cache_transaction*	fTransaction;
671 	int32				fID;
672 	off_t*				fBlocks;
673 	int32				fNumBlocks;
674 #if KTRACE_PRINTF_STACK_TRACE
675 	tracing_stack_trace* fStackTrace;
676 #endif
677 };
678 
679 }	// namespace TransactionTracing
680 
681 #	define T(x) new(std::nothrow) TransactionTracing::x;
682 #else
683 #	define T(x) ;
684 #endif
685 
686 
687 static DoublyLinkedList<block_cache> sCaches;
688 static mutex sCachesLock = MUTEX_INITIALIZER("block caches");
689 static mutex sCachesMemoryUseLock
690 	= MUTEX_INITIALIZER("block caches memory use");
691 static size_t sUsedMemory;
692 static sem_id sEventSemaphore;
693 static mutex sNotificationsLock
694 	= MUTEX_INITIALIZER("block cache notifications");
695 static thread_id sNotifierWriterThread;
696 static DoublyLinkedListLink<block_cache> sMarkCache;
697 	// TODO: this only works if the link is the first entry of block_cache
698 static object_cache* sBlockCache;
699 
700 
701 //	#pragma mark - notifications/listener
702 
703 
704 /*!	Checks whether or not this is an event that closes a transaction. */
705 static inline bool
706 is_closing_event(int32 event)
707 {
708 	return (event & (TRANSACTION_ABORTED | TRANSACTION_ENDED)) != 0;
709 }
710 
711 
712 static inline bool
713 is_written_event(int32 event)
714 {
715 	return (event & TRANSACTION_WRITTEN) != 0;
716 }
717 
718 
719 /*!	From the specified \a notification, it will remove the lowest pending
720 	event, and return that one in \a _event.
721 	If there is no pending event anymore, it will return \c false.
722 */
723 static bool
724 get_next_pending_event(cache_notification* notification, int32* _event)
725 {
726 	for (int32 eventMask = 1; eventMask <= TRANSACTION_IDLE; eventMask <<= 1) {
727 		int32 pending = atomic_and(&notification->events_pending,
728 			~eventMask);
729 
730 		bool more = (pending & ~eventMask) != 0;
731 
732 		if ((pending & eventMask) != 0) {
733 			*_event = eventMask;
734 			return more;
735 		}
736 	}
737 
738 	return false;
739 }
740 
741 
742 static void
743 flush_pending_notifications(block_cache* cache)
744 {
745 	ASSERT_LOCKED_MUTEX(&sCachesLock);
746 
747 	while (true) {
748 		MutexLocker locker(sNotificationsLock);
749 
750 		cache_notification* notification = cache->pending_notifications.Head();
751 		if (notification == NULL)
752 			return;
753 
754 		bool deleteAfterEvent = false;
755 		int32 event = -1;
756 		if (!get_next_pending_event(notification, &event)) {
757 			// remove the notification if this was the last pending event
758 			cache->pending_notifications.Remove(notification);
759 			deleteAfterEvent = notification->delete_after_event;
760 		}
761 
762 		if (event >= 0) {
763 			// Notify listener, we need to copy the notification, as it might
764 			// be removed when we unlock the list.
765 			cache_notification copy = *notification;
766 			locker.Unlock();
767 
768 			copy.hook(copy.transaction_id, event, copy.data);
769 
770 			locker.Lock();
771 		}
772 
773 		if (deleteAfterEvent)
774 			delete notification;
775 	}
776 }
777 
778 
779 /*!	Flushes all pending notifications by calling the appropriate hook
780 	functions.
781 	Must not be called with a cache lock held.
782 */
783 static void
784 flush_pending_notifications()
785 {
786 	MutexLocker _(sCachesLock);
787 
788 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
789 	while (iterator.HasNext()) {
790 		block_cache* cache = iterator.Next();
791 
792 		flush_pending_notifications(cache);
793 	}
794 }
795 
796 
797 /*!	Initializes the \a notification as specified. */
798 static void
799 set_notification(cache_transaction* transaction,
800 	cache_notification &notification, int32 events,
801 	transaction_notification_hook hook, void* data)
802 {
803 	notification.transaction_id = transaction != NULL ? transaction->id : -1;
804 	notification.events_pending = 0;
805 	notification.events = events;
806 	notification.hook = hook;
807 	notification.data = data;
808 	notification.delete_after_event = false;
809 }
810 
811 
812 /*!	Makes sure the notification is deleted. It either deletes it directly,
813 	when possible, or marks it for deletion if the notification is pending.
814 */
815 static void
816 delete_notification(cache_notification* notification)
817 {
818 	MutexLocker locker(sNotificationsLock);
819 
820 	if (notification->events_pending != 0)
821 		notification->delete_after_event = true;
822 	else
823 		delete notification;
824 }
825 
826 
827 /*!	Adds the notification to the pending notifications list, or, if it's
828 	already part of it, updates its events_pending field.
829 	Also marks the notification to be deleted if \a deleteNotification
830 	is \c true.
831 	Triggers the notifier thread to run.
832 */
833 static void
834 add_notification(block_cache* cache, cache_notification* notification,
835 	int32 event, bool deleteNotification)
836 {
837 	if (notification->hook == NULL)
838 		return;
839 
840 	int32 pending = atomic_or(&notification->events_pending, event);
841 	if (pending == 0) {
842 		// not yet part of the notification list
843 		MutexLocker locker(sNotificationsLock);
844 		if (deleteNotification)
845 			notification->delete_after_event = true;
846 		cache->pending_notifications.Add(notification);
847 	} else if (deleteNotification) {
848 		// we might need to delete it ourselves if we're late
849 		delete_notification(notification);
850 	}
851 
852 	release_sem_etc(sEventSemaphore, 1, B_DO_NOT_RESCHEDULE);
853 		// We're probably still holding some locks that makes rescheduling
854 		// not a good idea at this point.
855 }
856 
857 
858 /*!	Notifies all interested listeners of this transaction about the \a event.
859 	If \a event is a closing event (ie. TRANSACTION_ENDED, and
860 	TRANSACTION_ABORTED), all listeners except those listening to
861 	TRANSACTION_WRITTEN will be removed.
862 */
863 static void
864 notify_transaction_listeners(block_cache* cache, cache_transaction* transaction,
865 	int32 event)
866 {
867 	T(Action("notify", cache, transaction));
868 
869 	bool isClosing = is_closing_event(event);
870 	bool isWritten = is_written_event(event);
871 
872 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
873 	while (iterator.HasNext()) {
874 		cache_listener* listener = iterator.Next();
875 
876 		bool remove = (isClosing && !is_written_event(listener->events))
877 			|| (isWritten && is_written_event(listener->events));
878 		if (remove)
879 			iterator.Remove();
880 
881 		if ((listener->events & event) != 0)
882 			add_notification(cache, listener, event, remove);
883 		else if (remove)
884 			delete_notification(listener);
885 	}
886 }
887 
888 
889 /*!	Removes and deletes all listeners that are still monitoring this
890 	transaction.
891 */
892 static void
893 remove_transaction_listeners(block_cache* cache, cache_transaction* transaction)
894 {
895 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
896 	while (iterator.HasNext()) {
897 		cache_listener* listener = iterator.Next();
898 		iterator.Remove();
899 
900 		delete_notification(listener);
901 	}
902 }
903 
904 
905 static status_t
906 add_transaction_listener(block_cache* cache, cache_transaction* transaction,
907 	int32 events, transaction_notification_hook hookFunction, void* data)
908 {
909 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
910 	while (iterator.HasNext()) {
911 		cache_listener* listener = iterator.Next();
912 
913 		if (listener->data == data && listener->hook == hookFunction) {
914 			// this listener already exists, just update it
915 			listener->events |= events;
916 			return B_OK;
917 		}
918 	}
919 
920 	cache_listener* listener = new(std::nothrow) cache_listener;
921 	if (listener == NULL)
922 		return B_NO_MEMORY;
923 
924 	set_notification(transaction, *listener, events, hookFunction, data);
925 	transaction->listeners.Add(listener);
926 	return B_OK;
927 }
928 
929 
930 //	#pragma mark - private transaction
931 
932 
933 cache_transaction::cache_transaction()
934 {
935 	num_blocks = 0;
936 	main_num_blocks = 0;
937 	sub_num_blocks = 0;
938 	first_block = NULL;
939 	open = true;
940 	has_sub_transaction = false;
941 	last_used = system_time();
942 	busy_writing_count = 0;
943 }
944 
945 
946 static int
947 transaction_compare(void* _transaction, const void* _id)
948 {
949 	cache_transaction* transaction = (cache_transaction*)_transaction;
950 	const int32* id = (const int32*)_id;
951 
952 	return transaction->id - *id;
953 }
954 
955 
956 static uint32
957 transaction_hash(void* _transaction, const void* _id, uint32 range)
958 {
959 	cache_transaction* transaction = (cache_transaction*)_transaction;
960 	const int32* id = (const int32*)_id;
961 
962 	if (transaction != NULL)
963 		return transaction->id % range;
964 
965 	return (uint32)*id % range;
966 }
967 
968 
969 static void
970 delete_transaction(block_cache* cache, cache_transaction* transaction)
971 {
972 	if (cache->last_transaction == transaction)
973 		cache->last_transaction = NULL;
974 
975 	remove_transaction_listeners(cache, transaction);
976 	delete transaction;
977 }
978 
979 
980 static cache_transaction*
981 lookup_transaction(block_cache* cache, int32 id)
982 {
983 	return (cache_transaction*)hash_lookup(cache->transaction_hash, &id);
984 }
985 
986 
987 /*!	Writes back any changes made to blocks in \a transaction that are still
988 	part of a previous transacton.
989 */
990 static status_t
991 write_blocks_in_previous_transaction(block_cache* cache,
992 	cache_transaction* transaction)
993 {
994 	BlockWriter writer(cache);
995 
996 	cached_block* block = transaction->first_block;
997 	for (; block != NULL; block = block->transaction_next) {
998 		if (block->previous_transaction != NULL) {
999 			// need to write back pending changes
1000 			writer.Add(block);
1001 		}
1002 	}
1003 
1004 	return writer.Write();
1005 }
1006 
1007 
1008 //	#pragma mark - cached_block
1009 
1010 
1011 bool
1012 cached_block::CanBeWritten() const
1013 {
1014 	return !busy_writing && !busy_reading
1015 		&& (previous_transaction != NULL
1016 			|| (transaction == NULL && is_dirty && !is_writing));
1017 }
1018 
1019 
1020 /*static*/ int
1021 cached_block::Compare(void* _cacheEntry, const void* _block)
1022 {
1023 	cached_block* cacheEntry = (cached_block*)_cacheEntry;
1024 	const off_t* block = (const off_t*)_block;
1025 
1026 	off_t diff = cacheEntry->block_number - *block;
1027 	if (diff > 0)
1028 		return 1;
1029 
1030 	return diff < 0 ? -1 : 0;
1031 }
1032 
1033 
1034 /*static*/ uint32
1035 cached_block::Hash(void* _cacheEntry, const void* _block, uint32 range)
1036 {
1037 	cached_block* cacheEntry = (cached_block*)_cacheEntry;
1038 	const off_t* block = (const off_t*)_block;
1039 
1040 	if (cacheEntry != NULL)
1041 		return cacheEntry->block_number % range;
1042 
1043 	return (uint64)*block % range;
1044 }
1045 
1046 
1047 //	#pragma mark - BlockWriter
1048 
1049 
1050 BlockWriter::BlockWriter(block_cache* cache, size_t max)
1051 	:
1052 	fCache(cache),
1053 	fBlocks(fBuffer),
1054 	fCount(0),
1055 	fTotal(0),
1056 	fCapacity(kBufferSize),
1057 	fMax(max),
1058 	fStatus(B_OK),
1059 	fDeletedTransaction(false)
1060 {
1061 }
1062 
1063 
1064 BlockWriter::~BlockWriter()
1065 {
1066 	if (fBlocks != fBuffer)
1067 		free(fBlocks);
1068 }
1069 
1070 
1071 /*!	Adds the specified block to the to be written array. If no more blocks can
1072 	be added, false is returned, otherwise true.
1073 */
1074 bool
1075 BlockWriter::Add(cached_block* block, hash_iterator* iterator)
1076 {
1077 	ASSERT(block->CanBeWritten());
1078 
1079 	if (fTotal == fMax)
1080 		return false;
1081 
1082 	if (fCount >= fCapacity) {
1083 		// Enlarge array if necessary
1084 		cached_block** newBlocks;
1085 		size_t newCapacity = max_c(256, fCapacity * 2);
1086 		if (fBlocks == fBuffer)
1087 			newBlocks = (cached_block**)malloc(newCapacity * sizeof(void*));
1088 		else {
1089 			newBlocks = (cached_block**)realloc(fBlocks,
1090 				newCapacity * sizeof(void*));
1091 		}
1092 
1093 		if (newBlocks == NULL) {
1094 			// Allocating a larger array failed - we need to write back what
1095 			// we have synchronously now (this will also clear the array)
1096 			Write(iterator, false);
1097 		} else {
1098 			if (fBlocks == fBuffer)
1099 				memcpy(newBlocks, fBuffer, kBufferSize * sizeof(void*));
1100 
1101 			fBlocks = newBlocks;
1102 			fCapacity = newCapacity;
1103 		}
1104 	}
1105 
1106 	fBlocks[fCount++] = block;
1107 	fTotal++;
1108 	block->busy_writing = true;
1109 	fCache->busy_writing_count++;
1110 	if (block->previous_transaction != NULL)
1111 		block->previous_transaction->busy_writing_count++;
1112 
1113 	return true;
1114 }
1115 
1116 
1117 /*!	Adds all blocks of the specified transaction to the to be written array.
1118 	If no more blocks can be added, false is returned, otherwise true.
1119 */
1120 bool
1121 BlockWriter::Add(cache_transaction* transaction, hash_iterator* iterator,
1122 	bool& hasLeftOvers)
1123 {
1124 	ASSERT(!transaction->open);
1125 
1126 	if (transaction->busy_writing_count != 0) {
1127 		hasLeftOvers = true;
1128 		return true;
1129 	}
1130 
1131 	hasLeftOvers = false;
1132 
1133 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
1134 	while (cached_block* block = blockIterator.Next()) {
1135 		if (!block->CanBeWritten()) {
1136 			// This block was already part of a previous transaction within this
1137 			// writer
1138 			hasLeftOvers = true;
1139 			continue;
1140 		}
1141 		if (!Add(block, iterator))
1142 			return false;
1143 
1144 		if (DeletedTransaction())
1145 			break;
1146 	}
1147 
1148 	return true;
1149 }
1150 
1151 
1152 /*! Cache must be locked when calling this method, but it will be unlocked
1153 	while the blocks are written back.
1154 */
1155 status_t
1156 BlockWriter::Write(hash_iterator* iterator, bool canUnlock)
1157 {
1158 	if (fCount == 0)
1159 		return B_OK;
1160 
1161 	if (canUnlock)
1162 		mutex_unlock(&fCache->lock);
1163 
1164 	// Sort blocks in their on-disk order
1165 	// TODO: ideally, this should be handled by the I/O scheduler
1166 
1167 	qsort(fBlocks, fCount, sizeof(void*), &_CompareBlocks);
1168 	fDeletedTransaction = false;
1169 
1170 	for (uint32 i = 0; i < fCount; i++) {
1171 		status_t status = _WriteBlock(fBlocks[i]);
1172 		if (status != B_OK) {
1173 			// propagate to global error handling
1174 			if (fStatus == B_OK)
1175 				fStatus = status;
1176 
1177 			_UnmarkWriting(fBlocks[i]);
1178 			fBlocks[i] = NULL;
1179 				// This block will not be marked clean
1180 		}
1181 	}
1182 
1183 	if (canUnlock)
1184 		mutex_lock(&fCache->lock);
1185 
1186 	for (uint32 i = 0; i < fCount; i++)
1187 		_BlockDone(fBlocks[i], iterator);
1188 
1189 	fCount = 0;
1190 	return fStatus;
1191 }
1192 
1193 
1194 /*!	Writes the specified \a block back to disk. It will always only write back
1195 	the oldest change of the block if it is part of more than one transaction.
1196 	It will automatically send out TRANSACTION_WRITTEN notices, as well as
1197 	delete transactions when they are no longer used, and \a deleteTransaction
1198 	is \c true.
1199 */
1200 /*static*/ status_t
1201 BlockWriter::WriteBlock(block_cache* cache, cached_block* block)
1202 {
1203 	BlockWriter writer(cache);
1204 
1205 	writer.Add(block);
1206 	return writer.Write();
1207 }
1208 
1209 
1210 void*
1211 BlockWriter::_Data(cached_block* block) const
1212 {
1213 	return block->previous_transaction != NULL && block->original_data != NULL
1214 		? block->original_data : block->current_data;
1215 		// We first need to write back changes from previous transactions
1216 }
1217 
1218 
1219 status_t
1220 BlockWriter::_WriteBlock(cached_block* block)
1221 {
1222 	ASSERT(block->busy_writing);
1223 
1224 	TRACE(("BlockWriter::_WriteBlock(block %Ld)\n", block->block_number));
1225 	TB(Write(fCache, block));
1226 	TB2(BlockData(fCache, block, "before write"));
1227 
1228 	size_t blockSize = fCache->block_size;
1229 
1230 	ssize_t written = write_pos(fCache->fd,
1231 		block->block_number * blockSize, _Data(block), blockSize);
1232 
1233 	if (written != (ssize_t)blockSize) {
1234 		TB(Error(fCache, block->block_number, "write failed", written));
1235 		FATAL(("could not write back block %Ld (%s)\n", block->block_number,
1236 			strerror(errno)));
1237 		if (written < 0)
1238 			return errno;
1239 
1240 		return B_IO_ERROR;
1241 	}
1242 
1243 	return B_OK;
1244 }
1245 
1246 
1247 void
1248 BlockWriter::_BlockDone(cached_block* block, hash_iterator* iterator)
1249 {
1250 	if (block == NULL) {
1251 		// An error occured when trying to write this block
1252 		return;
1253 	}
1254 
1255 	if (fCache->num_dirty_blocks > 0)
1256 		fCache->num_dirty_blocks--;
1257 
1258 	if (_Data(block) == block->current_data)
1259 		block->is_dirty = false;
1260 
1261 	_UnmarkWriting(block);
1262 
1263 	cache_transaction* previous = block->previous_transaction;
1264 	if (previous != NULL) {
1265 		previous->blocks.Remove(block);
1266 		block->previous_transaction = NULL;
1267 
1268 		if (block->original_data != NULL && block->transaction == NULL) {
1269 			// This block is not part of a transaction, so it does not need
1270 			// its original pointer anymore.
1271 			fCache->Free(block->original_data);
1272 			block->original_data = NULL;
1273 		}
1274 
1275 		// Has the previous transation been finished with that write?
1276 		if (--previous->num_blocks == 0) {
1277 			TRACE(("cache transaction %ld finished!\n", previous->id));
1278 			T(Action("written", fCache, previous));
1279 
1280 			notify_transaction_listeners(fCache, previous,
1281 				TRANSACTION_WRITTEN);
1282 
1283 			if (iterator != NULL)
1284 				hash_remove_current(fCache->transaction_hash, iterator);
1285 			else
1286 				hash_remove(fCache->transaction_hash, previous);
1287 
1288 			delete_transaction(fCache, previous);
1289 			fDeletedTransaction = true;
1290 		}
1291 	}
1292 	if (block->transaction == NULL && block->ref_count == 0 && !block->unused) {
1293 		// the block is no longer used
1294 		block->unused = true;
1295 		fCache->unused_blocks.Add(block);
1296 		fCache->unused_block_count++;
1297 	}
1298 
1299 	TB2(BlockData(fCache, block, "after write"));
1300 }
1301 
1302 
1303 void
1304 BlockWriter::_UnmarkWriting(cached_block* block)
1305 {
1306 	block->busy_writing = false;
1307 	if (block->previous_transaction != NULL)
1308 		block->previous_transaction->busy_writing_count--;
1309 	fCache->busy_writing_count--;
1310 
1311 	if ((fCache->busy_writing_waiters && fCache->busy_writing_count == 0)
1312 		|| block->busy_writing_waiters) {
1313 		fCache->busy_writing_waiters = false;
1314 		block->busy_writing_waiters = false;
1315 		fCache->busy_writing_condition.NotifyAll();
1316 	}
1317 }
1318 
1319 
1320 /*static*/ int
1321 BlockWriter::_CompareBlocks(const void* _blockA, const void* _blockB)
1322 {
1323 	cached_block* blockA = *(cached_block**)_blockA;
1324 	cached_block* blockB = *(cached_block**)_blockB;
1325 
1326 	off_t diff = blockA->block_number - blockB->block_number;
1327 	if (diff > 0)
1328 		return 1;
1329 
1330 	return diff < 0 ? -1 : 0;
1331 }
1332 
1333 
1334 //	#pragma mark - block_cache
1335 
1336 
1337 block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
1338 		bool readOnly)
1339 	:
1340 	hash(NULL),
1341 	fd(_fd),
1342 	max_blocks(numBlocks),
1343 	block_size(blockSize),
1344 	next_transaction_id(1),
1345 	last_transaction(NULL),
1346 	transaction_hash(NULL),
1347 	buffer_cache(NULL),
1348 	unused_block_count(0),
1349 	busy_reading_count(0),
1350 	busy_reading_waiters(false),
1351 	busy_writing_count(0),
1352 	busy_writing_waiters(0),
1353 	num_dirty_blocks(0),
1354 	read_only(readOnly)
1355 {
1356 }
1357 
1358 
1359 /*! Should be called with the cache's lock held. */
1360 block_cache::~block_cache()
1361 {
1362 	unregister_low_resource_handler(&_LowMemoryHandler, this);
1363 
1364 	hash_uninit(transaction_hash);
1365 	hash_uninit(hash);
1366 
1367 	delete_object_cache(buffer_cache);
1368 
1369 	mutex_destroy(&lock);
1370 }
1371 
1372 
1373 status_t
1374 block_cache::Init()
1375 {
1376 	busy_reading_condition.Init(this, "cache block busy_reading");
1377 	busy_writing_condition.Init(this, "cache block busy writing");
1378 	condition_variable.Init(this, "cache transaction sync");
1379 	mutex_init(&lock, "block cache");
1380 
1381 	buffer_cache = create_object_cache_etc("block cache buffers", block_size,
1382 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1383 	if (buffer_cache == NULL)
1384 		return B_NO_MEMORY;
1385 
1386 	cached_block dummyBlock;
1387 	hash = hash_init(1024, offset_of_member(dummyBlock, next),
1388 		&cached_block::Compare, &cached_block::Hash);
1389 	if (hash == NULL)
1390 		return B_NO_MEMORY;
1391 
1392 	cache_transaction dummyTransaction;
1393 	transaction_hash = hash_init(16, offset_of_member(dummyTransaction, next),
1394 		&transaction_compare, &::transaction_hash);
1395 	if (transaction_hash == NULL)
1396 		return B_NO_MEMORY;
1397 
1398 	return register_low_resource_handler(&_LowMemoryHandler, this,
1399 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1400 			| B_KERNEL_RESOURCE_ADDRESS_SPACE, 0);
1401 }
1402 
1403 
1404 void
1405 block_cache::Free(void* buffer)
1406 {
1407 	if (buffer != NULL)
1408 		object_cache_free(buffer_cache, buffer, 0);
1409 }
1410 
1411 
1412 void*
1413 block_cache::Allocate()
1414 {
1415 	void* block = object_cache_alloc(buffer_cache, 0);
1416 	if (block != NULL)
1417 		return block;
1418 
1419 	// recycle existing before allocating a new one
1420 	RemoveUnusedBlocks(100);
1421 
1422 	return object_cache_alloc(buffer_cache, 0);
1423 }
1424 
1425 
1426 void
1427 block_cache::FreeBlock(cached_block* block)
1428 {
1429 	Free(block->current_data);
1430 
1431 	if (block->original_data != NULL || block->parent_data != NULL) {
1432 		panic("block_cache::FreeBlock(): %Ld, original %p, parent %p\n",
1433 			block->block_number, block->original_data, block->parent_data);
1434 	}
1435 
1436 #if BLOCK_CACHE_DEBUG_CHANGED
1437 	Free(block->compare);
1438 #endif
1439 
1440 	object_cache_free(sBlockCache, block, 0);
1441 }
1442 
1443 
1444 /*! Allocates a new block for \a blockNumber, ready for use */
1445 cached_block*
1446 block_cache::NewBlock(off_t blockNumber)
1447 {
1448 	cached_block* block = NULL;
1449 
1450 	if (low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1451 			| B_KERNEL_RESOURCE_ADDRESS_SPACE) != B_NO_LOW_RESOURCE) {
1452 		// recycle existing instead of allocating a new one
1453 		block = _GetUnusedBlock();
1454 	}
1455 	if (block == NULL) {
1456 		block = (cached_block*)object_cache_alloc(sBlockCache, 0);
1457 		if (block != NULL) {
1458 			block->current_data = Allocate();
1459 			if (block->current_data == NULL) {
1460 				object_cache_free(sBlockCache, block, 0);
1461 				return NULL;
1462 			}
1463 		} else {
1464 			TB(Error(this, blockNumber, "allocation failed"));
1465 			dprintf("block allocation failed, unused list is %sempty.\n",
1466 				unused_blocks.IsEmpty() ? "" : "not ");
1467 
1468 			// allocation failed, try to reuse an unused block
1469 			block = _GetUnusedBlock();
1470 			if (block == NULL) {
1471 				TB(Error(this, blockNumber, "get unused failed"));
1472 				FATAL(("could not allocate block!\n"));
1473 				return NULL;
1474 			}
1475 		}
1476 	}
1477 
1478 	block->block_number = blockNumber;
1479 	block->ref_count = 0;
1480 	block->last_accessed = 0;
1481 	block->transaction_next = NULL;
1482 	block->transaction = block->previous_transaction = NULL;
1483 	block->original_data = NULL;
1484 	block->parent_data = NULL;
1485 	block->busy_reading = false;
1486 	block->busy_writing = false;
1487 	block->is_writing = false;
1488 	block->is_dirty = false;
1489 	block->unused = false;
1490 	block->discard = false;
1491 	block->busy_reading_waiters = false;
1492 	block->busy_writing_waiters = false;
1493 #if BLOCK_CACHE_DEBUG_CHANGED
1494 	block->compare = NULL;
1495 #endif
1496 
1497 	return block;
1498 }
1499 
1500 
1501 void
1502 block_cache::FreeBlockParentData(cached_block* block)
1503 {
1504 	ASSERT(block->parent_data != NULL);
1505 	if (block->parent_data != block->current_data)
1506 		Free(block->parent_data);
1507 	block->parent_data = NULL;
1508 }
1509 
1510 
1511 void
1512 block_cache::RemoveUnusedBlocks(int32 count, int32 minSecondsOld)
1513 {
1514 	TRACE(("block_cache: remove up to %" B_PRId32 " unused blocks\n", count));
1515 
1516 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1517 			cached_block* block = iterator.Next();) {
1518 		if (minSecondsOld >= block->LastAccess()) {
1519 			// The list is sorted by last access
1520 			break;
1521 		}
1522 		if (block->busy_reading || block->busy_writing)
1523 			continue;
1524 
1525 		TB(Flush(this, block));
1526 		TRACE(("  remove block %Ld, last accessed %" B_PRId32 "\n",
1527 			block->block_number, block->last_accessed));
1528 
1529 		// this can only happen if no transactions are used
1530 		if (block->is_dirty && !block->discard) {
1531 			if (block->busy_writing)
1532 				continue;
1533 
1534 			BlockWriter::WriteBlock(this, block);
1535 		}
1536 
1537 		// remove block from lists
1538 		iterator.Remove();
1539 		unused_block_count--;
1540 		RemoveBlock(block);
1541 
1542 		if (--count <= 0)
1543 			break;
1544 	}
1545 }
1546 
1547 
1548 void
1549 block_cache::RemoveBlock(cached_block* block)
1550 {
1551 	hash_remove(hash, block);
1552 	FreeBlock(block);
1553 }
1554 
1555 
1556 /*!	Discards the block from a transaction (this method must not be called
1557 	for blocks not part of a transaction).
1558 */
1559 void
1560 block_cache::DiscardBlock(cached_block* block)
1561 {
1562 	ASSERT(block->discard);
1563 	ASSERT(block->previous_transaction == NULL);
1564 
1565 	if (block->parent_data != NULL)
1566 		FreeBlockParentData(block);
1567 
1568 	if (block->original_data != NULL) {
1569 		Free(block->original_data);
1570 		block->original_data = NULL;
1571 	}
1572 
1573 	RemoveBlock(block);
1574 }
1575 
1576 
1577 void
1578 block_cache::_LowMemoryHandler(void* data, uint32 resources, int32 level)
1579 {
1580 	TRACE(("block_cache: low memory handler called with level %ld\n", level));
1581 
1582 	// free some blocks according to the low memory state
1583 	// (if there is enough memory left, we don't free any)
1584 
1585 	block_cache* cache = (block_cache*)data;
1586 	int32 free = 0;
1587 	int32 secondsOld = 0;
1588 	switch (level) {
1589 		case B_NO_LOW_RESOURCE:
1590 			return;
1591 		case B_LOW_RESOURCE_NOTE:
1592 			free = cache->unused_block_count / 8;
1593 			secondsOld = 120;
1594 			break;
1595 		case B_LOW_RESOURCE_WARNING:
1596 			free = cache->unused_block_count / 4;
1597 			secondsOld = 10;
1598 			break;
1599 		case B_LOW_RESOURCE_CRITICAL:
1600 			free = cache->unused_block_count / 2;
1601 			secondsOld = 0;
1602 			break;
1603 	}
1604 
1605 	MutexLocker locker(&cache->lock);
1606 
1607 	if (!locker.IsLocked()) {
1608 		// If our block_cache were deleted, it could be that we had
1609 		// been called before that deletion went through, therefore,
1610 		// acquiring its lock might fail.
1611 		return;
1612 	}
1613 
1614 #ifdef TRACE_BLOCK_CACHE
1615 	uint32 oldUnused = cache->unused_block_count;
1616 #endif
1617 
1618 	cache->RemoveUnusedBlocks(free, secondsOld);
1619 
1620 	TRACE(("block_cache::_LowMemoryHandler(): %p: unused: %lu -> %lu\n", cache,
1621 		oldUnused, cache->unused_block_count));
1622 }
1623 
1624 
1625 cached_block*
1626 block_cache::_GetUnusedBlock()
1627 {
1628 	TRACE(("block_cache: get unused block\n"));
1629 
1630 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1631 			cached_block* block = iterator.Next();) {
1632 		TB(Flush(this, block, true));
1633 		// this can only happen if no transactions are used
1634 		if (block->is_dirty && !block->busy_writing && !block->discard)
1635 			BlockWriter::WriteBlock(this, block);
1636 
1637 		// remove block from lists
1638 		iterator.Remove();
1639 		unused_block_count--;
1640 		hash_remove(hash, block);
1641 
1642 		// TODO: see if parent/compare data is handled correctly here!
1643 		if (block->parent_data != NULL
1644 			&& block->parent_data != block->original_data)
1645 			Free(block->parent_data);
1646 		if (block->original_data != NULL)
1647 			Free(block->original_data);
1648 
1649 #if BLOCK_CACHE_DEBUG_CHANGED
1650 		if (block->compare != NULL)
1651 			Free(block->compare);
1652 #endif
1653 		return block;
1654 	}
1655 
1656 	return NULL;
1657 }
1658 
1659 
1660 //	#pragma mark - private block functions
1661 
1662 
1663 /*!	Cache must be locked.
1664 */
1665 static void
1666 mark_block_busy_reading(block_cache* cache, cached_block* block)
1667 {
1668 	block->busy_reading = true;
1669 	cache->busy_reading_count++;
1670 }
1671 
1672 
1673 /*!	Cache must be locked.
1674 */
1675 static void
1676 mark_block_unbusy_reading(block_cache* cache, cached_block* block)
1677 {
1678 	block->busy_reading = false;
1679 	cache->busy_reading_count--;
1680 
1681 	if ((cache->busy_reading_waiters && cache->busy_reading_count == 0)
1682 		|| block->busy_reading_waiters) {
1683 		cache->busy_reading_waiters = false;
1684 		block->busy_reading_waiters = false;
1685 		cache->busy_reading_condition.NotifyAll();
1686 	}
1687 }
1688 
1689 
1690 /*!	Cache must be locked.
1691 */
1692 static void
1693 wait_for_busy_reading_block(block_cache* cache, cached_block* block)
1694 {
1695 	while (block->busy_reading) {
1696 		// wait for at least the specified block to be read in
1697 		ConditionVariableEntry entry;
1698 		cache->busy_reading_condition.Add(&entry);
1699 		block->busy_reading_waiters = true;
1700 
1701 		mutex_unlock(&cache->lock);
1702 
1703 		entry.Wait();
1704 
1705 		mutex_lock(&cache->lock);
1706 	}
1707 }
1708 
1709 
1710 /*!	Cache must be locked.
1711 */
1712 static void
1713 wait_for_busy_reading_blocks(block_cache* cache)
1714 {
1715 	while (cache->busy_reading_count != 0) {
1716 		// wait for all blocks to be read in
1717 		ConditionVariableEntry entry;
1718 		cache->busy_reading_condition.Add(&entry);
1719 		cache->busy_reading_waiters = true;
1720 
1721 		mutex_unlock(&cache->lock);
1722 
1723 		entry.Wait();
1724 
1725 		mutex_lock(&cache->lock);
1726 	}
1727 }
1728 
1729 
1730 /*!	Cache must be locked.
1731 */
1732 static void
1733 wait_for_busy_writing_block(block_cache* cache, cached_block* block)
1734 {
1735 	while (block->busy_writing) {
1736 		// wait for all blocks to be written back
1737 		ConditionVariableEntry entry;
1738 		cache->busy_writing_condition.Add(&entry);
1739 		block->busy_writing_waiters = true;
1740 
1741 		mutex_unlock(&cache->lock);
1742 
1743 		entry.Wait();
1744 
1745 		mutex_lock(&cache->lock);
1746 	}
1747 }
1748 
1749 
1750 /*!	Cache must be locked.
1751 */
1752 static void
1753 wait_for_busy_writing_blocks(block_cache* cache)
1754 {
1755 	while (cache->busy_writing_count != 0) {
1756 		// wait for all blocks to be written back
1757 		ConditionVariableEntry entry;
1758 		cache->busy_writing_condition.Add(&entry);
1759 		cache->busy_writing_waiters = true;
1760 
1761 		mutex_unlock(&cache->lock);
1762 
1763 		entry.Wait();
1764 
1765 		mutex_lock(&cache->lock);
1766 	}
1767 }
1768 
1769 
1770 /*!	Removes a reference from the specified \a block. If this was the last
1771 	reference, the block is moved into the unused list.
1772 	In low memory situations, it will also free some blocks from that list,
1773 	but not necessarily the \a block it just released.
1774 */
1775 static void
1776 put_cached_block(block_cache* cache, cached_block* block)
1777 {
1778 #if BLOCK_CACHE_DEBUG_CHANGED
1779 	if (!block->is_dirty && block->compare != NULL
1780 		&& memcmp(block->current_data, block->compare, cache->block_size)) {
1781 		dprintf("new block:\n");
1782 		dump_block((const char*)block->current_data, 256, "  ");
1783 		dprintf("unchanged block:\n");
1784 		dump_block((const char*)block->compare, 256, "  ");
1785 		BlockWriter::WriteBlock(cache, block);
1786 		panic("block_cache: supposed to be clean block was changed!\n");
1787 
1788 		cache->Free(block->compare);
1789 		block->compare = NULL;
1790 	}
1791 #endif
1792 	TB(Put(cache, block));
1793 
1794 	if (block->ref_count < 1) {
1795 		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
1796 		return;
1797 	}
1798 
1799 	if (--block->ref_count == 0
1800 		&& block->transaction == NULL && block->previous_transaction == NULL) {
1801 		// This block is not used anymore, and not part of any transaction
1802 		block->is_writing = false;
1803 
1804 		if (block->discard) {
1805 			cache->RemoveBlock(block);
1806 		} else {
1807 			// put this block in the list of unused blocks
1808 			ASSERT(!block->unused);
1809 			block->unused = true;
1810 
1811 			ASSERT(block->original_data == NULL && block->parent_data == NULL);
1812 			cache->unused_blocks.Add(block);
1813 			cache->unused_block_count++;
1814 		}
1815 	}
1816 }
1817 
1818 
1819 static void
1820 put_cached_block(block_cache* cache, off_t blockNumber)
1821 {
1822 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1823 		panic("put_cached_block: invalid block number %lld (max %lld)",
1824 			blockNumber, cache->max_blocks - 1);
1825 	}
1826 
1827 	cached_block* block = (cached_block*)hash_lookup(cache->hash, &blockNumber);
1828 	if (block != NULL)
1829 		put_cached_block(cache, block);
1830 	else {
1831 		TB(Error(cache, blockNumber, "put unknown"));
1832 	}
1833 }
1834 
1835 
1836 /*!	Retrieves the block \a blockNumber from the hash table, if it's already
1837 	there, or reads it from the disk.
1838 	You need to have the cache locked when calling this function.
1839 
1840 	\param _allocated tells you whether or not a new block has been allocated
1841 		to satisfy your request.
1842 	\param readBlock if \c false, the block will not be read in case it was
1843 		not already in the cache. The block you retrieve may contain random
1844 		data. If \c true, the cache will be temporarily unlocked while the
1845 		block is read in.
1846 */
1847 static cached_block*
1848 get_cached_block(block_cache* cache, off_t blockNumber, bool* _allocated,
1849 	bool readBlock = true)
1850 {
1851 	ASSERT_LOCKED_MUTEX(&cache->lock);
1852 
1853 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1854 		panic("get_cached_block: invalid block number %lld (max %lld)",
1855 			blockNumber, cache->max_blocks - 1);
1856 		return NULL;
1857 	}
1858 
1859 retry:
1860 	cached_block* block = (cached_block*)hash_lookup(cache->hash,
1861 		&blockNumber);
1862 	*_allocated = false;
1863 
1864 	if (block == NULL) {
1865 		// put block into cache
1866 		block = cache->NewBlock(blockNumber);
1867 		if (block == NULL)
1868 			return NULL;
1869 
1870 		hash_insert_grow(cache->hash, block);
1871 		*_allocated = true;
1872 	} else if (block->busy_reading) {
1873 		// The block is currently busy_reading - wait and try again later
1874 		wait_for_busy_reading_block(cache, block);
1875 		goto retry;
1876 	}
1877 
1878 	if (block->unused) {
1879 		//TRACE(("remove block %Ld from unused\n", blockNumber));
1880 		block->unused = false;
1881 		cache->unused_blocks.Remove(block);
1882 		cache->unused_block_count--;
1883 	}
1884 
1885 	if (*_allocated && readBlock) {
1886 		// read block into cache
1887 		int32 blockSize = cache->block_size;
1888 
1889 		mark_block_busy_reading(cache, block);
1890 		mutex_unlock(&cache->lock);
1891 
1892 		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
1893 			block->current_data, blockSize);
1894 
1895 		mutex_lock(&cache->lock);
1896 		if (bytesRead < blockSize) {
1897 			cache->RemoveBlock(block);
1898 			TB(Error(cache, blockNumber, "read failed", bytesRead));
1899 
1900 			FATAL(("could not read block %Ld: bytesRead: %ld, error: %s\n",
1901 				blockNumber, bytesRead, strerror(errno)));
1902 			return NULL;
1903 		}
1904 		TB(Read(cache, block));
1905 
1906 		mark_block_unbusy_reading(cache, block);
1907 	}
1908 
1909 	block->ref_count++;
1910 	block->last_accessed = system_time() / 1000000L;
1911 
1912 	return block;
1913 }
1914 
1915 
1916 /*!	Returns the writable block data for the requested blockNumber.
1917 	If \a cleared is true, the block is not read from disk; an empty block
1918 	is returned.
1919 
1920 	This is the only method to insert a block into a transaction. It makes
1921 	sure that the previous block contents are preserved in that case.
1922 */
1923 static void*
1924 get_writable_cached_block(block_cache* cache, off_t blockNumber, off_t base,
1925 	off_t length, int32 transactionID, bool cleared)
1926 {
1927 	TRACE(("get_writable_cached_block(blockNumber = %Ld, transaction = %ld)\n",
1928 		blockNumber, transactionID));
1929 
1930 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1931 		panic("get_writable_cached_block: invalid block number %lld (max %lld)",
1932 			blockNumber, cache->max_blocks - 1);
1933 	}
1934 
1935 	bool allocated;
1936 	cached_block* block = get_cached_block(cache, blockNumber, &allocated,
1937 		!cleared);
1938 	if (block == NULL)
1939 		return NULL;
1940 
1941 	if (block->busy_writing)
1942 		wait_for_busy_writing_block(cache, block);
1943 
1944 	block->discard = false;
1945 
1946 	// if there is no transaction support, we just return the current block
1947 	if (transactionID == -1) {
1948 		if (cleared) {
1949 			mark_block_busy_reading(cache, block);
1950 			mutex_unlock(&cache->lock);
1951 
1952 			memset(block->current_data, 0, cache->block_size);
1953 
1954 			mutex_lock(&cache->lock);
1955 			mark_block_unbusy_reading(cache, block);
1956 		}
1957 
1958 		block->is_writing = true;
1959 
1960 		if (!block->is_dirty) {
1961 			cache->num_dirty_blocks++;
1962 			block->is_dirty = true;
1963 				// mark the block as dirty
1964 		}
1965 
1966 		TB(Get(cache, block));
1967 		return block->current_data;
1968 	}
1969 
1970 	cache_transaction* transaction = block->transaction;
1971 
1972 	if (transaction != NULL && transaction->id != transactionID) {
1973 		// TODO: we have to wait here until the other transaction is done.
1974 		//	Maybe we should even panic, since we can't prevent any deadlocks.
1975 		panic("get_writable_cached_block(): asked to get busy writable block "
1976 			"(transaction %ld)\n", block->transaction->id);
1977 		put_cached_block(cache, block);
1978 		return NULL;
1979 	}
1980 	if (transaction == NULL && transactionID != -1) {
1981 		// get new transaction
1982 		transaction = lookup_transaction(cache, transactionID);
1983 		if (transaction == NULL) {
1984 			panic("get_writable_cached_block(): invalid transaction %ld!\n",
1985 				transactionID);
1986 			put_cached_block(cache, block);
1987 			return NULL;
1988 		}
1989 		if (!transaction->open) {
1990 			panic("get_writable_cached_block(): transaction already done!\n");
1991 			put_cached_block(cache, block);
1992 			return NULL;
1993 		}
1994 
1995 		block->transaction = transaction;
1996 
1997 		// attach the block to the transaction block list
1998 		block->transaction_next = transaction->first_block;
1999 		transaction->first_block = block;
2000 		transaction->num_blocks++;
2001 	}
2002 	if (transaction != NULL)
2003 		transaction->last_used = system_time();
2004 
2005 	bool wasUnchanged = block->original_data == NULL
2006 		|| block->previous_transaction != NULL;
2007 
2008 	if (!(allocated && cleared) && block->original_data == NULL) {
2009 		// we already have data, so we need to preserve it
2010 		block->original_data = cache->Allocate();
2011 		if (block->original_data == NULL) {
2012 			TB(Error(cache, blockNumber, "allocate original failed"));
2013 			FATAL(("could not allocate original_data\n"));
2014 			put_cached_block(cache, block);
2015 			return NULL;
2016 		}
2017 
2018 		mark_block_busy_reading(cache, block);
2019 		mutex_unlock(&cache->lock);
2020 
2021 		memcpy(block->original_data, block->current_data, cache->block_size);
2022 
2023 		mutex_lock(&cache->lock);
2024 		mark_block_unbusy_reading(cache, block);
2025 	}
2026 	if (block->parent_data == block->current_data) {
2027 		// remember any previous contents for the parent transaction
2028 		block->parent_data = cache->Allocate();
2029 		if (block->parent_data == NULL) {
2030 			// TODO: maybe we should just continue the current transaction in
2031 			// this case...
2032 			TB(Error(cache, blockNumber, "allocate parent failed"));
2033 			FATAL(("could not allocate parent\n"));
2034 			put_cached_block(cache, block);
2035 			return NULL;
2036 		}
2037 
2038 		mark_block_busy_reading(cache, block);
2039 		mutex_unlock(&cache->lock);
2040 
2041 		memcpy(block->parent_data, block->current_data, cache->block_size);
2042 
2043 		mutex_lock(&cache->lock);
2044 		mark_block_unbusy_reading(cache, block);
2045 
2046 		transaction->sub_num_blocks++;
2047 	} else if (transaction != NULL && transaction->has_sub_transaction
2048 		&& block->parent_data == NULL && wasUnchanged)
2049 		transaction->sub_num_blocks++;
2050 
2051 	if (cleared) {
2052 		mark_block_busy_reading(cache, block);
2053 		mutex_unlock(&cache->lock);
2054 
2055 		memset(block->current_data, 0, cache->block_size);
2056 
2057 		mutex_lock(&cache->lock);
2058 		mark_block_unbusy_reading(cache, block);
2059 	}
2060 
2061 	block->is_dirty = true;
2062 	TB(Get(cache, block));
2063 	TB2(BlockData(cache, block, "get writable"));
2064 
2065 	return block->current_data;
2066 }
2067 
2068 
2069 #if DEBUG_BLOCK_CACHE
2070 
2071 
2072 static void
2073 dump_block(cached_block* block)
2074 {
2075 	kprintf("%08lx %9Ld %08lx %08lx %08lx %5ld %6ld %c%c%c%c%c%c %08lx %08lx\n",
2076 		(addr_t)block, block->block_number,
2077 		(addr_t)block->current_data, (addr_t)block->original_data,
2078 		(addr_t)block->parent_data, block->ref_count, block->LastAccess(),
2079 		block->busy_reading ? 'r' : '-', block->busy_writing ? 'w' : '-',
2080 		block->is_writing ? 'W' : '-', block->is_dirty ? 'D' : '-',
2081 		block->unused ? 'U' : '-', block->discard ? 'D' : '-',
2082 		(addr_t)block->transaction,
2083 		(addr_t)block->previous_transaction);
2084 }
2085 
2086 
2087 static void
2088 dump_block_long(cached_block* block)
2089 {
2090 	kprintf("BLOCK %p\n", block);
2091 	kprintf(" current data:  %p\n", block->current_data);
2092 	kprintf(" original data: %p\n", block->original_data);
2093 	kprintf(" parent data:   %p\n", block->parent_data);
2094 #if BLOCK_CACHE_DEBUG_CHANGED
2095 	kprintf(" compare data:  %p\n", block->compare);
2096 #endif
2097 	kprintf(" ref_count:     %ld\n", block->ref_count);
2098 	kprintf(" accessed:      %ld\n", block->LastAccess());
2099 	kprintf(" flags:        ");
2100 	if (block->busy_reading)
2101 		kprintf(" busy_reading");
2102 	if (block->busy_writing)
2103 		kprintf(" busy_writing");
2104 	if (block->is_writing)
2105 		kprintf(" is-writing");
2106 	if (block->is_dirty)
2107 		kprintf(" is-dirty");
2108 	if (block->unused)
2109 		kprintf(" unused");
2110 	if (block->discard)
2111 		kprintf(" discard");
2112 	kprintf("\n");
2113 	if (block->transaction != NULL) {
2114 		kprintf(" transaction:   %p (%ld)\n", block->transaction,
2115 			block->transaction->id);
2116 		if (block->transaction_next != NULL) {
2117 			kprintf(" next in transaction: %Ld\n",
2118 				block->transaction_next->block_number);
2119 		}
2120 	}
2121 	if (block->previous_transaction != NULL) {
2122 		kprintf(" previous transaction: %p (%ld)\n",
2123 			block->previous_transaction,
2124 			block->previous_transaction->id);
2125 	}
2126 
2127 	set_debug_variable("_current", (addr_t)block->current_data);
2128 	set_debug_variable("_original", (addr_t)block->original_data);
2129 	set_debug_variable("_parent", (addr_t)block->parent_data);
2130 }
2131 
2132 
2133 static int
2134 dump_cached_block(int argc, char** argv)
2135 {
2136 	if (argc != 2) {
2137 		kprintf("usage: %s <block-address>\n", argv[0]);
2138 		return 0;
2139 	}
2140 
2141 	dump_block_long((struct cached_block*)(addr_t)parse_expression(argv[1]));
2142 	return 0;
2143 }
2144 
2145 
2146 static int
2147 dump_cache(int argc, char** argv)
2148 {
2149 	bool showTransactions = false;
2150 	bool showBlocks = false;
2151 	int32 i = 1;
2152 	while (argv[i] != NULL && argv[i][0] == '-') {
2153 		for (char* arg = &argv[i][1]; arg[0]; arg++) {
2154 			switch (arg[0]) {
2155 				case 'b':
2156 					showBlocks = true;
2157 					break;
2158 				case 't':
2159 					showTransactions = true;
2160 					break;
2161 				default:
2162 					print_debugger_command_usage(argv[0]);
2163 					return 0;
2164 			}
2165 		}
2166 		i++;
2167 	}
2168 
2169 	if (i >= argc) {
2170 		print_debugger_command_usage(argv[0]);
2171 		return 0;
2172 	}
2173 
2174 	block_cache* cache = (struct block_cache*)(addr_t)parse_expression(argv[i]);
2175 	if (cache == NULL) {
2176 		kprintf("invalid cache address\n");
2177 		return 0;
2178 	}
2179 
2180 	off_t blockNumber = -1;
2181 	if (i + 1 < argc) {
2182 		blockNumber = parse_expression(argv[i + 1]);
2183 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
2184 			&blockNumber);
2185 		if (block != NULL)
2186 			dump_block_long(block);
2187 		else
2188 			kprintf("block %Ld not found\n", blockNumber);
2189 		return 0;
2190 	}
2191 
2192 	kprintf("BLOCK CACHE: %p\n", cache);
2193 
2194 	kprintf(" fd:           %d\n", cache->fd);
2195 	kprintf(" max_blocks:   %Ld\n", cache->max_blocks);
2196 	kprintf(" block_size:   %lu\n", cache->block_size);
2197 	kprintf(" next_transaction_id: %ld\n", cache->next_transaction_id);
2198 	kprintf(" buffer_cache: %p\n", cache->buffer_cache);
2199 	kprintf(" busy_reading: %lu, %s waiters\n", cache->busy_reading_count,
2200 		cache->busy_reading_waiters ? "has" : "no");
2201 	kprintf(" busy_writing: %lu, %s waiters\n", cache->busy_writing_count,
2202 		cache->busy_writing_waiters ? "has" : "no");
2203 
2204 	if (!cache->pending_notifications.IsEmpty()) {
2205 		kprintf(" pending notifications:\n");
2206 
2207 		NotificationList::Iterator iterator
2208 			= cache->pending_notifications.GetIterator();
2209 		while (iterator.HasNext()) {
2210 			cache_notification* notification = iterator.Next();
2211 
2212 			kprintf("  %p %5lx %p - %p\n", notification,
2213 				notification->events_pending, notification->hook,
2214 				notification->data);
2215 		}
2216 	}
2217 
2218 	if (showTransactions) {
2219 		kprintf(" transactions:\n");
2220 		kprintf("address       id state  blocks  main   sub\n");
2221 
2222 		hash_iterator iterator;
2223 		hash_open(cache->transaction_hash, &iterator);
2224 
2225 		cache_transaction* transaction;
2226 		while ((transaction = (cache_transaction*)hash_next(
2227 				cache->transaction_hash, &iterator)) != NULL) {
2228 			kprintf("%p %5ld %-7s %5ld %5ld %5ld\n", transaction,
2229 				transaction->id, transaction->open ? "open" : "closed",
2230 				transaction->num_blocks, transaction->main_num_blocks,
2231 				transaction->sub_num_blocks);
2232 		}
2233 	}
2234 
2235 	if (showBlocks) {
2236 		kprintf(" blocks:\n");
2237 		kprintf("address  block no. current  original parent    refs access "
2238 			"flags transact prev. trans\n");
2239 	}
2240 
2241 	uint32 referenced = 0;
2242 	uint32 count = 0;
2243 	uint32 dirty = 0;
2244 	uint32 discarded = 0;
2245 	hash_iterator iterator;
2246 	hash_open(cache->hash, &iterator);
2247 	cached_block* block;
2248 	while ((block = (cached_block*)hash_next(cache->hash, &iterator)) != NULL) {
2249 		if (showBlocks)
2250 			dump_block(block);
2251 
2252 		if (block->is_dirty)
2253 			dirty++;
2254 		if (block->discard)
2255 			discarded++;
2256 		if (block->ref_count)
2257 			referenced++;
2258 		count++;
2259 	}
2260 
2261 	kprintf(" %ld blocks total, %ld dirty, %ld discarded, %ld referenced, %ld "
2262 		"busy, %" B_PRIu32 " in unused.\n", count, dirty, discarded, referenced,
2263 		cache->busy_reading_count, cache->unused_block_count);
2264 
2265 	hash_close(cache->hash, &iterator, false);
2266 	return 0;
2267 }
2268 
2269 
2270 static int
2271 dump_transaction(int argc, char** argv)
2272 {
2273 	bool showBlocks = false;
2274 	int i = 1;
2275 	if (argc > 1 && !strcmp(argv[1], "-b")) {
2276 		showBlocks = true;
2277 		i++;
2278 	}
2279 
2280 	if (argc - i < 1 || argc - i > 2) {
2281 		print_debugger_command_usage(argv[0]);
2282 		return 0;
2283 	}
2284 
2285 	cache_transaction* transaction = NULL;
2286 
2287 	if (argc - i == 1) {
2288 		transaction = (cache_transaction*)(addr_t)parse_expression(argv[i]);
2289 	} else {
2290 		block_cache* cache = (block_cache*)(addr_t)parse_expression(argv[i]);
2291 		int32 id = parse_expression(argv[i + 1]);
2292 		transaction = lookup_transaction(cache, id);
2293 		if (transaction == NULL) {
2294 			kprintf("No transaction with ID %ld found.\n", id);
2295 			return 0;
2296 		}
2297 	}
2298 
2299 	kprintf("TRANSACTION %p\n", transaction);
2300 
2301 	kprintf(" id:             %ld\n", transaction->id);
2302 	kprintf(" num block:      %ld\n", transaction->num_blocks);
2303 	kprintf(" main num block: %ld\n", transaction->main_num_blocks);
2304 	kprintf(" sub num block:  %ld\n", transaction->sub_num_blocks);
2305 	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
2306 	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
2307 	kprintf(" idle:           %Ld secs\n",
2308 		(system_time() - transaction->last_used) / 1000000);
2309 
2310 	kprintf(" listeners:\n");
2311 
2312 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
2313 	while (iterator.HasNext()) {
2314 		cache_listener* listener = iterator.Next();
2315 
2316 		kprintf("  %p %5lx %p - %p\n", listener, listener->events_pending,
2317 			listener->hook, listener->data);
2318 	}
2319 
2320 	if (!showBlocks)
2321 		return 0;
2322 
2323 	kprintf(" blocks:\n");
2324 	kprintf("address  block no. current  original parent    refs access "
2325 		"flags transact prev. trans\n");
2326 
2327 	cached_block* block = transaction->first_block;
2328 	while (block != NULL) {
2329 		dump_block(block);
2330 		block = block->transaction_next;
2331 	}
2332 
2333 	kprintf("--\n");
2334 
2335 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
2336 	while (blockIterator.HasNext()) {
2337 		block = blockIterator.Next();
2338 		dump_block(block);
2339 	}
2340 
2341 	return 0;
2342 }
2343 
2344 
2345 static int
2346 dump_caches(int argc, char** argv)
2347 {
2348 	kprintf("Block caches:\n");
2349 	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
2350 	while (i.HasNext()) {
2351 		block_cache* cache = i.Next();
2352 		if (cache == (block_cache*)&sMarkCache)
2353 			continue;
2354 
2355 		kprintf("  %p\n", cache);
2356 	}
2357 
2358 	return 0;
2359 }
2360 
2361 
2362 #if BLOCK_CACHE_BLOCK_TRACING >= 2
2363 static int
2364 dump_block_data(int argc, char** argv)
2365 {
2366 	using namespace BlockTracing;
2367 
2368 	// Determine which blocks to show
2369 
2370 	bool printStackTrace = true;
2371 	uint32 which = 0;
2372 	int32 i = 1;
2373 	while (i < argc && argv[i][0] == '-') {
2374 		char* arg = &argv[i][1];
2375 		while (arg[0]) {
2376 			switch (arg[0]) {
2377 				case 'c':
2378 					which |= BlockData::kCurrent;
2379 					break;
2380 				case 'p':
2381 					which |= BlockData::kParent;
2382 					break;
2383 				case 'o':
2384 					which |= BlockData::kOriginal;
2385 					break;
2386 
2387 				default:
2388 					kprintf("invalid block specifier (only o/c/p are "
2389 						"allowed).\n");
2390 					return 0;
2391 			}
2392 			arg++;
2393 		}
2394 
2395 		i++;
2396 	}
2397 	if (which == 0)
2398 		which = BlockData::kCurrent | BlockData::kParent | BlockData::kOriginal;
2399 
2400 	if (i == argc) {
2401 		print_debugger_command_usage(argv[0]);
2402 		return 0;
2403 	}
2404 
2405 	// Get the range of blocks to print
2406 
2407 	int64 from = parse_expression(argv[i]);
2408 	int64 to = from;
2409 	if (argc > i + 1)
2410 		to = parse_expression(argv[i + 1]);
2411 	if (to < from)
2412 		to = from;
2413 
2414 	uint32 offset = 0;
2415 	uint32 size = LONG_MAX;
2416 	if (argc > i + 2)
2417 		offset = parse_expression(argv[i + 2]);
2418 	if (argc > i + 3)
2419 		size = parse_expression(argv[i + 3]);
2420 
2421 	TraceEntryIterator iterator;
2422 	iterator.MoveTo(from - 1);
2423 
2424 	static char sBuffer[1024];
2425 	LazyTraceOutput out(sBuffer, sizeof(sBuffer), TRACE_OUTPUT_TEAM_ID);
2426 
2427 	while (TraceEntry* entry = iterator.Next()) {
2428 		int32 index = iterator.Index();
2429 		if (index > to)
2430 			break;
2431 
2432 		Action* action = dynamic_cast<Action*>(entry);
2433 		if (action != NULL) {
2434 			out.Clear();
2435 			out.DumpEntry(action);
2436 			continue;
2437 		}
2438 
2439 		BlockData* blockData = dynamic_cast<BlockData*>(entry);
2440 		if (blockData == NULL)
2441 			continue;
2442 
2443 		out.Clear();
2444 
2445 		const char* dump = out.DumpEntry(entry);
2446 		int length = strlen(dump);
2447 		if (length > 0 && dump[length - 1] == '\n')
2448 			length--;
2449 
2450 		kprintf("%5ld. %.*s\n", index, length, dump);
2451 
2452 		if (printStackTrace) {
2453 			out.Clear();
2454 			entry->DumpStackTrace(out);
2455 			if (out.Size() > 0)
2456 				kputs(out.Buffer());
2457 		}
2458 
2459 		blockData->DumpBlocks(which, offset, size);
2460 	}
2461 
2462 	return 0;
2463 }
2464 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
2465 
2466 
2467 #endif	// DEBUG_BLOCK_CACHE
2468 
2469 
2470 /*!	Traverses through the block_cache list, and returns one cache after the
2471 	other. The cache returned is automatically locked when you get it, and
2472 	unlocked with the next call to this function. Ignores caches that are in
2473 	deletion state.
2474 	Returns \c NULL when the end of the list is reached.
2475 */
2476 static block_cache*
2477 get_next_locked_block_cache(block_cache* last)
2478 {
2479 	MutexLocker _(sCachesLock);
2480 
2481 	block_cache* cache;
2482 	if (last != NULL) {
2483 		mutex_unlock(&last->lock);
2484 
2485 		cache = sCaches.GetNext((block_cache*)&sMarkCache);
2486 		sCaches.Remove((block_cache*)&sMarkCache);
2487 	} else
2488 		cache = sCaches.Head();
2489 
2490 	if (cache != NULL) {
2491 		mutex_lock(&cache->lock);
2492 		sCaches.Insert(sCaches.GetNext(cache), (block_cache*)&sMarkCache);
2493 	}
2494 
2495 	return cache;
2496 }
2497 
2498 
2499 /*!	Background thread that continuously checks for pending notifications of
2500 	all caches.
2501 	Every two seconds, it will also write back up to 64 blocks per cache.
2502 */
2503 static status_t
2504 block_notifier_and_writer(void* /*data*/)
2505 {
2506 	const bigtime_t kTimeout = 2000000LL;
2507 	bigtime_t timeout = kTimeout;
2508 
2509 	while (true) {
2510 		bigtime_t start = system_time();
2511 
2512 		status_t status = acquire_sem_etc(sEventSemaphore, 1,
2513 			B_RELATIVE_TIMEOUT, timeout);
2514 		if (status == B_OK) {
2515 			flush_pending_notifications();
2516 			timeout -= system_time() - start;
2517 			continue;
2518 		}
2519 
2520 		// write 64 blocks of each block_cache every two seconds
2521 		// TODO: change this once we have an I/O scheduler
2522 		timeout = kTimeout;
2523 		size_t usedMemory;
2524 		object_cache_get_usage(sBlockCache, &usedMemory);
2525 
2526 		block_cache* cache = NULL;
2527 		while ((cache = get_next_locked_block_cache(cache)) != NULL) {
2528 			BlockWriter writer(cache, 64);
2529 
2530 			size_t cacheUsedMemory;
2531 			object_cache_get_usage(cache->buffer_cache, &cacheUsedMemory);
2532 			usedMemory += cacheUsedMemory;
2533 
2534 			if (cache->num_dirty_blocks) {
2535 				// This cache is not using transactions, we'll scan the blocks
2536 				// directly
2537 				hash_iterator iterator;
2538 				hash_open(cache->hash, &iterator);
2539 
2540 				cached_block* block;
2541 				while ((block = (cached_block*)hash_next(cache->hash, &iterator))
2542 						!= NULL) {
2543 					if (block->CanBeWritten() && !writer.Add(block))
2544 						break;
2545 				}
2546 
2547 				hash_close(cache->hash, &iterator, false);
2548 			} else {
2549 				hash_iterator iterator;
2550 				hash_open(cache->transaction_hash, &iterator);
2551 
2552 				cache_transaction* transaction;
2553 				while ((transaction = (cache_transaction*)hash_next(
2554 						cache->transaction_hash, &iterator)) != NULL) {
2555 					if (transaction->open) {
2556 						if (system_time() > transaction->last_used
2557 								+ kTransactionIdleTime) {
2558 							// Transaction is open but idle
2559 							notify_transaction_listeners(cache, transaction,
2560 								TRANSACTION_IDLE);
2561 						}
2562 						continue;
2563 					}
2564 
2565 					bool hasLeftOvers;
2566 						// we ignore this one
2567 					if (!writer.Add(transaction, &iterator, hasLeftOvers))
2568 						break;
2569 				}
2570 
2571 				hash_close(cache->transaction_hash, &iterator, false);
2572 			}
2573 
2574 			writer.Write();
2575 
2576 			if ((block_cache_used_memory() / B_PAGE_SIZE)
2577 					> vm_page_num_pages() / 2) {
2578 				// Try to reduce memory usage to half of the available
2579 				// RAM at maximum
2580 				cache->RemoveUnusedBlocks(1000, 10);
2581 			}
2582 		}
2583 
2584 		MutexLocker _(sCachesMemoryUseLock);
2585 		sUsedMemory = usedMemory;
2586 	}
2587 
2588 	// never can get here
2589 	return B_OK;
2590 }
2591 
2592 
2593 /*!	Notify function for wait_for_notifications(). */
2594 static void
2595 notify_sync(int32 transactionID, int32 event, void* _cache)
2596 {
2597 	block_cache* cache = (block_cache*)_cache;
2598 
2599 	cache->condition_variable.NotifyOne();
2600 }
2601 
2602 
2603 /*!	Must be called with the sCachesLock held. */
2604 static bool
2605 is_valid_cache(block_cache* cache)
2606 {
2607 	ASSERT_LOCKED_MUTEX(&sCachesLock);
2608 
2609 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
2610 	while (iterator.HasNext()) {
2611 		if (cache == iterator.Next())
2612 			return true;
2613 	}
2614 
2615 	return false;
2616 }
2617 
2618 
2619 /*!	Waits until all pending notifications are carried out.
2620 	Safe to be called from the block writer/notifier thread.
2621 	You must not hold the \a cache lock when calling this function.
2622 */
2623 static void
2624 wait_for_notifications(block_cache* cache)
2625 {
2626 	MutexLocker locker(sCachesLock);
2627 
2628 	if (find_thread(NULL) == sNotifierWriterThread) {
2629 		// We're the notifier thread, don't wait, but flush all pending
2630 		// notifications directly.
2631 		if (is_valid_cache(cache))
2632 			flush_pending_notifications(cache);
2633 		return;
2634 	}
2635 
2636 	// add sync notification
2637 	cache_notification notification;
2638 	set_notification(NULL, notification, TRANSACTION_WRITTEN, notify_sync,
2639 		cache);
2640 
2641 	ConditionVariableEntry entry;
2642 	cache->condition_variable.Add(&entry);
2643 
2644 	add_notification(cache, &notification, TRANSACTION_WRITTEN, false);
2645 	locker.Unlock();
2646 
2647 	// wait for notification hook to be called
2648 	entry.Wait();
2649 }
2650 
2651 
2652 status_t
2653 block_cache_init(void)
2654 {
2655 	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
2656 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
2657 	if (sBlockCache == NULL)
2658 		return B_NO_MEMORY;
2659 
2660 	new (&sCaches) DoublyLinkedList<block_cache>;
2661 		// manually call constructor
2662 
2663 	sEventSemaphore = create_sem(0, "block cache event");
2664 	if (sEventSemaphore < B_OK)
2665 		return sEventSemaphore;
2666 
2667 	sNotifierWriterThread = spawn_kernel_thread(&block_notifier_and_writer,
2668 		"block notifier/writer", B_LOW_PRIORITY, NULL);
2669 	if (sNotifierWriterThread >= B_OK)
2670 		resume_thread(sNotifierWriterThread);
2671 
2672 #if DEBUG_BLOCK_CACHE
2673 	add_debugger_command_etc("block_caches", &dump_caches,
2674 		"dumps all block caches", "\n", 0);
2675 	add_debugger_command_etc("block_cache", &dump_cache,
2676 		"dumps a specific block cache",
2677 		"[-bt] <cache-address> [block-number]\n"
2678 		"  -t lists the transactions\n"
2679 		"  -b lists all blocks\n", 0);
2680 	add_debugger_command("cached_block", &dump_cached_block,
2681 		"dumps the specified cached block");
2682 	add_debugger_command_etc("transaction", &dump_transaction,
2683 		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
2684 		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
2685 		"  -b lists all blocks that are part of this transaction\n", 0);
2686 #	if BLOCK_CACHE_BLOCK_TRACING >= 2
2687 	add_debugger_command_etc("block_cache_data", &dump_block_data,
2688 		"dumps the data blocks logged for the actions",
2689 		"[-cpo] <from> [<to> [<offset> [<size>]]]\n"
2690 		"If no data specifier is used, all blocks are shown by default.\n"
2691 		" -c       the current data is shown, if available.\n"
2692 		" -p       the parent data is shown, if available.\n"
2693 		" -o       the original data is shown, if available.\n"
2694 		" <from>   first index of tracing entries to show.\n"
2695 		" <to>     if given, the last entry. If not, only <from> is shown.\n"
2696 		" <offset> the offset of the block data.\n"
2697 		" <from>   the size of the block data that is dumped\n", 0);
2698 #	endif
2699 #endif	// DEBUG_BLOCK_CACHE
2700 
2701 	return B_OK;
2702 }
2703 
2704 
2705 size_t
2706 block_cache_used_memory(void)
2707 {
2708 	MutexLocker _(sCachesMemoryUseLock);
2709 	return sUsedMemory;
2710 }
2711 
2712 
2713 //	#pragma mark - public transaction API
2714 
2715 
2716 int32
2717 cache_start_transaction(void* _cache)
2718 {
2719 	block_cache* cache = (block_cache*)_cache;
2720 	TransactionLocker locker(cache);
2721 
2722 	if (cache->last_transaction && cache->last_transaction->open) {
2723 		panic("last transaction (%ld) still open!\n",
2724 			cache->last_transaction->id);
2725 	}
2726 
2727 	cache_transaction* transaction = new(std::nothrow) cache_transaction;
2728 	if (transaction == NULL)
2729 		return B_NO_MEMORY;
2730 
2731 	transaction->id = atomic_add(&cache->next_transaction_id, 1);
2732 	cache->last_transaction = transaction;
2733 
2734 	TRACE(("cache_start_transaction(): id %ld started\n", transaction->id));
2735 	T(Action("start", cache, transaction));
2736 
2737 	hash_insert_grow(cache->transaction_hash, transaction);
2738 
2739 	return transaction->id;
2740 }
2741 
2742 
2743 status_t
2744 cache_sync_transaction(void* _cache, int32 id)
2745 {
2746 	block_cache* cache = (block_cache*)_cache;
2747 	bool hadBusy;
2748 
2749 	TRACE(("cache_sync_transaction(id %ld)\n", id));
2750 
2751 	do {
2752 		TransactionLocker locker(cache);
2753 		hadBusy = false;
2754 
2755 		BlockWriter writer(cache);
2756 		hash_iterator iterator;
2757 		hash_open(cache->transaction_hash, &iterator);
2758 
2759 		cache_transaction* transaction;
2760 		while ((transaction = (cache_transaction*)hash_next(
2761 				cache->transaction_hash, &iterator)) != NULL) {
2762 			// close all earlier transactions which haven't been closed yet
2763 
2764 			if (transaction->busy_writing_count != 0) {
2765 				hadBusy = true;
2766 				continue;
2767 			}
2768 			if (transaction->id <= id && !transaction->open) {
2769 				// write back all of their remaining dirty blocks
2770 				T(Action("sync", cache, transaction));
2771 
2772 				bool hasLeftOvers;
2773 				writer.Add(transaction, &iterator, hasLeftOvers);
2774 
2775 				if (hasLeftOvers) {
2776 					// This transaction contains blocks that a previous
2777 					// transaction is trying to write back in this write run
2778 					hadBusy = true;
2779 				}
2780 			}
2781 		}
2782 
2783 		hash_close(cache->transaction_hash, &iterator, false);
2784 
2785 		status_t status = writer.Write();
2786 		if (status != B_OK)
2787 			return status;
2788 	} while (hadBusy);
2789 
2790 	wait_for_notifications(cache);
2791 		// make sure that all pending TRANSACTION_WRITTEN notifications
2792 		// are handled after we return
2793 	return B_OK;
2794 }
2795 
2796 
2797 status_t
2798 cache_end_transaction(void* _cache, int32 id,
2799 	transaction_notification_hook hook, void* data)
2800 {
2801 	block_cache* cache = (block_cache*)_cache;
2802 	TransactionLocker locker(cache);
2803 
2804 	TRACE(("cache_end_transaction(id = %ld)\n", id));
2805 
2806 	cache_transaction* transaction = lookup_transaction(cache, id);
2807 	if (transaction == NULL) {
2808 		panic("cache_end_transaction(): invalid transaction ID\n");
2809 		return B_BAD_VALUE;
2810 	}
2811 
2812 	// Write back all pending transaction blocks
2813 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
2814 	if (status != B_OK)
2815 		return status;
2816 
2817 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2818 
2819 	if (hook != NULL
2820 		&& add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN,
2821 			hook, data) != B_OK) {
2822 		return B_NO_MEMORY;
2823 	}
2824 
2825 	T(Action("end", cache, transaction));
2826 
2827 	// iterate through all blocks and free the unchanged original contents
2828 
2829 	cached_block* next;
2830 	for (cached_block* block = transaction->first_block; block != NULL;
2831 			block = next) {
2832 		next = block->transaction_next;
2833 		ASSERT(block->previous_transaction == NULL);
2834 
2835 		if (block->discard) {
2836 			// This block has been discarded in the transaction
2837 			cache->DiscardBlock(block);
2838 			transaction->num_blocks--;
2839 			continue;
2840 		}
2841 
2842 		if (block->original_data != NULL) {
2843 			cache->Free(block->original_data);
2844 			block->original_data = NULL;
2845 		}
2846 		if (block->parent_data != NULL) {
2847 			ASSERT(transaction->has_sub_transaction);
2848 			cache->FreeBlockParentData(block);
2849 		}
2850 
2851 		// move the block to the previous transaction list
2852 		transaction->blocks.Add(block);
2853 
2854 		block->previous_transaction = transaction;
2855 		block->transaction_next = NULL;
2856 		block->transaction = NULL;
2857 	}
2858 
2859 	transaction->open = false;
2860 	return B_OK;
2861 }
2862 
2863 
2864 status_t
2865 cache_abort_transaction(void* _cache, int32 id)
2866 {
2867 	block_cache* cache = (block_cache*)_cache;
2868 	TransactionLocker locker(cache);
2869 
2870 	TRACE(("cache_abort_transaction(id = %ld)\n", id));
2871 
2872 	cache_transaction* transaction = lookup_transaction(cache, id);
2873 	if (transaction == NULL) {
2874 		panic("cache_abort_transaction(): invalid transaction ID\n");
2875 		return B_BAD_VALUE;
2876 	}
2877 
2878 	T(Abort(cache, transaction));
2879 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
2880 
2881 	// iterate through all blocks and restore their original contents
2882 
2883 	cached_block* block = transaction->first_block;
2884 	cached_block* next;
2885 	for (; block != NULL; block = next) {
2886 		next = block->transaction_next;
2887 
2888 		if (block->original_data != NULL) {
2889 			TRACE(("cache_abort_transaction(id = %ld): restored contents of "
2890 				"block %Ld\n", transaction->id, block->block_number));
2891 			memcpy(block->current_data, block->original_data,
2892 				cache->block_size);
2893 			cache->Free(block->original_data);
2894 			block->original_data = NULL;
2895 		}
2896 		if (transaction->has_sub_transaction && block->parent_data != NULL)
2897 			cache->FreeBlockParentData(block);
2898 
2899 		block->transaction_next = NULL;
2900 		block->transaction = NULL;
2901 		block->discard = false;
2902 		if (block->previous_transaction == NULL)
2903 			block->is_dirty = false;
2904 	}
2905 
2906 	hash_remove(cache->transaction_hash, transaction);
2907 	delete_transaction(cache, transaction);
2908 	return B_OK;
2909 }
2910 
2911 
2912 /*!	Acknowledges the current parent transaction, and starts a new transaction
2913 	from its sub transaction.
2914 	The new transaction also gets a new transaction ID.
2915 */
2916 int32
2917 cache_detach_sub_transaction(void* _cache, int32 id,
2918 	transaction_notification_hook hook, void* data)
2919 {
2920 	block_cache* cache = (block_cache*)_cache;
2921 	TransactionLocker locker(cache);
2922 
2923 	TRACE(("cache_detach_sub_transaction(id = %ld)\n", id));
2924 
2925 	cache_transaction* transaction = lookup_transaction(cache, id);
2926 	if (transaction == NULL) {
2927 		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
2928 		return B_BAD_VALUE;
2929 	}
2930 	if (!transaction->has_sub_transaction)
2931 		return B_BAD_VALUE;
2932 
2933 	// iterate through all blocks and free the unchanged original contents
2934 
2935 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
2936 	if (status != B_OK)
2937 		return status;
2938 
2939 	// create a new transaction for the sub transaction
2940 	cache_transaction* newTransaction = new(std::nothrow) cache_transaction;
2941 	if (newTransaction == NULL)
2942 		return B_NO_MEMORY;
2943 
2944 	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
2945 	T(Detach(cache, transaction, newTransaction));
2946 
2947 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2948 
2949 	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
2950 			data) != B_OK) {
2951 		delete newTransaction;
2952 		return B_NO_MEMORY;
2953 	}
2954 
2955 	cached_block* last = NULL;
2956 	cached_block* next;
2957 	for (cached_block* block = transaction->first_block; block != NULL;
2958 			block = next) {
2959 		next = block->transaction_next;
2960 		ASSERT(block->previous_transaction == NULL);
2961 
2962 		if (block->discard) {
2963 			cache->DiscardBlock(block);
2964 			transaction->main_num_blocks--;
2965 			continue;
2966 		}
2967 
2968 		if (block->parent_data != NULL) {
2969 			// The block changed in the parent - free the original data, since
2970 			// they will be replaced by what is in current.
2971 			ASSERT(block->original_data != NULL);
2972 			cache->Free(block->original_data);
2973 
2974 			if (block->parent_data != block->current_data) {
2975 				// The block had been changed in both transactions
2976 				block->original_data = block->parent_data;
2977 			} else {
2978 				// The block has only been changed in the parent
2979 				block->original_data = NULL;
2980 			}
2981 
2982 			// move the block to the previous transaction list
2983 			transaction->blocks.Add(block);
2984 			block->previous_transaction = transaction;
2985 			block->parent_data = NULL;
2986 		}
2987 
2988 		if (block->original_data != NULL) {
2989 			// This block had been changed in the current sub transaction,
2990 			// we need to move this block over to the new transaction.
2991 			ASSERT(block->parent_data == NULL);
2992 
2993 			if (last == NULL)
2994 				newTransaction->first_block = block;
2995 			else
2996 				last->transaction_next = block;
2997 
2998 			block->transaction = newTransaction;
2999 			last = block;
3000 		} else
3001 			block->transaction = NULL;
3002 
3003 		block->transaction_next = NULL;
3004 	}
3005 
3006 	newTransaction->num_blocks = transaction->sub_num_blocks;
3007 
3008 	transaction->open = false;
3009 	transaction->has_sub_transaction = false;
3010 	transaction->num_blocks = transaction->main_num_blocks;
3011 	transaction->sub_num_blocks = 0;
3012 
3013 	hash_insert_grow(cache->transaction_hash, newTransaction);
3014 	cache->last_transaction = newTransaction;
3015 
3016 	return newTransaction->id;
3017 }
3018 
3019 
3020 status_t
3021 cache_abort_sub_transaction(void* _cache, int32 id)
3022 {
3023 	block_cache* cache = (block_cache*)_cache;
3024 	TransactionLocker locker(cache);
3025 
3026 	TRACE(("cache_abort_sub_transaction(id = %ld)\n", id));
3027 
3028 	cache_transaction* transaction = lookup_transaction(cache, id);
3029 	if (transaction == NULL) {
3030 		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
3031 		return B_BAD_VALUE;
3032 	}
3033 	if (!transaction->has_sub_transaction)
3034 		return B_BAD_VALUE;
3035 
3036 	T(Abort(cache, transaction));
3037 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3038 
3039 	// revert all changes back to the version of the parent
3040 
3041 	cached_block* block = transaction->first_block;
3042 	cached_block* last = NULL;
3043 	cached_block* next;
3044 	for (; block != NULL; block = next) {
3045 		next = block->transaction_next;
3046 
3047 		if (block->parent_data == NULL) {
3048 			// The parent transaction didn't change the block, but the sub
3049 			// transaction did - we need to revert to the original data.
3050 			// The block is no longer part of the transaction
3051 			ASSERT(block->original_data != NULL);
3052 			memcpy(block->current_data, block->original_data,
3053 				cache->block_size);
3054 
3055 			if (last != NULL)
3056 				last->transaction_next = next;
3057 			else
3058 				transaction->first_block = next;
3059 
3060 			block->transaction_next = NULL;
3061 			block->transaction = NULL;
3062 			if (block->previous_transaction == NULL)
3063 				block->is_dirty = false;
3064 		} else {
3065 			if (block->parent_data != block->current_data) {
3066 				// The block has been changed and must be restored - the block
3067 				// is still dirty and part of the transaction
3068 				TRACE(("cache_abort_sub_transaction(id = %ld): restored "
3069 					"contents of block %Ld\n", transaction->id,
3070 					block->block_number));
3071 				memcpy(block->current_data, block->parent_data,
3072 					cache->block_size);
3073 				cache->Free(block->parent_data);
3074 				// The block stays dirty
3075 			}
3076 			block->parent_data = NULL;
3077 			last = block;
3078 		}
3079 
3080 		block->discard = false;
3081 	}
3082 
3083 	// all subsequent changes will go into the main transaction
3084 	transaction->has_sub_transaction = false;
3085 	transaction->sub_num_blocks = 0;
3086 
3087 	return B_OK;
3088 }
3089 
3090 
3091 status_t
3092 cache_start_sub_transaction(void* _cache, int32 id)
3093 {
3094 	block_cache* cache = (block_cache*)_cache;
3095 	TransactionLocker locker(cache);
3096 
3097 	TRACE(("cache_start_sub_transaction(id = %ld)\n", id));
3098 
3099 	cache_transaction* transaction = lookup_transaction(cache, id);
3100 	if (transaction == NULL) {
3101 		panic("cache_start_sub_transaction(): invalid transaction ID %ld\n",
3102 			id);
3103 		return B_BAD_VALUE;
3104 	}
3105 
3106 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3107 
3108 	// move all changed blocks up to the parent
3109 
3110 	cached_block* block = transaction->first_block;
3111 	cached_block* next;
3112 	for (; block != NULL; block = next) {
3113 		next = block->transaction_next;
3114 
3115 		if (block->parent_data != NULL) {
3116 			// There already is an older sub transaction - we acknowledge
3117 			// its changes and move its blocks up to the parent
3118 			ASSERT(transaction->has_sub_transaction);
3119 			cache->FreeBlockParentData(block);
3120 		}
3121 		if (block->discard) {
3122 			// This block has been discarded in the parent transaction.
3123 			// Just throw away any changes made in this transaction, so that
3124 			// it can still be reverted to its original contents if needed
3125 			ASSERT(block->previous_transaction == NULL);
3126 			if (block->original_data != NULL) {
3127 				memcpy(block->current_data, block->original_data,
3128 					cache->block_size);
3129 				block->original_data = NULL;
3130 			}
3131 			continue;
3132 		}
3133 
3134 		// we "allocate" the parent data lazily, that means, we don't copy
3135 		// the data (and allocate memory for it) until we need to
3136 		block->parent_data = block->current_data;
3137 	}
3138 
3139 	// all subsequent changes will go into the sub transaction
3140 	transaction->has_sub_transaction = true;
3141 	transaction->main_num_blocks = transaction->num_blocks;
3142 	transaction->sub_num_blocks = 0;
3143 	T(Action("start-sub", cache, transaction));
3144 
3145 	return B_OK;
3146 }
3147 
3148 
3149 /*!	Adds a transaction listener that gets notified when the transaction
3150 	is ended, aborted, written, or idle as specified by \a events.
3151 	The listener gets automatically removed when the transaction ends.
3152 */
3153 status_t
3154 cache_add_transaction_listener(void* _cache, int32 id, int32 events,
3155 	transaction_notification_hook hook, void* data)
3156 {
3157 	block_cache* cache = (block_cache*)_cache;
3158 	TransactionLocker locker(cache);
3159 
3160 	cache_transaction* transaction = lookup_transaction(cache, id);
3161 	if (transaction == NULL)
3162 		return B_BAD_VALUE;
3163 
3164 	return add_transaction_listener(cache, transaction, events, hook, data);
3165 }
3166 
3167 
3168 status_t
3169 cache_remove_transaction_listener(void* _cache, int32 id,
3170 	transaction_notification_hook hookFunction, void* data)
3171 {
3172 	block_cache* cache = (block_cache*)_cache;
3173 	TransactionLocker locker(cache);
3174 
3175 	cache_transaction* transaction = lookup_transaction(cache, id);
3176 	if (transaction == NULL)
3177 		return B_BAD_VALUE;
3178 
3179 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
3180 	while (iterator.HasNext()) {
3181 		cache_listener* listener = iterator.Next();
3182 		if (listener->data == data && listener->hook == hookFunction) {
3183 			iterator.Remove();
3184 
3185 			if (listener->events_pending != 0) {
3186 				MutexLocker _(sNotificationsLock);
3187 				if (listener->events_pending != 0)
3188 					cache->pending_notifications.Remove(listener);
3189 			}
3190 			delete listener;
3191 			return B_OK;
3192 		}
3193 	}
3194 
3195 	return B_ENTRY_NOT_FOUND;
3196 }
3197 
3198 
3199 status_t
3200 cache_next_block_in_transaction(void* _cache, int32 id, bool mainOnly,
3201 	long* _cookie, off_t* _blockNumber, void** _data, void** _unchangedData)
3202 {
3203 	cached_block* block = (cached_block*)*_cookie;
3204 	block_cache* cache = (block_cache*)_cache;
3205 	TransactionLocker locker(cache);
3206 
3207 	cache_transaction* transaction = lookup_transaction(cache, id);
3208 	if (transaction == NULL || !transaction->open)
3209 		return B_BAD_VALUE;
3210 
3211 	if (block == NULL)
3212 		block = transaction->first_block;
3213 	else
3214 		block = block->transaction_next;
3215 
3216 	if (transaction->has_sub_transaction) {
3217 		if (mainOnly) {
3218 			// find next block that the parent changed
3219 			while (block != NULL && block->parent_data == NULL)
3220 				block = block->transaction_next;
3221 		} else {
3222 			// find next non-discarded block
3223 			while (block != NULL && block->discard)
3224 				block = block->transaction_next;
3225 		}
3226 	}
3227 
3228 	if (block == NULL)
3229 		return B_ENTRY_NOT_FOUND;
3230 
3231 	if (_blockNumber)
3232 		*_blockNumber = block->block_number;
3233 	if (_data)
3234 		*_data = mainOnly ? block->parent_data : block->current_data;
3235 	if (_unchangedData)
3236 		*_unchangedData = block->original_data;
3237 
3238 	*_cookie = (addr_t)block;
3239 	return B_OK;
3240 }
3241 
3242 
3243 int32
3244 cache_blocks_in_transaction(void* _cache, int32 id)
3245 {
3246 	block_cache* cache = (block_cache*)_cache;
3247 	TransactionLocker locker(cache);
3248 
3249 	cache_transaction* transaction = lookup_transaction(cache, id);
3250 	if (transaction == NULL)
3251 		return B_BAD_VALUE;
3252 
3253 	return transaction->num_blocks;
3254 }
3255 
3256 
3257 /*!	Returns the number of blocks that are part of the main transaction. If this
3258 	transaction does not have a sub transaction yet, this is the same value as
3259 	cache_blocks_in_transaction() would return.
3260 */
3261 int32
3262 cache_blocks_in_main_transaction(void* _cache, int32 id)
3263 {
3264 	block_cache* cache = (block_cache*)_cache;
3265 	TransactionLocker locker(cache);
3266 
3267 	cache_transaction* transaction = lookup_transaction(cache, id);
3268 	if (transaction == NULL)
3269 		return B_BAD_VALUE;
3270 
3271 	if (transaction->has_sub_transaction)
3272 		return transaction->main_num_blocks;
3273 
3274 	return transaction->num_blocks;
3275 }
3276 
3277 
3278 int32
3279 cache_blocks_in_sub_transaction(void* _cache, int32 id)
3280 {
3281 	block_cache* cache = (block_cache*)_cache;
3282 	TransactionLocker locker(cache);
3283 
3284 	cache_transaction* transaction = lookup_transaction(cache, id);
3285 	if (transaction == NULL)
3286 		return B_BAD_VALUE;
3287 
3288 	return transaction->sub_num_blocks;
3289 }
3290 
3291 
3292 //	#pragma mark - public block cache API
3293 
3294 
3295 void
3296 block_cache_delete(void* _cache, bool allowWrites)
3297 {
3298 	block_cache* cache = (block_cache*)_cache;
3299 
3300 	if (allowWrites)
3301 		block_cache_sync(cache);
3302 
3303 	mutex_lock(&sCachesLock);
3304 	sCaches.Remove(cache);
3305 	mutex_unlock(&sCachesLock);
3306 
3307 	mutex_lock(&cache->lock);
3308 
3309 	// wait for all blocks to become unbusy
3310 	wait_for_busy_reading_blocks(cache);
3311 	wait_for_busy_writing_blocks(cache);
3312 
3313 	// free all blocks
3314 
3315 	uint32 cookie = 0;
3316 	cached_block* block;
3317 	while ((block = (cached_block*)hash_remove_first(cache->hash,
3318 			&cookie)) != NULL) {
3319 		cache->FreeBlock(block);
3320 	}
3321 
3322 	// free all transactions (they will all be aborted)
3323 
3324 	cookie = 0;
3325 	cache_transaction* transaction;
3326 	while ((transaction = (cache_transaction*)hash_remove_first(
3327 			cache->transaction_hash, &cookie)) != NULL) {
3328 		delete transaction;
3329 	}
3330 
3331 	delete cache;
3332 }
3333 
3334 
3335 void*
3336 block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
3337 {
3338 	block_cache* cache = new(std::nothrow) block_cache(fd, numBlocks, blockSize,
3339 		readOnly);
3340 	if (cache == NULL)
3341 		return NULL;
3342 
3343 	if (cache->Init() != B_OK) {
3344 		delete cache;
3345 		return NULL;
3346 	}
3347 
3348 	MutexLocker _(sCachesLock);
3349 	sCaches.Add(cache);
3350 
3351 	return cache;
3352 }
3353 
3354 
3355 status_t
3356 block_cache_sync(void* _cache)
3357 {
3358 	block_cache* cache = (block_cache*)_cache;
3359 
3360 	// We will sync all dirty blocks to disk that have a completed
3361 	// transaction or no transaction only
3362 
3363 	MutexLocker locker(&cache->lock);
3364 
3365 	BlockWriter writer(cache);
3366 	hash_iterator iterator;
3367 	hash_open(cache->hash, &iterator);
3368 
3369 	cached_block* block;
3370 	while ((block = (cached_block*)hash_next(cache->hash, &iterator)) != NULL) {
3371 		if (block->CanBeWritten())
3372 			writer.Add(block);
3373 	}
3374 
3375 	hash_close(cache->hash, &iterator, false);
3376 
3377 	status_t status = writer.Write();
3378 
3379 	locker.Unlock();
3380 
3381 	wait_for_notifications(cache);
3382 		// make sure that all pending TRANSACTION_WRITTEN notifications
3383 		// are handled after we return
3384 	return status;
3385 }
3386 
3387 
3388 status_t
3389 block_cache_sync_etc(void* _cache, off_t blockNumber, size_t numBlocks)
3390 {
3391 	block_cache* cache = (block_cache*)_cache;
3392 
3393 	// We will sync all dirty blocks to disk that have a completed
3394 	// transaction or no transaction only
3395 
3396 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
3397 		panic("block_cache_sync_etc: invalid block number %Ld (max %Ld)",
3398 			blockNumber, cache->max_blocks - 1);
3399 		return B_BAD_VALUE;
3400 	}
3401 
3402 	MutexLocker locker(&cache->lock);
3403 	BlockWriter writer(cache);
3404 
3405 	for (; numBlocks > 0; numBlocks--, blockNumber++) {
3406 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
3407 			&blockNumber);
3408 		if (block == NULL)
3409 			continue;
3410 
3411 		if (block->CanBeWritten())
3412 			writer.Add(block);
3413 	}
3414 
3415 	status_t status = writer.Write();
3416 
3417 	locker.Unlock();
3418 
3419 	wait_for_notifications(cache);
3420 		// make sure that all pending TRANSACTION_WRITTEN notifications
3421 		// are handled after we return
3422 	return status;
3423 }
3424 
3425 
3426 /*!	Discards a block from the current transaction or from the cache.
3427 	You have to call this function when you no longer use a block, ie. when it
3428 	might be reclaimed by the file cache in order to make sure they won't
3429 	interfere.
3430 */
3431 void
3432 block_cache_discard(void* _cache, off_t blockNumber, size_t numBlocks)
3433 {
3434 	// TODO: this could be a nice place to issue the ATA trim command
3435 	block_cache* cache = (block_cache*)_cache;
3436 	TransactionLocker locker(cache);
3437 
3438 	BlockWriter writer(cache);
3439 
3440 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3441 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
3442 			&blockNumber);
3443 		if (block != NULL && block->previous_transaction != NULL)
3444 			writer.Add(block);
3445 	}
3446 
3447 	writer.Write();
3448 		// TODO: this can fail, too!
3449 
3450 	blockNumber -= numBlocks;
3451 		// reset blockNumber to its original value
3452 
3453 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3454 		cached_block* block = (cached_block*)hash_lookup(cache->hash,
3455 			&blockNumber);
3456 		if (block == NULL)
3457 			continue;
3458 
3459 		ASSERT(block->previous_transaction == NULL);
3460 
3461 		if (block->unused) {
3462 			cache->unused_blocks.Remove(block);
3463 			cache->unused_block_count--;
3464 			cache->RemoveBlock(block);
3465 		} else {
3466 			if (block->transaction != NULL && block->parent_data != NULL
3467 				&& block->parent_data != block->current_data) {
3468 				panic("Discarded block %Ld has already been changed in this "
3469 					"transaction!", blockNumber);
3470 			}
3471 
3472 			// mark it as discarded (in the current transaction only, if any)
3473 			block->discard = true;
3474 		}
3475 	}
3476 }
3477 
3478 
3479 status_t
3480 block_cache_make_writable(void* _cache, off_t blockNumber, int32 transaction)
3481 {
3482 	block_cache* cache = (block_cache*)_cache;
3483 	MutexLocker locker(&cache->lock);
3484 
3485 	if (cache->read_only) {
3486 		panic("tried to make block writable on a read-only cache!");
3487 		return B_ERROR;
3488 	}
3489 
3490 	// TODO: this can be done better!
3491 	void* block = get_writable_cached_block(cache, blockNumber,
3492 		blockNumber, 1, transaction, false);
3493 	if (block != NULL) {
3494 		put_cached_block((block_cache*)_cache, blockNumber);
3495 		return B_OK;
3496 	}
3497 
3498 	return B_ERROR;
3499 }
3500 
3501 
3502 void*
3503 block_cache_get_writable_etc(void* _cache, off_t blockNumber, off_t base,
3504 	off_t length, int32 transaction)
3505 {
3506 	block_cache* cache = (block_cache*)_cache;
3507 	MutexLocker locker(&cache->lock);
3508 
3509 	TRACE(("block_cache_get_writable_etc(block = %Ld, transaction = %ld)\n",
3510 		blockNumber, transaction));
3511 	if (cache->read_only)
3512 		panic("tried to get writable block on a read-only cache!");
3513 
3514 	return get_writable_cached_block(cache, blockNumber, base, length,
3515 		transaction, false);
3516 }
3517 
3518 
3519 void*
3520 block_cache_get_writable(void* _cache, off_t blockNumber, int32 transaction)
3521 {
3522 	return block_cache_get_writable_etc(_cache, blockNumber,
3523 		blockNumber, 1, transaction);
3524 }
3525 
3526 
3527 void*
3528 block_cache_get_empty(void* _cache, off_t blockNumber, int32 transaction)
3529 {
3530 	block_cache* cache = (block_cache*)_cache;
3531 	MutexLocker locker(&cache->lock);
3532 
3533 	TRACE(("block_cache_get_empty(block = %Ld, transaction = %ld)\n",
3534 		blockNumber, transaction));
3535 	if (cache->read_only)
3536 		panic("tried to get empty writable block on a read-only cache!");
3537 
3538 	return get_writable_cached_block((block_cache*)_cache, blockNumber,
3539 		blockNumber, 1, transaction, true);
3540 }
3541 
3542 
3543 const void*
3544 block_cache_get_etc(void* _cache, off_t blockNumber, off_t base, off_t length)
3545 {
3546 	block_cache* cache = (block_cache*)_cache;
3547 	MutexLocker locker(&cache->lock);
3548 	bool allocated;
3549 
3550 	cached_block* block = get_cached_block(cache, blockNumber, &allocated);
3551 	if (block == NULL)
3552 		return NULL;
3553 
3554 #if BLOCK_CACHE_DEBUG_CHANGED
3555 	if (block->compare == NULL)
3556 		block->compare = cache->Allocate();
3557 	if (block->compare != NULL)
3558 		memcpy(block->compare, block->current_data, cache->block_size);
3559 #endif
3560 	TB(Get(cache, block));
3561 
3562 	return block->current_data;
3563 }
3564 
3565 
3566 const void*
3567 block_cache_get(void* _cache, off_t blockNumber)
3568 {
3569 	return block_cache_get_etc(_cache, blockNumber, blockNumber, 1);
3570 }
3571 
3572 
3573 /*!	Changes the internal status of a writable block to \a dirty. This can be
3574 	helpful in case you realize you don't need to change that block anymore
3575 	for whatever reason.
3576 
3577 	Note, you must only use this function on blocks that were acquired
3578 	writable!
3579 */
3580 status_t
3581 block_cache_set_dirty(void* _cache, off_t blockNumber, bool dirty,
3582 	int32 transaction)
3583 {
3584 	block_cache* cache = (block_cache*)_cache;
3585 	MutexLocker locker(&cache->lock);
3586 
3587 	cached_block* block = (cached_block*)hash_lookup(cache->hash,
3588 		&blockNumber);
3589 	if (block == NULL)
3590 		return B_BAD_VALUE;
3591 	if (block->is_dirty == dirty) {
3592 		// there is nothing to do for us
3593 		return B_OK;
3594 	}
3595 
3596 	// TODO: not yet implemented
3597 	if (dirty)
3598 		panic("block_cache_set_dirty(): not yet implemented that way!\n");
3599 
3600 	return B_OK;
3601 }
3602 
3603 
3604 void
3605 block_cache_put(void* _cache, off_t blockNumber)
3606 {
3607 	block_cache* cache = (block_cache*)_cache;
3608 	MutexLocker locker(&cache->lock);
3609 
3610 	put_cached_block(cache, blockNumber);
3611 }
3612 
3613