xref: /haiku/src/system/kernel/cache/block_cache.cpp (revision 909af08f4328301fbdef1ffb41f566c3b5bec0c7)
1 /*
2  * Copyright 2004-2020, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include <block_cache.h>
8 
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <errno.h>
13 #include <sys/uio.h>
14 
15 #include <KernelExport.h>
16 #include <fs_cache.h>
17 
18 #include <condition_variable.h>
19 #include <lock.h>
20 #include <low_resource_manager.h>
21 #include <slab/Slab.h>
22 #include <tracing.h>
23 #include <util/kernel_cpp.h>
24 #include <util/DoublyLinkedList.h>
25 #include <util/AutoLock.h>
26 #include <StackOrHeapArray.h>
27 #include <vm/vm_page.h>
28 
29 #include "kernel_debug_config.h"
30 
31 
32 // TODO: this is a naive but growing implementation to test the API:
33 //	block reading/writing is not at all optimized for speed, it will
34 //	just read and write single blocks.
35 // TODO: the retrieval/copy of the original data could be delayed until the
36 //		new data must be written, ie. in low memory situations.
37 
// Unconditional debug output: forwards to dprintf() when built for the
// kernel, and to printf() otherwise.
#ifdef _KERNEL_MODE
#	define TRACE_ALWAYS(x...) dprintf(x)
#else
#	define TRACE_ALWAYS(x...) printf(x)
#endif

// Optional verbose tracing: TRACE() forwards to TRACE_ALWAYS() only when
// TRACE_BLOCK_CACHE is defined; otherwise it expands to an empty statement.
//#define TRACE_BLOCK_CACHE
#ifdef TRACE_BLOCK_CACHE
#	define TRACE(x)	TRACE_ALWAYS(x)
#else
#	define TRACE(x) ;
#endif
50 
51 
52 // This macro is used for fatal situations that are acceptable in a running
53 // system, like out of memory situations - should only panic for debugging.
54 #define FATAL(x) panic x
55 
56 static const bigtime_t kTransactionIdleTime = 2000000LL;
57 	// a transaction is considered idle after 2 seconds of inactivity
58 
59 
60 namespace {
61 
62 struct cache_transaction;
63 struct cached_block;
64 struct block_cache;
65 typedef DoublyLinkedListLink<cached_block> block_link;
66 
// Book-keeping record for a single block held by a block_cache.
struct cached_block {
	cached_block*	next;			// next in hash
	cached_block*	transaction_next;
		// Next block in the singly linked list that starts at
		// cache_transaction::first_block.
	block_link		link;
		// Link used by block_list (see the typedef following this struct).
	off_t			block_number;
	void*			current_data;
		// The data that is seen by everyone using the API; this one is always
		// present.
	void*			original_data;
		// When in a transaction, this contains the original data from before
		// the transaction.
	void*			parent_data;
		// This is a lazily alloced buffer that represents the contents of the
		// block in the parent transaction. It may point to current_data if the
		// contents have been changed only in the parent transaction, or, if the
		// block has been changed in the current sub transaction already, to a
		// new block containing the contents changed in the parent transaction.
		// If this is NULL, the block has not been changed in the parent
		// transaction at all.
#if BLOCK_CACHE_DEBUG_CHANGED
	void*			compare;
#endif
	int32			ref_count;
	int32			last_accessed;
		// Compared against system_time() / 1000000 in LastAccess(), i.e. this
		// is kept in seconds.
	bool			busy_reading : 1;
	bool			busy_writing : 1;
		// Set by BlockWriter::Add() while the block is queued for writing.
	bool			is_writing : 1;
		// Block has been checked out for writing without transactions, and
		// cannot be written back if set
	bool			is_dirty : 1;
	bool			unused : 1;
	bool			discard : 1;
	bool			busy_reading_waiters : 1;
	bool			busy_writing_waiters : 1;
	cache_transaction* transaction;
		// This is the current active transaction, if any, the block is
		// currently in (meaning was changed as a part of it).
	cache_transaction* previous_transaction;
		// This is set to the last transaction that was ended containing this
		// block. In this case, the block has not been written back yet, and
		// the changed data is either in current_data, or original_data -- the
		// latter if the block is already part of another transaction.
		// There can only be one previous transaction, so when the active
		// transaction ends, the changes of the previous transaction have to
		// be written back before that transaction becomes the next previous
		// transaction.

	// Returns whether the block is currently eligible for write-back.
	bool CanBeWritten() const;
	// Returns the difference between the current system time, converted to
	// seconds, and last_accessed.
	int32 LastAccess() const
		{ return system_time() / 1000000L - last_accessed; }
};
118 
119 typedef DoublyLinkedList<cached_block,
120 	DoublyLinkedListMemberGetLink<cached_block,
121 		&cached_block::link> > block_list;
122 
// A pending or registered event notification. Instances can be linked into a
// NotificationList (block_cache::pending_notifications).
struct cache_notification : DoublyLinkedListLinkImpl<cache_notification> {
	// Allocation goes through sCacheNotificationCache (see the definitions
	// of these operators).
	static inline void* operator new(size_t size);
	static inline void operator delete(void* block);

	int32			transaction_id;
		// ID of the transaction this belongs to, or -1 (see
		// set_notification()).
	int32			events_pending;
		// Bit mask of events that still have to be delivered; modified with
		// atomic_or()/atomic_and().
	int32			events;
		// Bit mask of events the hook is interested in.
	transaction_notification_hook hook;
	void*			data;
		// Opaque argument passed through to the hook.
	bool			delete_after_event;
		// If set, the notification is deleted once its last pending event
		// has been delivered.
};
134 
135 typedef DoublyLinkedList<cache_notification> NotificationList;
136 
137 static object_cache* sCacheNotificationCache;
138 
139 struct cache_listener;
140 typedef DoublyLinkedListLink<cache_listener> listener_link;
141 
// A cache_notification that can additionally be linked into a ListenerList
// (cache_transaction::listeners) via its own link member.
struct cache_listener : cache_notification {
	listener_link	link;
};
145 
146 typedef DoublyLinkedList<cache_listener,
147 	DoublyLinkedListMemberGetLink<cache_listener,
148 		&cache_listener::link> > ListenerList;
149 
150 void*
151 cache_notification::operator new(size_t size)
152 {
153 	// We can't really know whether something is a cache_notification or a
154 	// cache_listener at runtime, so we just use one object_cache for both
155 	// with the size set to that of the (slightly larger) cache_listener.
156 	// In practice, the vast majority of cache_notifications are really
157 	// cache_listeners, so this is a more than acceptable trade-off.
158 	ASSERT(size <= sizeof(cache_listener));
159 	return object_cache_alloc(sCacheNotificationCache, 0);
160 }
161 
/*!	Returns the storage of a notification to sCacheNotificationCache, the
	object cache operator new allocates from.
*/
void
cache_notification::operator delete(void* block)
{
	object_cache_free(sCacheNotificationCache, block, 0);
}
167 
168 
169 struct BlockHash {
170 	typedef off_t			KeyType;
171 	typedef	cached_block	ValueType;
172 
173 	size_t HashKey(KeyType key) const
174 	{
175 		return key;
176 	}
177 
178 	size_t Hash(ValueType* block) const
179 	{
180 		return block->block_number;
181 	}
182 
183 	bool Compare(KeyType key, ValueType* block) const
184 	{
185 		return block->block_number == key;
186 	}
187 
188 	ValueType*& GetLink(ValueType* value) const
189 	{
190 		return value->next;
191 	}
192 };
193 
194 typedef BOpenHashTable<BlockHash> BlockTable;
195 
196 
// Hash table definition for TransactionTable: cache_transaction objects keyed
// by their int32 ID. Hash(), Compare() and GetLink() are defined out of line,
// after cache_transaction has been declared completely.
struct TransactionHash {
	typedef int32				KeyType;
	typedef	cache_transaction	ValueType;

	// The transaction ID itself serves as the hash value.
	size_t HashKey(KeyType key) const
	{
		return key;
	}

	size_t Hash(ValueType* transaction) const;
	bool Compare(KeyType key, ValueType* transaction) const;
	ValueType*& GetLink(ValueType* value) const;
};
210 
211 typedef BOpenHashTable<TransactionHash> TransactionTable;
212 
213 
// State of one block cache instance. Instances can be linked into a
// DoublyLinkedList<block_cache> (see sCaches).
struct block_cache : DoublyLinkedListLinkImpl<block_cache> {
	BlockTable*		hash;
		// cached blocks, keyed by block number
	mutex			lock;
		// the cache lock; acquired e.g. by TransactionLocking::Lock()
	int				fd;
	off_t			max_blocks;
	size_t			block_size;
	int32			next_transaction_id;
	cache_transaction* last_transaction;
		// reset to NULL by delete_transaction() when that transaction goes away
	TransactionTable* transaction_hash;
		// transactions, keyed by ID (see lookup_transaction())

	object_cache*	buffer_cache;
	block_list		unused_blocks;
	uint32			unused_block_count;

	ConditionVariable busy_reading_condition;
	uint32			busy_reading_count;
	bool			busy_reading_waiters;

	ConditionVariable busy_writing_condition;
		// waited on while busy_writing_count is non-zero
	uint32			busy_writing_count;
		// incremented for every block queued by BlockWriter::Add()
	bool			busy_writing_waiters;

	bigtime_t		last_block_write;
	bigtime_t		last_block_write_duration;

	uint32			num_dirty_blocks;
	bool			read_only;

	NotificationList pending_notifications;
		// notifications with undelivered events; modified with
		// sNotificationsLock held
	ConditionVariable condition_variable;

					block_cache(int fd, off_t numBlocks, size_t blockSize,
						bool readOnly);
					~block_cache();

	status_t		Init();

	void			Free(void* buffer);
	void*			Allocate();
	void			FreeBlock(cached_block* block);
	cached_block*	NewBlock(off_t blockNumber);
	void			FreeBlockParentData(cached_block* block);

	void			RemoveUnusedBlocks(int32 count, int32 minSecondsOld = 0);
	void			RemoveBlock(cached_block* block);
	void			DiscardBlock(cached_block* block);

private:
	static void		_LowMemoryHandler(void* data, uint32 resources,
						int32 level);
	cached_block*	_GetUnusedBlock();
};
266 
// State of a single transaction of a block_cache.
struct cache_transaction {
	cache_transaction();

	cache_transaction* next;
		// hash chain link (see TransactionHash::GetLink())
	int32			id;
		// key in the cache's transaction_hash
	int32			num_blocks;
	int32			main_num_blocks;
	int32			sub_num_blocks;
	cached_block*	first_block;
		// head of the list chained through cached_block::transaction_next
	block_list		blocks;
		// iterated by BlockWriter::Add(cache_transaction*, bool&)
	ListenerList	listeners;
	bool			open;
	bool			has_sub_transaction;
	bigtime_t		last_used;
		// initialized to system_time() by the constructor
	int32			busy_writing_count;
		// incremented by BlockWriter::Add() for each queued block that has
		// this transaction as its previous_transaction
};
283 
284 
// Collects blocks that are to be written back and writes them in one go.
// Blocks are gathered in fBlocks, which initially points to the embedded
// fBuffer and is replaced by a heap array when more room is needed.
class BlockWriter {
public:
								BlockWriter(block_cache* cache,
									size_t max = SIZE_MAX);
								~BlockWriter();

			// Queues a single block; returns false once the maximum
			// number of blocks has been reached.
			bool				Add(cached_block* block,
									cache_transaction* transaction = NULL);
			// Queues the writable blocks of a closed transaction.
			bool				Add(cache_transaction* transaction,
									bool& hasLeftOvers);

			status_t			Write(cache_transaction* transaction = NULL,
									bool canUnlock = true);

			bool				DeletedTransaction() const
									{ return fDeletedTransaction; }

	static	status_t			WriteBlock(block_cache* cache,
									cached_block* block);

private:
			void*				_Data(cached_block* block) const;
			status_t			_WriteBlocks(cached_block** blocks, uint32 count);
			void				_BlockDone(cached_block* block,
									cache_transaction* transaction);
			void				_UnmarkWriting(cached_block* block);

	static	int					_CompareBlocks(const void* _blockA,
									const void* _blockB);

private:
	// Number of block pointers that fit into the embedded array.
	static	const size_t		kBufferSize = 64;

			block_cache*		fCache;
			cached_block*		fBuffer[kBufferSize];
			cached_block**		fBlocks;
				// fBuffer, or a malloc()ed array
			size_t				fCount;
				// number of entries currently stored in fBlocks
			size_t				fTotal;
				// number of blocks added so far; compared against fMax
			size_t				fCapacity;
				// number of entries fBlocks can hold
			size_t				fMax;
			status_t			fStatus;
			bool				fDeletedTransaction;
};
328 
329 
330 class TransactionLocking {
331 public:
332 	inline bool Lock(block_cache* cache)
333 	{
334 		mutex_lock(&cache->lock);
335 
336 		while (cache->busy_writing_count != 0) {
337 			// wait for all blocks to be written
338 			ConditionVariableEntry entry;
339 			cache->busy_writing_condition.Add(&entry);
340 			cache->busy_writing_waiters = true;
341 
342 			mutex_unlock(&cache->lock);
343 
344 			entry.Wait();
345 
346 			mutex_lock(&cache->lock);
347 		}
348 
349 		return true;
350 	}
351 
352 	inline void Unlock(block_cache* cache)
353 	{
354 		mutex_unlock(&cache->lock);
355 	}
356 };
357 
358 typedef AutoLocker<block_cache, TransactionLocking> TransactionLocker;
359 
360 } // namespace
361 
362 
363 #if BLOCK_CACHE_BLOCK_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
364 namespace BlockTracing {
365 
366 class Action : public AbstractTraceEntry {
367 public:
368 	Action(block_cache* cache, cached_block* block)
369 		:
370 		fCache(cache),
371 		fBlockNumber(block->block_number),
372 		fIsDirty(block->is_dirty),
373 		fHasOriginal(block->original_data != NULL),
374 		fHasParent(block->parent_data != NULL),
375 		fTransactionID(-1),
376 		fPreviousID(-1)
377 	{
378 		if (block->transaction != NULL)
379 			fTransactionID = block->transaction->id;
380 		if (block->previous_transaction != NULL)
381 			fPreviousID = block->previous_transaction->id;
382 	}
383 
384 	virtual void AddDump(TraceOutput& out)
385 	{
386 		out.Print("block cache %p, %s %" B_PRIu64 ", %c%c%c transaction %" B_PRId32
387 			" (previous id %" B_PRId32 ")\n", fCache, _Action(), fBlockNumber,
388 			fIsDirty ? 'd' : '-', fHasOriginal ? 'o' : '-',
389 			fHasParent ? 'p' : '-', fTransactionID, fPreviousID);
390 	}
391 
392 	virtual const char* _Action() const = 0;
393 
394 private:
395 	block_cache*		fCache;
396 	uint64				fBlockNumber;
397 	bool				fIsDirty;
398 	bool				fHasOriginal;
399 	bool				fHasParent;
400 	int32				fTransactionID;
401 	int32				fPreviousID;
402 };
403 
404 class Get : public Action {
405 public:
406 	Get(block_cache* cache, cached_block* block)
407 		:
408 		Action(cache, block)
409 	{
410 		Initialized();
411 	}
412 
413 	virtual const char* _Action() const { return "get"; }
414 };
415 
416 class Put : public Action {
417 public:
418 	Put(block_cache* cache, cached_block* block)
419 		:
420 		Action(cache, block)
421 	{
422 		Initialized();
423 	}
424 
425 	virtual const char* _Action() const { return "put"; }
426 };
427 
428 class Read : public Action {
429 public:
430 	Read(block_cache* cache, cached_block* block)
431 		:
432 		Action(cache, block)
433 	{
434 		Initialized();
435 	}
436 
437 	virtual const char* _Action() const { return "read"; }
438 };
439 
440 class Write : public Action {
441 public:
442 	Write(block_cache* cache, cached_block* block)
443 		:
444 		Action(cache, block)
445 	{
446 		Initialized();
447 	}
448 
449 	virtual const char* _Action() const { return "write"; }
450 };
451 
452 class Flush : public Action {
453 public:
454 	Flush(block_cache* cache, cached_block* block, bool getUnused = false)
455 		:
456 		Action(cache, block),
457 		fGetUnused(getUnused)
458 	{
459 		Initialized();
460 	}
461 
462 	virtual const char* _Action() const
463 		{ return fGetUnused ? "get-unused" : "flush"; }
464 
465 private:
466 	bool	fGetUnused;
467 };
468 
469 class Error : public AbstractTraceEntry {
470 public:
471 	Error(block_cache* cache, uint64 blockNumber, const char* message,
472 			status_t status = B_OK)
473 		:
474 		fCache(cache),
475 		fBlockNumber(blockNumber),
476 		fMessage(message),
477 		fStatus(status)
478 	{
479 		Initialized();
480 	}
481 
482 	virtual void AddDump(TraceOutput& out)
483 	{
484 		out.Print("block cache %p, error %" B_PRIu64 ", %s%s%s",
485 			fCache, fBlockNumber, fMessage, fStatus != B_OK ? ": " : "",
486 			fStatus != B_OK ? strerror(fStatus) : "");
487 	}
488 
489 private:
490 	block_cache*	fCache;
491 	uint64			fBlockNumber;
492 	const char*		fMessage;
493 	status_t		fStatus;
494 };
495 
496 #if BLOCK_CACHE_BLOCK_TRACING >= 2
497 class BlockData : public AbstractTraceEntry {
498 public:
499 	enum {
500 		kCurrent	= 0x01,
501 		kParent		= 0x02,
502 		kOriginal	= 0x04
503 	};
504 
505 	BlockData(block_cache* cache, cached_block* block, const char* message)
506 		:
507 		fCache(cache),
508 		fSize(cache->block_size),
509 		fBlockNumber(block->block_number),
510 		fMessage(message)
511 	{
512 		_Allocate(fCurrent, block->current_data);
513 		_Allocate(fParent, block->parent_data);
514 		_Allocate(fOriginal, block->original_data);
515 
516 #if KTRACE_PRINTF_STACK_TRACE
517 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
518 			false);
519 #endif
520 
521 		Initialized();
522 	}
523 
524 	virtual void AddDump(TraceOutput& out)
525 	{
526 		out.Print("block cache %p, block %" B_PRIu64 ", data %c%c%c: %s",
527 			fCache, fBlockNumber, fCurrent != NULL ? 'c' : '-',
528 			fParent != NULL ? 'p' : '-', fOriginal != NULL ? 'o' : '-',
529 			fMessage);
530 	}
531 
532 #if KTRACE_PRINTF_STACK_TRACE
533 	virtual void DumpStackTrace(TraceOutput& out)
534 	{
535 		out.PrintStackTrace(fStackTrace);
536 	}
537 #endif
538 
539 	void DumpBlocks(uint32 which, uint32 offset, uint32 size)
540 	{
541 		if ((which & kCurrent) != 0)
542 			DumpBlock(kCurrent, offset, size);
543 		if ((which & kParent) != 0)
544 			DumpBlock(kParent, offset, size);
545 		if ((which & kOriginal) != 0)
546 			DumpBlock(kOriginal, offset, size);
547 	}
548 
549 	void DumpBlock(uint32 which, uint32 offset, uint32 size)
550 	{
551 		if (offset > fSize) {
552 			kprintf("invalid offset (block size %" B_PRIu32 ")\n", fSize);
553 			return;
554 		}
555 		if (offset + size > fSize)
556 			size = fSize - offset;
557 
558 		const char* label;
559 		uint8* data;
560 
561 		if ((which & kCurrent) != 0) {
562 			label = "current";
563 			data = fCurrent;
564 		} else if ((which & kParent) != 0) {
565 			label = "parent";
566 			data = fParent;
567 		} else if ((which & kOriginal) != 0) {
568 			label = "original";
569 			data = fOriginal;
570 		} else
571 			return;
572 
573 		kprintf("%s: offset %" B_PRIu32 ", %" B_PRIu32 " bytes\n", label, offset, size);
574 
575 		static const uint32 kBlockSize = 16;
576 		data += offset;
577 
578 		for (uint32 i = 0; i < size;) {
579 			int start = i;
580 
581 			kprintf("  %04" B_PRIx32 " ", i);
582 			for (; i < start + kBlockSize; i++) {
583 				if (!(i % 4))
584 					kprintf(" ");
585 
586 				if (i >= size)
587 					kprintf("  ");
588 				else
589 					kprintf("%02x", data[i]);
590 			}
591 
592 			kprintf("\n");
593 		}
594 	}
595 
596 private:
597 	void _Allocate(uint8*& target, void* source)
598 	{
599 		if (source == NULL) {
600 			target = NULL;
601 			return;
602 		}
603 
604 		target = alloc_tracing_buffer_memcpy(source, fSize, false);
605 	}
606 
607 	block_cache*	fCache;
608 	uint32			fSize;
609 	uint64			fBlockNumber;
610 	const char*		fMessage;
611 	uint8*			fCurrent;
612 	uint8*			fParent;
613 	uint8*			fOriginal;
614 #if KTRACE_PRINTF_STACK_TRACE
615 	tracing_stack_trace* fStackTrace;
616 #endif
617 };
618 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
619 
620 }	// namespace BlockTracing
621 
622 #	define TB(x) new(std::nothrow) BlockTracing::x;
623 #else
624 #	define TB(x) ;
625 #endif
626 
627 #if BLOCK_CACHE_BLOCK_TRACING >= 2
628 #	define TB2(x) new(std::nothrow) BlockTracing::x;
629 #else
630 #	define TB2(x) ;
631 #endif
632 
633 
634 #if BLOCK_CACHE_TRANSACTION_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
635 namespace TransactionTracing {
636 
637 class Action : public AbstractTraceEntry {
638 public:
639 	Action(const char* label, block_cache* cache,
640 			cache_transaction* transaction)
641 		:
642 		fCache(cache),
643 		fTransaction(transaction),
644 		fID(transaction->id),
645 		fSub(transaction->has_sub_transaction),
646 		fNumBlocks(transaction->num_blocks),
647 		fSubNumBlocks(transaction->sub_num_blocks)
648 	{
649 		strlcpy(fLabel, label, sizeof(fLabel));
650 		Initialized();
651 	}
652 
653 	virtual void AddDump(TraceOutput& out)
654 	{
655 		out.Print("block cache %p, %s transaction %p (id %" B_PRId32 ")%s"
656 			", %" B_PRId32 "/%" B_PRId32 " blocks", fCache, fLabel, fTransaction,
657 			fID, fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
658 	}
659 
660 private:
661 	char				fLabel[12];
662 	block_cache*		fCache;
663 	cache_transaction*	fTransaction;
664 	int32				fID;
665 	bool				fSub;
666 	int32				fNumBlocks;
667 	int32				fSubNumBlocks;
668 };
669 
670 class Detach : public AbstractTraceEntry {
671 public:
672 	Detach(block_cache* cache, cache_transaction* transaction,
673 			cache_transaction* newTransaction)
674 		:
675 		fCache(cache),
676 		fTransaction(transaction),
677 		fID(transaction->id),
678 		fSub(transaction->has_sub_transaction),
679 		fNewTransaction(newTransaction),
680 		fNewID(newTransaction->id)
681 	{
682 		Initialized();
683 	}
684 
685 	virtual void AddDump(TraceOutput& out)
686 	{
687 		out.Print("block cache %p, detach transaction %p (id %" B_PRId32 ")"
688 			"from transaction %p (id %" B_PRId32 ")%s",
689 			fCache, fNewTransaction, fNewID, fTransaction, fID,
690 			fSub ? " sub" : "");
691 	}
692 
693 private:
694 	block_cache*		fCache;
695 	cache_transaction*	fTransaction;
696 	int32				fID;
697 	bool				fSub;
698 	cache_transaction*	fNewTransaction;
699 	int32				fNewID;
700 };
701 
702 class Abort : public AbstractTraceEntry {
703 public:
704 	Abort(block_cache* cache, cache_transaction* transaction)
705 		:
706 		fCache(cache),
707 		fTransaction(transaction),
708 		fID(transaction->id),
709 		fNumBlocks(0)
710 	{
711 		bool isSub = transaction->has_sub_transaction;
712 		fNumBlocks = isSub ? transaction->sub_num_blocks
713 			: transaction->num_blocks;
714 		fBlocks = (off_t*)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
715 		if (fBlocks != NULL) {
716 			cached_block* block = transaction->first_block;
717 			for (int32 i = 0; block != NULL && i < fNumBlocks;
718 					block = block->transaction_next) {
719 				fBlocks[i++] = block->block_number;
720 			}
721 		} else
722 			fNumBlocks = 0;
723 
724 #if KTRACE_PRINTF_STACK_TRACE
725 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
726 			false);
727 #endif
728 
729 		Initialized();
730 	}
731 
732 	virtual void AddDump(TraceOutput& out)
733 	{
734 		out.Print("block cache %p, abort transaction "
735 			"%p (id %" B_PRId32 "), blocks", fCache, fTransaction, fID);
736 		for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
737 			out.Print(" %" B_PRIdOFF, fBlocks[i]);
738 	}
739 
740 #if KTRACE_PRINTF_STACK_TRACE
741 	virtual void DumpStackTrace(TraceOutput& out)
742 	{
743 		out.PrintStackTrace(fStackTrace);
744 	}
745 #endif
746 
747 private:
748 	block_cache*		fCache;
749 	cache_transaction*	fTransaction;
750 	int32				fID;
751 	off_t*				fBlocks;
752 	int32				fNumBlocks;
753 #if KTRACE_PRINTF_STACK_TRACE
754 	tracing_stack_trace* fStackTrace;
755 #endif
756 };
757 
758 }	// namespace TransactionTracing
759 
760 #	define T(x) new(std::nothrow) TransactionTracing::x;
761 #else
762 #	define T(x) ;
763 #endif
764 
765 
// List of block caches; flush_pending_notifications() iterates over it with
// sCachesLock held.
static DoublyLinkedList<block_cache> sCaches;
static mutex sCachesLock = MUTEX_INITIALIZER("block caches");
// NOTE(review): presumably guards sUsedMemory -- confirm against the users.
static mutex sCachesMemoryUseLock
	= MUTEX_INITIALIZER("block caches memory use");
static size_t sUsedMemory;
// Released by add_notification() whenever it has queued an event.
static sem_id sEventSemaphore;
// Held while a cache's pending_notifications list is modified, and while
// notifications are deleted or marked for deletion.
static mutex sNotificationsLock
	= MUTEX_INITIALIZER("block cache notifications");
static thread_id sNotifierWriterThread;
static DoublyLinkedListLink<block_cache> sMarkCache;
	// TODO: this only works if the link is the first entry of block_cache
static object_cache* sBlockCache;
778 
779 
780 //	#pragma mark - notifications/listener
781 
782 
783 /*!	Checks whether or not this is an event that closes a transaction. */
784 static inline bool
785 is_closing_event(int32 event)
786 {
787 	return (event & (TRANSACTION_ABORTED | TRANSACTION_ENDED)) != 0;
788 }
789 
790 
791 static inline bool
792 is_written_event(int32 event)
793 {
794 	return (event & TRANSACTION_WRITTEN) != 0;
795 }
796 
797 
798 /*!	From the specified \a notification, it will remove the lowest pending
799 	event, and return that one in \a _event.
800 	If there is no pending event anymore, it will return \c false.
801 */
802 static bool
803 get_next_pending_event(cache_notification* notification, int32* _event)
804 {
805 	for (int32 eventMask = 1; eventMask <= TRANSACTION_IDLE; eventMask <<= 1) {
806 		int32 pending = atomic_and(&notification->events_pending,
807 			~eventMask);
808 
809 		bool more = (pending & ~eventMask) != 0;
810 
811 		if ((pending & eventMask) != 0) {
812 			*_event = eventMask;
813 			return more;
814 		}
815 	}
816 
817 	return false;
818 }
819 
820 
/*!	Delivers all pending notifications of the specified \a cache by calling
	their hooks, one event at a time.
	sCachesLock must be held by the caller. sNotificationsLock is acquired
	internally, and released around each hook invocation.
*/
static void
flush_pending_notifications(block_cache* cache)
{
	ASSERT_LOCKED_MUTEX(&sCachesLock);

	while (true) {
		// Scoped to a single iteration: the lock is released at the end of
		// each pass, and on return.
		MutexLocker locker(sNotificationsLock);

		cache_notification* notification = cache->pending_notifications.Head();
		if (notification == NULL)
			return;

		bool deleteAfterEvent = false;
		int32 event = -1;
			// stays negative if no event was pending
		if (!get_next_pending_event(notification, &event)) {
			// remove the notification if this was the last pending event
			cache->pending_notifications.Remove(notification);
			deleteAfterEvent = notification->delete_after_event;
		}

		if (event >= 0) {
			// Notify listener, we need to copy the notification, as it might
			// be removed when we unlock the list.
			cache_notification copy = *notification;
			locker.Unlock();

			copy.hook(copy.transaction_id, event, copy.data);

			locker.Lock();
		}

		// Uses the flag sampled before the hook was called
		if (deleteAfterEvent)
			delete notification;
	}
}
856 
857 
858 /*!	Flushes all pending notifications by calling the appropriate hook
859 	functions.
860 	Must not be called with a cache lock held.
861 */
862 static void
863 flush_pending_notifications()
864 {
865 	MutexLocker _(sCachesLock);
866 
867 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
868 	while (iterator.HasNext()) {
869 		block_cache* cache = iterator.Next();
870 
871 		flush_pending_notifications(cache);
872 	}
873 }
874 
875 
876 /*!	Initializes the \a notification as specified. */
877 static void
878 set_notification(cache_transaction* transaction,
879 	cache_notification &notification, int32 events,
880 	transaction_notification_hook hook, void* data)
881 {
882 	notification.transaction_id = transaction != NULL ? transaction->id : -1;
883 	notification.events_pending = 0;
884 	notification.events = events;
885 	notification.hook = hook;
886 	notification.data = data;
887 	notification.delete_after_event = false;
888 }
889 
890 
891 /*!	Makes sure the notification is deleted. It either deletes it directly,
892 	when possible, or marks it for deletion if the notification is pending.
893 */
894 static void
895 delete_notification(cache_notification* notification)
896 {
897 	MutexLocker locker(sNotificationsLock);
898 
899 	if (notification->events_pending != 0)
900 		notification->delete_after_event = true;
901 	else
902 		delete notification;
903 }
904 
905 
906 /*!	Adds the notification to the pending notifications list, or, if it's
907 	already part of it, updates its events_pending field.
908 	Also marks the notification to be deleted if \a deleteNotification
909 	is \c true.
910 	Triggers the notifier thread to run.
911 */
912 static void
913 add_notification(block_cache* cache, cache_notification* notification,
914 	int32 event, bool deleteNotification)
915 {
916 	if (notification->hook == NULL)
917 		return;
918 
919 	int32 pending = atomic_or(&notification->events_pending, event);
920 	if (pending == 0) {
921 		// not yet part of the notification list
922 		MutexLocker locker(sNotificationsLock);
923 		if (deleteNotification)
924 			notification->delete_after_event = true;
925 		cache->pending_notifications.Add(notification);
926 	} else if (deleteNotification) {
927 		// we might need to delete it ourselves if we're late
928 		delete_notification(notification);
929 	}
930 
931 	release_sem_etc(sEventSemaphore, 1, B_DO_NOT_RESCHEDULE);
932 		// We're probably still holding some locks that makes rescheduling
933 		// not a good idea at this point.
934 }
935 
936 
937 /*!	Notifies all interested listeners of this transaction about the \a event.
938 	If \a event is a closing event (ie. TRANSACTION_ENDED, and
939 	TRANSACTION_ABORTED), all listeners except those listening to
940 	TRANSACTION_WRITTEN will be removed.
941 */
942 static void
943 notify_transaction_listeners(block_cache* cache, cache_transaction* transaction,
944 	int32 event)
945 {
946 	T(Action("notify", cache, transaction));
947 
948 	bool isClosing = is_closing_event(event);
949 	bool isWritten = is_written_event(event);
950 
951 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
952 	while (iterator.HasNext()) {
953 		cache_listener* listener = iterator.Next();
954 
955 		bool remove = (isClosing && !is_written_event(listener->events))
956 			|| (isWritten && is_written_event(listener->events));
957 		if (remove)
958 			iterator.Remove();
959 
960 		if ((listener->events & event) != 0)
961 			add_notification(cache, listener, event, remove);
962 		else if (remove)
963 			delete_notification(listener);
964 	}
965 }
966 
967 
968 /*!	Removes and deletes all listeners that are still monitoring this
969 	transaction.
970 */
971 static void
972 remove_transaction_listeners(block_cache* cache, cache_transaction* transaction)
973 {
974 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
975 	while (iterator.HasNext()) {
976 		cache_listener* listener = iterator.Next();
977 		iterator.Remove();
978 
979 		delete_notification(listener);
980 	}
981 }
982 
983 
984 static status_t
985 add_transaction_listener(block_cache* cache, cache_transaction* transaction,
986 	int32 events, transaction_notification_hook hookFunction, void* data)
987 {
988 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
989 	while (iterator.HasNext()) {
990 		cache_listener* listener = iterator.Next();
991 
992 		if (listener->data == data && listener->hook == hookFunction) {
993 			// this listener already exists, just update it
994 			listener->events |= events;
995 			return B_OK;
996 		}
997 	}
998 
999 	cache_listener* listener = new cache_listener;
1000 	if (listener == NULL)
1001 		return B_NO_MEMORY;
1002 
1003 	set_notification(transaction, *listener, events, hookFunction, data);
1004 	transaction->listeners.Add(listener);
1005 	return B_OK;
1006 }
1007 
1008 
1009 //	#pragma mark - private transaction
1010 
1011 
1012 cache_transaction::cache_transaction()
1013 {
1014 	num_blocks = 0;
1015 	main_num_blocks = 0;
1016 	sub_num_blocks = 0;
1017 	first_block = NULL;
1018 	open = true;
1019 	has_sub_transaction = false;
1020 	last_used = system_time();
1021 	busy_writing_count = 0;
1022 }
1023 
1024 
1025 static void
1026 delete_transaction(block_cache* cache, cache_transaction* transaction)
1027 {
1028 	if (cache->last_transaction == transaction)
1029 		cache->last_transaction = NULL;
1030 
1031 	remove_transaction_listeners(cache, transaction);
1032 	delete transaction;
1033 }
1034 
1035 
1036 static cache_transaction*
1037 lookup_transaction(block_cache* cache, int32 id)
1038 {
1039 	return cache->transaction_hash->Lookup(id);
1040 }
1041 
1042 
1043 size_t TransactionHash::Hash(cache_transaction* transaction) const
1044 {
1045 	return transaction->id;
1046 }
1047 
1048 
1049 bool TransactionHash::Compare(int32 key, cache_transaction* transaction) const
1050 {
1051 	return transaction->id == key;
1052 }
1053 
1054 
1055 cache_transaction*& TransactionHash::GetLink(cache_transaction* value) const
1056 {
1057 	return value->next;
1058 }
1059 
1060 
1061 /*!	Writes back any changes made to blocks in \a transaction that are still
1062 	part of a previous transacton.
1063 */
1064 static status_t
1065 write_blocks_in_previous_transaction(block_cache* cache,
1066 	cache_transaction* transaction)
1067 {
1068 	BlockWriter writer(cache);
1069 
1070 	cached_block* block = transaction->first_block;
1071 	for (; block != NULL; block = block->transaction_next) {
1072 		if (block->previous_transaction != NULL) {
1073 			// need to write back pending changes
1074 			writer.Add(block);
1075 		}
1076 	}
1077 
1078 	return writer.Write();
1079 }
1080 
1081 
1082 //	#pragma mark - cached_block
1083 
1084 
1085 bool
1086 cached_block::CanBeWritten() const
1087 {
1088 	return !busy_writing && !busy_reading
1089 		&& (previous_transaction != NULL
1090 			|| (transaction == NULL && is_dirty && !is_writing));
1091 }
1092 
1093 
1094 //	#pragma mark - BlockWriter
1095 
1096 
/*!	Creates an empty writer for the \a cache that accepts at most \a max
	blocks. The block array initially uses the embedded fBuffer, hence the
	initial capacity of kBufferSize.
*/
BlockWriter::BlockWriter(block_cache* cache, size_t max)
	:
	fCache(cache),
	fBlocks(fBuffer),
	fCount(0),
	fTotal(0),
	fCapacity(kBufferSize),
	fMax(max),
	fStatus(B_OK),
	fDeletedTransaction(false)
{
}
1109 
1110 
1111 BlockWriter::~BlockWriter()
1112 {
1113 	if (fBlocks != fBuffer)
1114 		free(fBlocks);
1115 }
1116 
1117 
1118 /*!	Adds the specified block to the to be written array. If no more blocks can
1119 	be added, false is returned, otherwise true.
1120 */
1121 bool
1122 BlockWriter::Add(cached_block* block, cache_transaction* transaction)
1123 {
1124 	ASSERT(block->CanBeWritten());
1125 
1126 	if (fTotal == fMax)
1127 		return false;
1128 
1129 	if (fCount >= fCapacity) {
1130 		// Enlarge array if necessary
1131 		cached_block** newBlocks;
1132 		size_t newCapacity = max_c(256, fCapacity * 2);
1133 		if (fBlocks == fBuffer)
1134 			newBlocks = (cached_block**)malloc(newCapacity * sizeof(void*));
1135 		else {
1136 			newBlocks = (cached_block**)realloc(fBlocks,
1137 				newCapacity * sizeof(void*));
1138 		}
1139 
1140 		if (newBlocks == NULL) {
1141 			// Allocating a larger array failed - we need to write back what
1142 			// we have synchronously now (this will also clear the array)
1143 			Write(transaction, false);
1144 		} else {
1145 			if (fBlocks == fBuffer)
1146 				memcpy(newBlocks, fBuffer, kBufferSize * sizeof(void*));
1147 
1148 			fBlocks = newBlocks;
1149 			fCapacity = newCapacity;
1150 		}
1151 	}
1152 
1153 	fBlocks[fCount++] = block;
1154 	fTotal++;
1155 	block->busy_writing = true;
1156 	fCache->busy_writing_count++;
1157 	if (block->previous_transaction != NULL)
1158 		block->previous_transaction->busy_writing_count++;
1159 
1160 	return true;
1161 }
1162 
1163 
1164 /*!	Adds all blocks of the specified transaction to the to be written array.
1165 	If no more blocks can be added, false is returned, otherwise true.
1166 */
1167 bool
1168 BlockWriter::Add(cache_transaction* transaction, bool& hasLeftOvers)
1169 {
1170 	ASSERT(!transaction->open);
1171 
1172 	if (transaction->busy_writing_count != 0) {
1173 		hasLeftOvers = true;
1174 		return true;
1175 	}
1176 
1177 	hasLeftOvers = false;
1178 
1179 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
1180 	while (cached_block* block = blockIterator.Next()) {
1181 		if (!block->CanBeWritten()) {
1182 			// This block was already part of a previous transaction within this
1183 			// writer
1184 			hasLeftOvers = true;
1185 			continue;
1186 		}
1187 		if (!Add(block, transaction))
1188 			return false;
1189 
1190 		if (DeletedTransaction())
1191 			break;
1192 	}
1193 
1194 	return true;
1195 }
1196 
1197 
1198 /*! Cache must be locked when calling this method, but it will be unlocked
1199 	while the blocks are written back.
1200 */
1201 status_t
1202 BlockWriter::Write(cache_transaction* transaction, bool canUnlock)
1203 {
1204 	if (fCount == 0)
1205 		return B_OK;
1206 
1207 	if (canUnlock)
1208 		mutex_unlock(&fCache->lock);
1209 
1210 	// Sort blocks in their on-disk order, so we can merge consecutive writes.
1211 	qsort(fBlocks, fCount, sizeof(void*), &_CompareBlocks);
1212 	fDeletedTransaction = false;
1213 
1214 	bigtime_t start = system_time();
1215 
1216 	for (uint32 i = 0; i < fCount; i++) {
1217 		uint32 blocks = 1;
1218 		for (; (i + blocks) < fCount && blocks < IOV_MAX; blocks++) {
1219 			const uint32 j = i + blocks;
1220 			if (fBlocks[j]->block_number != (fBlocks[j - 1]->block_number + 1))
1221 				break;
1222 		}
1223 
1224 		status_t status = _WriteBlocks(fBlocks + i, blocks);
1225 		if (status != B_OK) {
1226 			// propagate to global error handling
1227 			if (fStatus == B_OK)
1228 				fStatus = status;
1229 
1230 			for (uint32 j = i; j < (i + blocks); j++) {
1231 				_UnmarkWriting(fBlocks[j]);
1232 				fBlocks[j] = NULL;
1233 					// This block will not be marked clean
1234 			}
1235 		}
1236 
1237 		i += (blocks - 1);
1238 	}
1239 
1240 	bigtime_t finish = system_time();
1241 
1242 	if (canUnlock)
1243 		mutex_lock(&fCache->lock);
1244 
1245 	if (fStatus == B_OK && fCount >= 8) {
1246 		fCache->last_block_write = finish;
1247 		fCache->last_block_write_duration = (fCache->last_block_write - start)
1248 			/ fCount;
1249 	}
1250 
1251 	for (uint32 i = 0; i < fCount; i++)
1252 		_BlockDone(fBlocks[i], transaction);
1253 
1254 	fCount = 0;
1255 	return fStatus;
1256 }
1257 
1258 
1259 /*!	Writes the specified \a block back to disk. It will always only write back
1260 	the oldest change of the block if it is part of more than one transaction.
1261 	It will automatically send out TRANSACTION_WRITTEN notices, as well as
1262 	delete transactions when they are no longer used, and \a deleteTransaction
1263 	is \c true.
1264 */
1265 /*static*/ status_t
1266 BlockWriter::WriteBlock(block_cache* cache, cached_block* block)
1267 {
1268 	BlockWriter writer(cache);
1269 
1270 	writer.Add(block);
1271 	return writer.Write();
1272 }
1273 
1274 
1275 void*
1276 BlockWriter::_Data(cached_block* block) const
1277 {
1278 	return block->previous_transaction != NULL && block->original_data != NULL
1279 		? block->original_data : block->current_data;
1280 		// We first need to write back changes from previous transactions
1281 }
1282 
1283 
1284 status_t
1285 BlockWriter::_WriteBlocks(cached_block** blocks, uint32 count)
1286 {
1287 	const size_t blockSize = fCache->block_size;
1288 
1289 	BStackOrHeapArray<iovec, 8> vecs(count);
1290 	for (uint32 i = 0; i < count; i++) {
1291 		cached_block* block = blocks[i];
1292 		ASSERT(block->busy_writing);
1293 		ASSERT(i == 0 || block->block_number == (blocks[i - 1]->block_number + 1));
1294 
1295 		TRACE(("BlockWriter::_WriteBlocks(block %" B_PRIdOFF ", count %" B_PRIu32 ")\n",
1296 			block->block_number, count));
1297 		TB(Write(fCache, block));
1298 		TB2(BlockData(fCache, block, "before write"));
1299 
1300 		vecs[i].iov_base = _Data(block);
1301 		vecs[i].iov_len = blockSize;
1302 	}
1303 
1304 	ssize_t written = writev_pos(fCache->fd,
1305 		blocks[0]->block_number * blockSize, vecs, count);
1306 
1307 	if (written != (ssize_t)(blockSize * count)) {
1308 		TB(Error(fCache, block->block_number, "write failed", written));
1309 		TRACE_ALWAYS("could not write back %" B_PRIu32 " blocks (start block %" B_PRIdOFF "): %s\n",
1310 			count, blocks[0]->block_number, strerror(errno));
1311 		if (written < 0)
1312 			return errno;
1313 
1314 		return B_IO_ERROR;
1315 	}
1316 
1317 	return B_OK;
1318 }
1319 
1320 
1321 void
1322 BlockWriter::_BlockDone(cached_block* block,
1323 	cache_transaction* transaction)
1324 {
1325 	if (block == NULL) {
1326 		// An error occured when trying to write this block
1327 		return;
1328 	}
1329 
1330 	if (fCache->num_dirty_blocks > 0)
1331 		fCache->num_dirty_blocks--;
1332 
1333 	if (_Data(block) == block->current_data)
1334 		block->is_dirty = false;
1335 
1336 	_UnmarkWriting(block);
1337 
1338 	cache_transaction* previous = block->previous_transaction;
1339 	if (previous != NULL) {
1340 		previous->blocks.Remove(block);
1341 		block->previous_transaction = NULL;
1342 
1343 		if (block->original_data != NULL && block->transaction == NULL) {
1344 			// This block is not part of a transaction, so it does not need
1345 			// its original pointer anymore.
1346 			fCache->Free(block->original_data);
1347 			block->original_data = NULL;
1348 		}
1349 
1350 		// Has the previous transaction been finished with that write?
1351 		if (--previous->num_blocks == 0) {
1352 			TRACE(("cache transaction %" B_PRId32 " finished!\n", previous->id));
1353 			T(Action("written", fCache, previous));
1354 
1355 			notify_transaction_listeners(fCache, previous,
1356 				TRANSACTION_WRITTEN);
1357 
1358 			if (transaction != NULL) {
1359 				// This function is called while iterating transaction_hash. We
1360 				// use RemoveUnchecked so the iterator is still valid. A regular
1361 				// Remove can trigger a resize of the hash table which would
1362 				// result in the linked items in the table changing order.
1363 				fCache->transaction_hash->RemoveUnchecked(transaction);
1364 			} else
1365 				fCache->transaction_hash->Remove(previous);
1366 
1367 			delete_transaction(fCache, previous);
1368 			fDeletedTransaction = true;
1369 		}
1370 	}
1371 	if (block->transaction == NULL && block->ref_count == 0 && !block->unused) {
1372 		// the block is no longer used
1373 		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1374 		block->unused = true;
1375 		fCache->unused_blocks.Add(block);
1376 		fCache->unused_block_count++;
1377 	}
1378 
1379 	TB2(BlockData(fCache, block, "after write"));
1380 }
1381 
1382 
1383 void
1384 BlockWriter::_UnmarkWriting(cached_block* block)
1385 {
1386 	block->busy_writing = false;
1387 	if (block->previous_transaction != NULL)
1388 		block->previous_transaction->busy_writing_count--;
1389 	fCache->busy_writing_count--;
1390 
1391 	if ((fCache->busy_writing_waiters && fCache->busy_writing_count == 0)
1392 		|| block->busy_writing_waiters) {
1393 		fCache->busy_writing_waiters = false;
1394 		block->busy_writing_waiters = false;
1395 		fCache->busy_writing_condition.NotifyAll();
1396 	}
1397 }
1398 
1399 
1400 /*static*/ int
1401 BlockWriter::_CompareBlocks(const void* _blockA, const void* _blockB)
1402 {
1403 	cached_block* blockA = *(cached_block**)_blockA;
1404 	cached_block* blockB = *(cached_block**)_blockB;
1405 
1406 	off_t diff = blockA->block_number - blockB->block_number;
1407 	if (diff > 0)
1408 		return 1;
1409 
1410 	return diff < 0 ? -1 : 0;
1411 }
1412 
1413 
1414 //	#pragma mark - block_cache
1415 
1416 
1417 block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
1418 		bool readOnly)
1419 	:
1420 	hash(NULL),
1421 	fd(_fd),
1422 	max_blocks(numBlocks),
1423 	block_size(blockSize),
1424 	next_transaction_id(1),
1425 	last_transaction(NULL),
1426 	transaction_hash(NULL),
1427 	buffer_cache(NULL),
1428 	unused_block_count(0),
1429 	busy_reading_count(0),
1430 	busy_reading_waiters(false),
1431 	busy_writing_count(0),
1432 	busy_writing_waiters(0),
1433 	last_block_write(0),
1434 	last_block_write_duration(0),
1435 	num_dirty_blocks(0),
1436 	read_only(readOnly)
1437 {
1438 }
1439 
1440 
1441 /*! Should be called with the cache's lock held. */
1442 block_cache::~block_cache()
1443 {
1444 	unregister_low_resource_handler(&_LowMemoryHandler, this);
1445 
1446 	delete transaction_hash;
1447 	delete hash;
1448 
1449 	delete_object_cache(buffer_cache);
1450 
1451 	mutex_destroy(&lock);
1452 }
1453 
1454 
1455 status_t
1456 block_cache::Init()
1457 {
1458 	busy_reading_condition.Init(this, "cache block busy_reading");
1459 	busy_writing_condition.Init(this, "cache block busy writing");
1460 	condition_variable.Init(this, "cache transaction sync");
1461 	mutex_init(&lock, "block cache");
1462 
1463 	buffer_cache = create_object_cache_etc("block cache buffers", block_size,
1464 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1465 	if (buffer_cache == NULL)
1466 		return B_NO_MEMORY;
1467 
1468 	hash = new BlockTable();
1469 	if (hash == NULL || hash->Init(1024) != B_OK)
1470 		return B_NO_MEMORY;
1471 
1472 	transaction_hash = new(std::nothrow) TransactionTable();
1473 	if (transaction_hash == NULL || transaction_hash->Init(16) != B_OK)
1474 		return B_NO_MEMORY;
1475 
1476 	return register_low_resource_handler(&_LowMemoryHandler, this,
1477 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1478 			| B_KERNEL_RESOURCE_ADDRESS_SPACE, 0);
1479 }
1480 
1481 
1482 void
1483 block_cache::Free(void* buffer)
1484 {
1485 	if (buffer != NULL)
1486 		object_cache_free(buffer_cache, buffer, 0);
1487 }
1488 
1489 
1490 void*
1491 block_cache::Allocate()
1492 {
1493 	void* block = object_cache_alloc(buffer_cache, 0);
1494 	if (block != NULL)
1495 		return block;
1496 
1497 	// recycle existing before allocating a new one
1498 	RemoveUnusedBlocks(100);
1499 
1500 	return object_cache_alloc(buffer_cache, 0);
1501 }
1502 
1503 
1504 void
1505 block_cache::FreeBlock(cached_block* block)
1506 {
1507 	Free(block->current_data);
1508 
1509 	if (block->original_data != NULL || block->parent_data != NULL) {
1510 		panic("block_cache::FreeBlock(): %" B_PRIdOFF ", original %p, parent %p\n",
1511 			block->block_number, block->original_data, block->parent_data);
1512 	}
1513 
1514 #if BLOCK_CACHE_DEBUG_CHANGED
1515 	Free(block->compare);
1516 #endif
1517 
1518 	object_cache_free(sBlockCache, block, 0);
1519 }
1520 
1521 
1522 /*! Allocates a new block for \a blockNumber, ready for use */
1523 cached_block*
1524 block_cache::NewBlock(off_t blockNumber)
1525 {
1526 	cached_block* block = NULL;
1527 
1528 	if (low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1529 			| B_KERNEL_RESOURCE_ADDRESS_SPACE) != B_NO_LOW_RESOURCE) {
1530 		// recycle existing instead of allocating a new one
1531 		block = _GetUnusedBlock();
1532 	}
1533 	if (block == NULL) {
1534 		block = (cached_block*)object_cache_alloc(sBlockCache, 0);
1535 		if (block != NULL) {
1536 			block->current_data = Allocate();
1537 			if (block->current_data == NULL) {
1538 				object_cache_free(sBlockCache, block, 0);
1539 				return NULL;
1540 			}
1541 		} else {
1542 			TB(Error(this, blockNumber, "allocation failed"));
1543 			TRACE_ALWAYS("block allocation failed, unused list is %sempty.\n",
1544 				unused_blocks.IsEmpty() ? "" : "not ");
1545 
1546 			// allocation failed, try to reuse an unused block
1547 			block = _GetUnusedBlock();
1548 			if (block == NULL) {
1549 				TB(Error(this, blockNumber, "get unused failed"));
1550 				FATAL(("could not allocate block!\n"));
1551 				return NULL;
1552 			}
1553 		}
1554 	}
1555 
1556 	block->block_number = blockNumber;
1557 	block->ref_count = 0;
1558 	block->last_accessed = 0;
1559 	block->transaction_next = NULL;
1560 	block->transaction = block->previous_transaction = NULL;
1561 	block->original_data = NULL;
1562 	block->parent_data = NULL;
1563 	block->busy_reading = false;
1564 	block->busy_writing = false;
1565 	block->is_writing = false;
1566 	block->is_dirty = false;
1567 	block->unused = false;
1568 	block->discard = false;
1569 	block->busy_reading_waiters = false;
1570 	block->busy_writing_waiters = false;
1571 #if BLOCK_CACHE_DEBUG_CHANGED
1572 	block->compare = NULL;
1573 #endif
1574 
1575 	return block;
1576 }
1577 
1578 
1579 void
1580 block_cache::FreeBlockParentData(cached_block* block)
1581 {
1582 	ASSERT(block->parent_data != NULL);
1583 	if (block->parent_data != block->current_data)
1584 		Free(block->parent_data);
1585 	block->parent_data = NULL;
1586 }
1587 
1588 
1589 void
1590 block_cache::RemoveUnusedBlocks(int32 count, int32 minSecondsOld)
1591 {
1592 	TRACE(("block_cache: remove up to %" B_PRId32 " unused blocks\n", count));
1593 
1594 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1595 			cached_block* block = iterator.Next();) {
1596 		if (minSecondsOld >= block->LastAccess()) {
1597 			// The list is sorted by last access
1598 			break;
1599 		}
1600 		if (block->busy_reading || block->busy_writing)
1601 			continue;
1602 
1603 		TB(Flush(this, block));
1604 		TRACE(("  remove block %" B_PRIdOFF ", last accessed %" B_PRId32 "\n",
1605 			block->block_number, block->last_accessed));
1606 
1607 		// this can only happen if no transactions are used
1608 		if (block->is_dirty && !block->discard) {
1609 			if (block->busy_writing)
1610 				continue;
1611 
1612 			BlockWriter::WriteBlock(this, block);
1613 		}
1614 
1615 		// remove block from lists
1616 		iterator.Remove();
1617 		unused_block_count--;
1618 		RemoveBlock(block);
1619 
1620 		if (--count <= 0)
1621 			break;
1622 	}
1623 }
1624 
1625 
1626 void
1627 block_cache::RemoveBlock(cached_block* block)
1628 {
1629 	hash->Remove(block);
1630 	FreeBlock(block);
1631 }
1632 
1633 
1634 /*!	Discards the block from a transaction (this method must not be called
1635 	for blocks not part of a transaction).
1636 */
1637 void
1638 block_cache::DiscardBlock(cached_block* block)
1639 {
1640 	ASSERT(block->discard);
1641 	ASSERT(block->previous_transaction == NULL);
1642 
1643 	if (block->parent_data != NULL)
1644 		FreeBlockParentData(block);
1645 
1646 	if (block->original_data != NULL) {
1647 		Free(block->original_data);
1648 		block->original_data = NULL;
1649 	}
1650 
1651 	RemoveBlock(block);
1652 }
1653 
1654 
1655 void
1656 block_cache::_LowMemoryHandler(void* data, uint32 resources, int32 level)
1657 {
1658 	TRACE(("block_cache: low memory handler called with level %" B_PRId32 "\n", level));
1659 
1660 	// free some blocks according to the low memory state
1661 	// (if there is enough memory left, we don't free any)
1662 
1663 	block_cache* cache = (block_cache*)data;
1664 	if (cache->unused_block_count <= 1)
1665 		return;
1666 
1667 	int32 free = 0;
1668 	int32 secondsOld = 0;
1669 	switch (level) {
1670 		case B_NO_LOW_RESOURCE:
1671 			return;
1672 		case B_LOW_RESOURCE_NOTE:
1673 			free = cache->unused_block_count / 4;
1674 			secondsOld = 120;
1675 			break;
1676 		case B_LOW_RESOURCE_WARNING:
1677 			free = cache->unused_block_count / 2;
1678 			secondsOld = 10;
1679 			break;
1680 		case B_LOW_RESOURCE_CRITICAL:
1681 			free = cache->unused_block_count - 1;
1682 			secondsOld = 0;
1683 			break;
1684 	}
1685 
1686 	MutexLocker locker(&cache->lock);
1687 
1688 	if (!locker.IsLocked()) {
1689 		// If our block_cache were deleted, it could be that we had
1690 		// been called before that deletion went through, therefore,
1691 		// acquiring its lock might fail.
1692 		return;
1693 	}
1694 
1695 #ifdef TRACE_BLOCK_CACHE
1696 	uint32 oldUnused = cache->unused_block_count;
1697 #endif
1698 
1699 	cache->RemoveUnusedBlocks(free, secondsOld);
1700 
1701 	TRACE(("block_cache::_LowMemoryHandler(): %p: unused: %" B_PRIu32 " -> %" B_PRIu32 "\n",
1702 		cache, oldUnused, cache->unused_block_count));
1703 }
1704 
1705 
1706 cached_block*
1707 block_cache::_GetUnusedBlock()
1708 {
1709 	TRACE(("block_cache: get unused block\n"));
1710 
1711 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1712 			cached_block* block = iterator.Next();) {
1713 		TB(Flush(this, block, true));
1714 		// this can only happen if no transactions are used
1715 		if (block->is_dirty && !block->busy_writing && !block->discard)
1716 			BlockWriter::WriteBlock(this, block);
1717 
1718 		// remove block from lists
1719 		iterator.Remove();
1720 		unused_block_count--;
1721 		hash->Remove(block);
1722 
1723 		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1724 		block->unused = false;
1725 
1726 		// TODO: see if compare data is handled correctly here!
1727 #if BLOCK_CACHE_DEBUG_CHANGED
1728 		if (block->compare != NULL)
1729 			Free(block->compare);
1730 #endif
1731 		return block;
1732 	}
1733 
1734 	return NULL;
1735 }
1736 
1737 
1738 //	#pragma mark - private block functions
1739 
1740 
1741 /*!	Cache must be locked.
1742 */
1743 static void
1744 mark_block_busy_reading(block_cache* cache, cached_block* block)
1745 {
1746 	block->busy_reading = true;
1747 	cache->busy_reading_count++;
1748 }
1749 
1750 
1751 /*!	Cache must be locked.
1752 */
1753 static void
1754 mark_block_unbusy_reading(block_cache* cache, cached_block* block)
1755 {
1756 	block->busy_reading = false;
1757 	cache->busy_reading_count--;
1758 
1759 	if ((cache->busy_reading_waiters && cache->busy_reading_count == 0)
1760 		|| block->busy_reading_waiters) {
1761 		cache->busy_reading_waiters = false;
1762 		block->busy_reading_waiters = false;
1763 		cache->busy_reading_condition.NotifyAll();
1764 	}
1765 }
1766 
1767 
1768 /*!	Cache must be locked.
1769 */
1770 static void
1771 wait_for_busy_reading_block(block_cache* cache, cached_block* block)
1772 {
1773 	while (block->busy_reading) {
1774 		// wait for at least the specified block to be read in
1775 		ConditionVariableEntry entry;
1776 		cache->busy_reading_condition.Add(&entry);
1777 		block->busy_reading_waiters = true;
1778 
1779 		mutex_unlock(&cache->lock);
1780 
1781 		entry.Wait();
1782 
1783 		mutex_lock(&cache->lock);
1784 	}
1785 }
1786 
1787 
1788 /*!	Cache must be locked.
1789 */
1790 static void
1791 wait_for_busy_reading_blocks(block_cache* cache)
1792 {
1793 	while (cache->busy_reading_count != 0) {
1794 		// wait for all blocks to be read in
1795 		ConditionVariableEntry entry;
1796 		cache->busy_reading_condition.Add(&entry);
1797 		cache->busy_reading_waiters = true;
1798 
1799 		mutex_unlock(&cache->lock);
1800 
1801 		entry.Wait();
1802 
1803 		mutex_lock(&cache->lock);
1804 	}
1805 }
1806 
1807 
1808 /*!	Cache must be locked.
1809 */
1810 static void
1811 wait_for_busy_writing_block(block_cache* cache, cached_block* block)
1812 {
1813 	while (block->busy_writing) {
1814 		// wait for all blocks to be written back
1815 		ConditionVariableEntry entry;
1816 		cache->busy_writing_condition.Add(&entry);
1817 		block->busy_writing_waiters = true;
1818 
1819 		mutex_unlock(&cache->lock);
1820 
1821 		entry.Wait();
1822 
1823 		mutex_lock(&cache->lock);
1824 	}
1825 }
1826 
1827 
1828 /*!	Cache must be locked.
1829 */
1830 static void
1831 wait_for_busy_writing_blocks(block_cache* cache)
1832 {
1833 	while (cache->busy_writing_count != 0) {
1834 		// wait for all blocks to be written back
1835 		ConditionVariableEntry entry;
1836 		cache->busy_writing_condition.Add(&entry);
1837 		cache->busy_writing_waiters = true;
1838 
1839 		mutex_unlock(&cache->lock);
1840 
1841 		entry.Wait();
1842 
1843 		mutex_lock(&cache->lock);
1844 	}
1845 }
1846 
1847 
1848 /*!	Removes a reference from the specified \a block. If this was the last
1849 	reference, the block is moved into the unused list.
1850 	In low memory situations, it will also free some blocks from that list,
1851 	but not necessarily the \a block it just released.
1852 */
1853 static void
1854 put_cached_block(block_cache* cache, cached_block* block)
1855 {
1856 #if BLOCK_CACHE_DEBUG_CHANGED
1857 	if (!block->is_dirty && block->compare != NULL
1858 		&& memcmp(block->current_data, block->compare, cache->block_size)) {
1859 		TRACE_ALWAYS("new block:\n");
1860 		dump_block((const char*)block->current_data, 256, "  ");
1861 		TRACE_ALWAYS("unchanged block:\n");
1862 		dump_block((const char*)block->compare, 256, "  ");
1863 		BlockWriter::WriteBlock(cache, block);
1864 		panic("block_cache: supposed to be clean block was changed!\n");
1865 
1866 		cache->Free(block->compare);
1867 		block->compare = NULL;
1868 	}
1869 #endif
1870 	TB(Put(cache, block));
1871 
1872 	if (block->ref_count < 1) {
1873 		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
1874 		return;
1875 	}
1876 
1877 	if (--block->ref_count == 0
1878 		&& block->transaction == NULL && block->previous_transaction == NULL) {
1879 		// This block is not used anymore, and not part of any transaction
1880 		block->is_writing = false;
1881 
1882 		if (block->discard) {
1883 			cache->RemoveBlock(block);
1884 		} else {
1885 			// put this block in the list of unused blocks
1886 			ASSERT(!block->unused);
1887 			block->unused = true;
1888 
1889 			ASSERT(block->original_data == NULL && block->parent_data == NULL);
1890 			cache->unused_blocks.Add(block);
1891 			cache->unused_block_count++;
1892 		}
1893 	}
1894 }
1895 
1896 
1897 static void
1898 put_cached_block(block_cache* cache, off_t blockNumber)
1899 {
1900 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1901 		panic("put_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1902 			blockNumber, cache->max_blocks - 1);
1903 	}
1904 
1905 	cached_block* block = cache->hash->Lookup(blockNumber);
1906 	if (block != NULL)
1907 		put_cached_block(cache, block);
1908 	else {
1909 		TB(Error(cache, blockNumber, "put unknown"));
1910 	}
1911 }
1912 
1913 
1914 /*!	Retrieves the block \a blockNumber from the hash table, if it's already
1915 	there, or reads it from the disk.
1916 	You need to have the cache locked when calling this function.
1917 
1918 	\param _allocated tells you whether or not a new block has been allocated
1919 		to satisfy your request.
1920 	\param readBlock if \c false, the block will not be read in case it was
1921 		not already in the cache. The block you retrieve may contain random
1922 		data. If \c true, the cache will be temporarily unlocked while the
1923 		block is read in.
1924 */
1925 static status_t
1926 get_cached_block(block_cache* cache, off_t blockNumber, bool* _allocated,
1927 	bool readBlock, cached_block** _block)
1928 {
1929 	ASSERT_LOCKED_MUTEX(&cache->lock);
1930 
1931 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1932 		panic("get_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
1933 			blockNumber, cache->max_blocks - 1);
1934 		return B_BAD_VALUE;
1935 	}
1936 
1937 retry:
1938 	cached_block* block = cache->hash->Lookup(blockNumber);
1939 	*_allocated = false;
1940 
1941 	if (block == NULL) {
1942 		// put block into cache
1943 		block = cache->NewBlock(blockNumber);
1944 		if (block == NULL)
1945 			return B_NO_MEMORY;
1946 
1947 		cache->hash->Insert(block);
1948 		*_allocated = true;
1949 	} else if (block->busy_reading) {
1950 		// The block is currently busy_reading - wait and try again later
1951 		wait_for_busy_reading_block(cache, block);
1952 		goto retry;
1953 	}
1954 
1955 	if (block->unused) {
1956 		//TRACE(("remove block %" B_PRIdOFF " from unused\n", blockNumber));
1957 		block->unused = false;
1958 		cache->unused_blocks.Remove(block);
1959 		cache->unused_block_count--;
1960 	}
1961 
1962 	if (*_allocated && readBlock) {
1963 		// read block into cache
1964 		int32 blockSize = cache->block_size;
1965 
1966 		mark_block_busy_reading(cache, block);
1967 		mutex_unlock(&cache->lock);
1968 
1969 		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
1970 			block->current_data, blockSize);
1971 
1972 		mutex_lock(&cache->lock);
1973 		if (bytesRead < blockSize) {
1974 			cache->RemoveBlock(block);
1975 			TB(Error(cache, blockNumber, "read failed", bytesRead));
1976 
1977 			TRACE_ALWAYS("could not read block %" B_PRIdOFF ": bytesRead: %zd,"
1978 				" error: %s\n", blockNumber, bytesRead, strerror(errno));
1979 			return errno;
1980 		}
1981 		TB(Read(cache, block));
1982 
1983 		mark_block_unbusy_reading(cache, block);
1984 	}
1985 
1986 	block->ref_count++;
1987 	block->last_accessed = system_time() / 1000000L;
1988 
1989 	*_block = block;
1990 	return B_OK;
1991 }
1992 
1993 
1994 /*!	Returns the writable block data for the requested blockNumber.
1995 	If \a cleared is true, the block is not read from disk; an empty block
1996 	is returned.
1997 
1998 	This is the only method to insert a block into a transaction. It makes
1999 	sure that the previous block contents are preserved in that case.
2000 */
2001 static status_t
2002 get_writable_cached_block(block_cache* cache, off_t blockNumber, off_t base,
2003 	off_t length, int32 transactionID, bool cleared, void** _block)
2004 {
2005 	TRACE(("get_writable_cached_block(blockNumber = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
2006 		blockNumber, transactionID));
2007 
2008 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
2009 		panic("get_writable_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
2010 			blockNumber, cache->max_blocks - 1);
2011 		return B_BAD_VALUE;
2012 	}
2013 
2014 	bool allocated;
2015 	cached_block* block;
2016 	status_t status = get_cached_block(cache, blockNumber, &allocated,
2017 		!cleared, &block);
2018 	if (status != B_OK)
2019 		return status;
2020 
2021 	if (block->busy_writing)
2022 		wait_for_busy_writing_block(cache, block);
2023 
2024 	block->discard = false;
2025 
2026 	// if there is no transaction support, we just return the current block
2027 	if (transactionID == -1) {
2028 		if (cleared) {
2029 			mark_block_busy_reading(cache, block);
2030 			mutex_unlock(&cache->lock);
2031 
2032 			memset(block->current_data, 0, cache->block_size);
2033 
2034 			mutex_lock(&cache->lock);
2035 			mark_block_unbusy_reading(cache, block);
2036 		}
2037 
2038 		block->is_writing = true;
2039 
2040 		if (!block->is_dirty) {
2041 			cache->num_dirty_blocks++;
2042 			block->is_dirty = true;
2043 				// mark the block as dirty
2044 		}
2045 
2046 		TB(Get(cache, block));
2047 		*_block = block->current_data;
2048 		return B_OK;
2049 	}
2050 
2051 	cache_transaction* transaction = block->transaction;
2052 
2053 	if (transaction != NULL && transaction->id != transactionID) {
2054 		// TODO: we have to wait here until the other transaction is done.
2055 		//	Maybe we should even panic, since we can't prevent any deadlocks.
2056 		panic("get_writable_cached_block(): asked to get busy writable block "
2057 			"(transaction %" B_PRId32 ")\n", block->transaction->id);
2058 		put_cached_block(cache, block);
2059 		return B_BAD_VALUE;
2060 	}
2061 	if (transaction == NULL && transactionID != -1) {
2062 		// get new transaction
2063 		transaction = lookup_transaction(cache, transactionID);
2064 		if (transaction == NULL) {
2065 			panic("get_writable_cached_block(): invalid transaction %" B_PRId32 "!\n",
2066 				transactionID);
2067 			put_cached_block(cache, block);
2068 			return B_BAD_VALUE;
2069 		}
2070 		if (!transaction->open) {
2071 			panic("get_writable_cached_block(): transaction already done!\n");
2072 			put_cached_block(cache, block);
2073 			return B_BAD_VALUE;
2074 		}
2075 
2076 		block->transaction = transaction;
2077 
2078 		// attach the block to the transaction block list
2079 		block->transaction_next = transaction->first_block;
2080 		transaction->first_block = block;
2081 		transaction->num_blocks++;
2082 	}
2083 	if (transaction != NULL)
2084 		transaction->last_used = system_time();
2085 
2086 	bool wasUnchanged = block->original_data == NULL
2087 		|| block->previous_transaction != NULL;
2088 
2089 	if (!(allocated && cleared) && block->original_data == NULL) {
2090 		// we already have data, so we need to preserve it
2091 		block->original_data = cache->Allocate();
2092 		if (block->original_data == NULL) {
2093 			TB(Error(cache, blockNumber, "allocate original failed"));
2094 			FATAL(("could not allocate original_data\n"));
2095 			put_cached_block(cache, block);
2096 			return B_NO_MEMORY;
2097 		}
2098 
2099 		mark_block_busy_reading(cache, block);
2100 		mutex_unlock(&cache->lock);
2101 
2102 		memcpy(block->original_data, block->current_data, cache->block_size);
2103 
2104 		mutex_lock(&cache->lock);
2105 		mark_block_unbusy_reading(cache, block);
2106 	}
2107 	if (block->parent_data == block->current_data) {
2108 		// remember any previous contents for the parent transaction
2109 		block->parent_data = cache->Allocate();
2110 		if (block->parent_data == NULL) {
2111 			// TODO: maybe we should just continue the current transaction in
2112 			// this case...
2113 			TB(Error(cache, blockNumber, "allocate parent failed"));
2114 			FATAL(("could not allocate parent\n"));
2115 			put_cached_block(cache, block);
2116 			return B_NO_MEMORY;
2117 		}
2118 
2119 		mark_block_busy_reading(cache, block);
2120 		mutex_unlock(&cache->lock);
2121 
2122 		memcpy(block->parent_data, block->current_data, cache->block_size);
2123 
2124 		mutex_lock(&cache->lock);
2125 		mark_block_unbusy_reading(cache, block);
2126 
2127 		transaction->sub_num_blocks++;
2128 	} else if (transaction != NULL && transaction->has_sub_transaction
2129 		&& block->parent_data == NULL && wasUnchanged)
2130 		transaction->sub_num_blocks++;
2131 
2132 	if (cleared) {
2133 		mark_block_busy_reading(cache, block);
2134 		mutex_unlock(&cache->lock);
2135 
2136 		memset(block->current_data, 0, cache->block_size);
2137 
2138 		mutex_lock(&cache->lock);
2139 		mark_block_unbusy_reading(cache, block);
2140 	}
2141 
2142 	block->is_dirty = true;
2143 	TB(Get(cache, block));
2144 	TB2(BlockData(cache, block, "get writable"));
2145 
2146 	*_block = block->current_data;
2147 	return B_OK;
2148 }
2149 
2150 
2151 #if DEBUG_BLOCK_CACHE
2152 
2153 
2154 static void
2155 dump_block(cached_block* block)
2156 {
2157 	kprintf("%08lx %9" B_PRIdOFF " %08lx %08lx %08lx %5" B_PRId32 " %6" B_PRId32
2158 		" %c%c%c%c%c%c %08lx %08lx\n",
2159 		(addr_t)block, block->block_number,
2160 		(addr_t)block->current_data, (addr_t)block->original_data,
2161 		(addr_t)block->parent_data, block->ref_count, block->LastAccess(),
2162 		block->busy_reading ? 'r' : '-', block->busy_writing ? 'w' : '-',
2163 		block->is_writing ? 'W' : '-', block->is_dirty ? 'D' : '-',
2164 		block->unused ? 'U' : '-', block->discard ? 'D' : '-',
2165 		(addr_t)block->transaction,
2166 		(addr_t)block->previous_transaction);
2167 }
2168 
2169 
2170 static void
2171 dump_block_long(cached_block* block)
2172 {
2173 	kprintf("BLOCK %p\n", block);
2174 	kprintf(" current data:  %p\n", block->current_data);
2175 	kprintf(" original data: %p\n", block->original_data);
2176 	kprintf(" parent data:   %p\n", block->parent_data);
2177 #if BLOCK_CACHE_DEBUG_CHANGED
2178 	kprintf(" compare data:  %p\n", block->compare);
2179 #endif
2180 	kprintf(" ref_count:     %" B_PRId32 "\n", block->ref_count);
2181 	kprintf(" accessed:      %" B_PRId32 "\n", block->LastAccess());
2182 	kprintf(" flags:        ");
2183 	if (block->busy_reading)
2184 		kprintf(" busy_reading");
2185 	if (block->busy_writing)
2186 		kprintf(" busy_writing");
2187 	if (block->is_writing)
2188 		kprintf(" is-writing");
2189 	if (block->is_dirty)
2190 		kprintf(" is-dirty");
2191 	if (block->unused)
2192 		kprintf(" unused");
2193 	if (block->discard)
2194 		kprintf(" discard");
2195 	kprintf("\n");
2196 	if (block->transaction != NULL) {
2197 		kprintf(" transaction:   %p (%" B_PRId32 ")\n", block->transaction,
2198 			block->transaction->id);
2199 		if (block->transaction_next != NULL) {
2200 			kprintf(" next in transaction: %" B_PRIdOFF "\n",
2201 				block->transaction_next->block_number);
2202 		}
2203 	}
2204 	if (block->previous_transaction != NULL) {
2205 		kprintf(" previous transaction: %p (%" B_PRId32 ")\n",
2206 			block->previous_transaction,
2207 			block->previous_transaction->id);
2208 	}
2209 
2210 	set_debug_variable("_current", (addr_t)block->current_data);
2211 	set_debug_variable("_original", (addr_t)block->original_data);
2212 	set_debug_variable("_parent", (addr_t)block->parent_data);
2213 }
2214 
2215 
2216 static int
2217 dump_cached_block(int argc, char** argv)
2218 {
2219 	if (argc != 2) {
2220 		kprintf("usage: %s <block-address>\n", argv[0]);
2221 		return 0;
2222 	}
2223 
2224 	dump_block_long((struct cached_block*)(addr_t)parse_expression(argv[1]));
2225 	return 0;
2226 }
2227 
2228 
2229 static int
2230 dump_cache(int argc, char** argv)
2231 {
2232 	bool showTransactions = false;
2233 	bool showBlocks = false;
2234 	int32 i = 1;
2235 	while (argv[i] != NULL && argv[i][0] == '-') {
2236 		for (char* arg = &argv[i][1]; arg[0]; arg++) {
2237 			switch (arg[0]) {
2238 				case 'b':
2239 					showBlocks = true;
2240 					break;
2241 				case 't':
2242 					showTransactions = true;
2243 					break;
2244 				default:
2245 					print_debugger_command_usage(argv[0]);
2246 					return 0;
2247 			}
2248 		}
2249 		i++;
2250 	}
2251 
2252 	if (i >= argc) {
2253 		print_debugger_command_usage(argv[0]);
2254 		return 0;
2255 	}
2256 
2257 	block_cache* cache = (struct block_cache*)(addr_t)parse_expression(argv[i]);
2258 	if (cache == NULL) {
2259 		kprintf("invalid cache address\n");
2260 		return 0;
2261 	}
2262 
2263 	off_t blockNumber = -1;
2264 	if (i + 1 < argc) {
2265 		blockNumber = parse_expression(argv[i + 1]);
2266 		cached_block* block = cache->hash->Lookup(blockNumber);
2267 		if (block != NULL)
2268 			dump_block_long(block);
2269 		else
2270 			kprintf("block %" B_PRIdOFF " not found\n", blockNumber);
2271 		return 0;
2272 	}
2273 
2274 	kprintf("BLOCK CACHE: %p\n", cache);
2275 
2276 	kprintf(" fd:           %d\n", cache->fd);
2277 	kprintf(" max_blocks:   %" B_PRIdOFF "\n", cache->max_blocks);
2278 	kprintf(" block_size:   %zu\n", cache->block_size);
2279 	kprintf(" next_transaction_id: %" B_PRId32 "\n", cache->next_transaction_id);
2280 	kprintf(" buffer_cache: %p\n", cache->buffer_cache);
2281 	kprintf(" busy_reading: %" B_PRIu32 ", %s waiters\n", cache->busy_reading_count,
2282 		cache->busy_reading_waiters ? "has" : "no");
2283 	kprintf(" busy_writing: %" B_PRIu32 ", %s waiters\n", cache->busy_writing_count,
2284 		cache->busy_writing_waiters ? "has" : "no");
2285 
2286 	if (!cache->pending_notifications.IsEmpty()) {
2287 		kprintf(" pending notifications:\n");
2288 
2289 		NotificationList::Iterator iterator
2290 			= cache->pending_notifications.GetIterator();
2291 		while (iterator.HasNext()) {
2292 			cache_notification* notification = iterator.Next();
2293 
2294 			kprintf("  %p %5" B_PRIx32 " %p - %p\n", notification,
2295 				notification->events_pending, notification->hook,
2296 				notification->data);
2297 		}
2298 	}
2299 
2300 	if (showTransactions) {
2301 		kprintf(" transactions:\n");
2302 		kprintf("address       id state  blocks  main   sub\n");
2303 
2304 		TransactionTable::Iterator iterator(cache->transaction_hash);
2305 
2306 		while (iterator.HasNext()) {
2307 			cache_transaction* transaction = iterator.Next();
2308 			kprintf("%p %5" B_PRId32 " %-7s %5" B_PRId32 " %5" B_PRId32 " %5"
2309 				B_PRId32 "\n", transaction, transaction->id,
2310 				transaction->open ? "open" : "closed",
2311 				transaction->num_blocks, transaction->main_num_blocks,
2312 				transaction->sub_num_blocks);
2313 		}
2314 	}
2315 
2316 	if (showBlocks) {
2317 		kprintf(" blocks:\n");
2318 		kprintf("address  block no. current  original parent    refs access "
2319 			"flags transact prev. trans\n");
2320 	}
2321 
2322 	uint32 referenced = 0;
2323 	uint32 count = 0;
2324 	uint32 dirty = 0;
2325 	uint32 discarded = 0;
2326 	BlockTable::Iterator iterator(cache->hash);
2327 	while (iterator.HasNext()) {
2328 		cached_block* block = iterator.Next();
2329 		if (showBlocks)
2330 			dump_block(block);
2331 
2332 		if (block->is_dirty)
2333 			dirty++;
2334 		if (block->discard)
2335 			discarded++;
2336 		if (block->ref_count)
2337 			referenced++;
2338 		count++;
2339 	}
2340 
2341 	kprintf(" %" B_PRIu32 " blocks total, %" B_PRIu32 " dirty, %" B_PRIu32
2342 		" discarded, %" B_PRIu32 " referenced, %" B_PRIu32 " busy, %" B_PRIu32
2343 		" in unused.\n",
2344 		count, dirty, discarded, referenced, cache->busy_reading_count,
2345 		cache->unused_block_count);
2346 	return 0;
2347 }
2348 
2349 
2350 static int
2351 dump_transaction(int argc, char** argv)
2352 {
2353 	bool showBlocks = false;
2354 	int i = 1;
2355 	if (argc > 1 && !strcmp(argv[1], "-b")) {
2356 		showBlocks = true;
2357 		i++;
2358 	}
2359 
2360 	if (argc - i < 1 || argc - i > 2) {
2361 		print_debugger_command_usage(argv[0]);
2362 		return 0;
2363 	}
2364 
2365 	cache_transaction* transaction = NULL;
2366 
2367 	if (argc - i == 1) {
2368 		transaction = (cache_transaction*)(addr_t)parse_expression(argv[i]);
2369 	} else {
2370 		block_cache* cache = (block_cache*)(addr_t)parse_expression(argv[i]);
2371 		int32 id = parse_expression(argv[i + 1]);
2372 		transaction = lookup_transaction(cache, id);
2373 		if (transaction == NULL) {
2374 			kprintf("No transaction with ID %" B_PRId32 " found.\n", id);
2375 			return 0;
2376 		}
2377 	}
2378 
2379 	kprintf("TRANSACTION %p\n", transaction);
2380 
2381 	kprintf(" id:             %" B_PRId32 "\n", transaction->id);
2382 	kprintf(" num block:      %" B_PRId32 "\n", transaction->num_blocks);
2383 	kprintf(" main num block: %" B_PRId32 "\n", transaction->main_num_blocks);
2384 	kprintf(" sub num block:  %" B_PRId32 "\n", transaction->sub_num_blocks);
2385 	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
2386 	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
2387 	kprintf(" idle:           %" B_PRId64 " secs\n",
2388 		(system_time() - transaction->last_used) / 1000000);
2389 
2390 	kprintf(" listeners:\n");
2391 
2392 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
2393 	while (iterator.HasNext()) {
2394 		cache_listener* listener = iterator.Next();
2395 
2396 		kprintf("  %p %5" B_PRIx32 " %p - %p\n", listener, listener->events_pending,
2397 			listener->hook, listener->data);
2398 	}
2399 
2400 	if (!showBlocks)
2401 		return 0;
2402 
2403 	kprintf(" blocks:\n");
2404 	kprintf("address  block no. current  original parent    refs access "
2405 		"flags transact prev. trans\n");
2406 
2407 	cached_block* block = transaction->first_block;
2408 	while (block != NULL) {
2409 		dump_block(block);
2410 		block = block->transaction_next;
2411 	}
2412 
2413 	kprintf("--\n");
2414 
2415 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
2416 	while (blockIterator.HasNext()) {
2417 		block = blockIterator.Next();
2418 		dump_block(block);
2419 	}
2420 
2421 	return 0;
2422 }
2423 
2424 
2425 static int
2426 dump_caches(int argc, char** argv)
2427 {
2428 	kprintf("Block caches:\n");
2429 	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
2430 	while (i.HasNext()) {
2431 		block_cache* cache = i.Next();
2432 		if (cache == (block_cache*)&sMarkCache)
2433 			continue;
2434 
2435 		kprintf("  %p\n", cache);
2436 	}
2437 
2438 	return 0;
2439 }
2440 
2441 
2442 #if BLOCK_CACHE_BLOCK_TRACING >= 2
2443 static int
2444 dump_block_data(int argc, char** argv)
2445 {
2446 	using namespace BlockTracing;
2447 
2448 	// Determine which blocks to show
2449 
2450 	bool printStackTrace = true;
2451 	uint32 which = 0;
2452 	int32 i = 1;
2453 	while (i < argc && argv[i][0] == '-') {
2454 		char* arg = &argv[i][1];
2455 		while (arg[0]) {
2456 			switch (arg[0]) {
2457 				case 'c':
2458 					which |= BlockData::kCurrent;
2459 					break;
2460 				case 'p':
2461 					which |= BlockData::kParent;
2462 					break;
2463 				case 'o':
2464 					which |= BlockData::kOriginal;
2465 					break;
2466 
2467 				default:
2468 					kprintf("invalid block specifier (only o/c/p are "
2469 						"allowed).\n");
2470 					return 0;
2471 			}
2472 			arg++;
2473 		}
2474 
2475 		i++;
2476 	}
2477 	if (which == 0)
2478 		which = BlockData::kCurrent | BlockData::kParent | BlockData::kOriginal;
2479 
2480 	if (i == argc) {
2481 		print_debugger_command_usage(argv[0]);
2482 		return 0;
2483 	}
2484 
2485 	// Get the range of blocks to print
2486 
2487 	int64 from = parse_expression(argv[i]);
2488 	int64 to = from;
2489 	if (argc > i + 1)
2490 		to = parse_expression(argv[i + 1]);
2491 	if (to < from)
2492 		to = from;
2493 
2494 	uint32 offset = 0;
2495 	uint32 size = LONG_MAX;
2496 	if (argc > i + 2)
2497 		offset = parse_expression(argv[i + 2]);
2498 	if (argc > i + 3)
2499 		size = parse_expression(argv[i + 3]);
2500 
2501 	TraceEntryIterator iterator;
2502 	iterator.MoveTo(from - 1);
2503 
2504 	static char sBuffer[1024];
2505 	LazyTraceOutput out(sBuffer, sizeof(sBuffer), TRACE_OUTPUT_TEAM_ID);
2506 
2507 	while (TraceEntry* entry = iterator.Next()) {
2508 		int32 index = iterator.Index();
2509 		if (index > to)
2510 			break;
2511 
2512 		Action* action = dynamic_cast<Action*>(entry);
2513 		if (action != NULL) {
2514 			out.Clear();
2515 			out.DumpEntry(action);
2516 			continue;
2517 		}
2518 
2519 		BlockData* blockData = dynamic_cast<BlockData*>(entry);
2520 		if (blockData == NULL)
2521 			continue;
2522 
2523 		out.Clear();
2524 
2525 		const char* dump = out.DumpEntry(entry);
2526 		int length = strlen(dump);
2527 		if (length > 0 && dump[length - 1] == '\n')
2528 			length--;
2529 
2530 		kprintf("%5" B_PRId32 ". %.*s\n", index, length, dump);
2531 
2532 		if (printStackTrace) {
2533 			out.Clear();
2534 			entry->DumpStackTrace(out);
2535 			if (out.Size() > 0)
2536 				kputs(out.Buffer());
2537 		}
2538 
2539 		blockData->DumpBlocks(which, offset, size);
2540 	}
2541 
2542 	return 0;
2543 }
2544 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
2545 
2546 
2547 #endif	// DEBUG_BLOCK_CACHE
2548 
2549 
2550 /*!	Traverses through the block_cache list, and returns one cache after the
2551 	other. The cache returned is automatically locked when you get it, and
2552 	unlocked with the next call to this function. Ignores caches that are in
2553 	deletion state.
2554 	Returns \c NULL when the end of the list is reached.
2555 */
2556 static block_cache*
2557 get_next_locked_block_cache(block_cache* last)
2558 {
2559 	MutexLocker _(sCachesLock);
2560 
2561 	block_cache* cache;
2562 	if (last != NULL) {
2563 		mutex_unlock(&last->lock);
2564 
2565 		cache = sCaches.GetNext((block_cache*)&sMarkCache);
2566 		sCaches.Remove((block_cache*)&sMarkCache);
2567 	} else
2568 		cache = sCaches.Head();
2569 
2570 	if (cache != NULL) {
2571 		mutex_lock(&cache->lock);
2572 		sCaches.InsertBefore(sCaches.GetNext(cache), (block_cache*)&sMarkCache);
2573 	}
2574 
2575 	return cache;
2576 }
2577 
2578 
2579 /*!	Background thread that continuously checks for pending notifications of
2580 	all caches.
2581 	Every two seconds, it will also write back up to 64 blocks per cache.
2582 */
2583 static status_t
2584 block_notifier_and_writer(void* /*data*/)
2585 {
2586 	const bigtime_t kDefaultTimeout = 2000000LL;
2587 	bigtime_t timeout = kDefaultTimeout;
2588 
2589 	while (true) {
2590 		bigtime_t start = system_time();
2591 
2592 		status_t status = acquire_sem_etc(sEventSemaphore, 1,
2593 			B_RELATIVE_TIMEOUT, timeout);
2594 		if (status == B_OK) {
2595 			flush_pending_notifications();
2596 			timeout -= system_time() - start;
2597 			continue;
2598 		}
2599 
2600 		// Write 64 blocks of each block_cache roughly every 2 seconds,
2601 		// potentially more or less depending on congestion and drive speeds
2602 		// (usually much less.) We do not want to queue everything at once
2603 		// because a future transaction might then get held up waiting for
2604 		// a specific block to be written.
2605 		timeout = kDefaultTimeout;
2606 		size_t usedMemory;
2607 		object_cache_get_usage(sBlockCache, &usedMemory);
2608 
2609 		block_cache* cache = NULL;
2610 		while ((cache = get_next_locked_block_cache(cache)) != NULL) {
2611 			// Give some breathing room: wait 2x the length of the potential
2612 			// maximum block count-sized write between writes, and also skip
2613 			// if there are more than 16 blocks currently being written.
2614 			const bigtime_t next = cache->last_block_write
2615 					+ cache->last_block_write_duration * 2 * 64;
2616 			if (cache->busy_writing_count > 16 || system_time() < next) {
2617 				if (cache->last_block_write_duration > 0) {
2618 					timeout = min_c(timeout,
2619 						cache->last_block_write_duration * 2 * 64);
2620 				}
2621 				continue;
2622 			}
2623 
2624 			BlockWriter writer(cache, 64);
2625 			bool hasMoreBlocks = false;
2626 
2627 			size_t cacheUsedMemory;
2628 			object_cache_get_usage(cache->buffer_cache, &cacheUsedMemory);
2629 			usedMemory += cacheUsedMemory;
2630 
2631 			if (cache->num_dirty_blocks) {
2632 				// This cache is not using transactions, we'll scan the blocks
2633 				// directly
2634 				BlockTable::Iterator iterator(cache->hash);
2635 
2636 				while (iterator.HasNext()) {
2637 					cached_block* block = iterator.Next();
2638 					if (block->CanBeWritten() && !writer.Add(block)) {
2639 						hasMoreBlocks = true;
2640 						break;
2641 					}
2642 				}
2643 			} else {
2644 				TransactionTable::Iterator iterator(cache->transaction_hash);
2645 
2646 				while (iterator.HasNext()) {
2647 					cache_transaction* transaction = iterator.Next();
2648 					if (transaction->open) {
2649 						if (system_time() > transaction->last_used
2650 								+ kTransactionIdleTime) {
2651 							// Transaction is open but idle
2652 							notify_transaction_listeners(cache, transaction,
2653 								TRANSACTION_IDLE);
2654 						}
2655 						continue;
2656 					}
2657 
2658 					bool hasLeftOvers;
2659 						// we ignore this one
2660 					if (!writer.Add(transaction, hasLeftOvers)) {
2661 						hasMoreBlocks = true;
2662 						break;
2663 					}
2664 				}
2665 			}
2666 
2667 			writer.Write();
2668 
2669 			if (hasMoreBlocks && cache->last_block_write_duration > 0) {
2670 				// There are probably still more blocks that we could write, so
2671 				// see if we can decrease the timeout.
2672 				timeout = min_c(timeout,
2673 					cache->last_block_write_duration * 2 * 64);
2674 			}
2675 
2676 			if ((block_cache_used_memory() / B_PAGE_SIZE)
2677 					> vm_page_num_pages() / 2) {
2678 				// Try to reduce memory usage to half of the available
2679 				// RAM at maximum
2680 				cache->RemoveUnusedBlocks(1000, 10);
2681 			}
2682 		}
2683 
2684 		MutexLocker _(sCachesMemoryUseLock);
2685 		sUsedMemory = usedMemory;
2686 	}
2687 
2688 	// never can get here
2689 	return B_OK;
2690 }
2691 
2692 
2693 /*!	Notify function for wait_for_notifications(). */
2694 static void
2695 notify_sync(int32 transactionID, int32 event, void* _cache)
2696 {
2697 	block_cache* cache = (block_cache*)_cache;
2698 
2699 	cache->condition_variable.NotifyOne();
2700 }
2701 
2702 
2703 /*!	Must be called with the sCachesLock held. */
2704 static bool
2705 is_valid_cache(block_cache* cache)
2706 {
2707 	ASSERT_LOCKED_MUTEX(&sCachesLock);
2708 
2709 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
2710 	while (iterator.HasNext()) {
2711 		if (cache == iterator.Next())
2712 			return true;
2713 	}
2714 
2715 	return false;
2716 }
2717 
2718 
2719 /*!	Waits until all pending notifications are carried out.
2720 	Safe to be called from the block writer/notifier thread.
2721 	You must not hold the \a cache lock when calling this function.
2722 */
2723 static void
2724 wait_for_notifications(block_cache* cache)
2725 {
2726 	MutexLocker locker(sCachesLock);
2727 
2728 	if (find_thread(NULL) == sNotifierWriterThread) {
2729 		// We're the notifier thread, don't wait, but flush all pending
2730 		// notifications directly.
2731 		if (is_valid_cache(cache))
2732 			flush_pending_notifications(cache);
2733 		return;
2734 	}
2735 
2736 	// add sync notification
2737 	cache_notification notification;
2738 	set_notification(NULL, notification, TRANSACTION_WRITTEN, notify_sync,
2739 		cache);
2740 
2741 	ConditionVariableEntry entry;
2742 	cache->condition_variable.Add(&entry);
2743 
2744 	add_notification(cache, &notification, TRANSACTION_WRITTEN, false);
2745 	locker.Unlock();
2746 
2747 	// wait for notification hook to be called
2748 	entry.Wait();
2749 }
2750 
2751 
2752 status_t
2753 block_cache_init(void)
2754 {
2755 	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
2756 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
2757 	if (sBlockCache == NULL)
2758 		return B_NO_MEMORY;
2759 
2760 	sCacheNotificationCache = create_object_cache("cache notifications",
2761 		sizeof(cache_listener), 8, NULL, NULL, NULL);
2762 	if (sCacheNotificationCache == NULL)
2763 		return B_NO_MEMORY;
2764 
2765 	new (&sCaches) DoublyLinkedList<block_cache>;
2766 		// manually call constructor
2767 
2768 	sEventSemaphore = create_sem(0, "block cache event");
2769 	if (sEventSemaphore < B_OK)
2770 		return sEventSemaphore;
2771 
2772 	sNotifierWriterThread = spawn_kernel_thread(&block_notifier_and_writer,
2773 		"block notifier/writer", B_LOW_PRIORITY, NULL);
2774 	if (sNotifierWriterThread >= B_OK)
2775 		resume_thread(sNotifierWriterThread);
2776 
2777 #if DEBUG_BLOCK_CACHE
2778 	add_debugger_command_etc("block_caches", &dump_caches,
2779 		"dumps all block caches", "\n", 0);
2780 	add_debugger_command_etc("block_cache", &dump_cache,
2781 		"dumps a specific block cache",
2782 		"[-bt] <cache-address> [block-number]\n"
2783 		"  -t lists the transactions\n"
2784 		"  -b lists all blocks\n", 0);
2785 	add_debugger_command("cached_block", &dump_cached_block,
2786 		"dumps the specified cached block");
2787 	add_debugger_command_etc("transaction", &dump_transaction,
2788 		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
2789 		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
2790 		"  -b lists all blocks that are part of this transaction\n", 0);
2791 #	if BLOCK_CACHE_BLOCK_TRACING >= 2
2792 	add_debugger_command_etc("block_cache_data", &dump_block_data,
2793 		"dumps the data blocks logged for the actions",
2794 		"[-cpo] <from> [<to> [<offset> [<size>]]]\n"
2795 		"If no data specifier is used, all blocks are shown by default.\n"
2796 		" -c       the current data is shown, if available.\n"
2797 		" -p       the parent data is shown, if available.\n"
2798 		" -o       the original data is shown, if available.\n"
2799 		" <from>   first index of tracing entries to show.\n"
2800 		" <to>     if given, the last entry. If not, only <from> is shown.\n"
2801 		" <offset> the offset of the block data.\n"
2802 		" <from>   the size of the block data that is dumped\n", 0);
2803 #	endif
2804 #endif	// DEBUG_BLOCK_CACHE
2805 
2806 	return B_OK;
2807 }
2808 
2809 
2810 size_t
2811 block_cache_used_memory(void)
2812 {
2813 	MutexLocker _(sCachesMemoryUseLock);
2814 	return sUsedMemory;
2815 }
2816 
2817 
2818 //	#pragma mark - public transaction API
2819 
2820 
2821 int32
2822 cache_start_transaction(void* _cache)
2823 {
2824 	block_cache* cache = (block_cache*)_cache;
2825 	TransactionLocker locker(cache);
2826 
2827 	if (cache->last_transaction && cache->last_transaction->open) {
2828 		panic("last transaction (%" B_PRId32 ") still open!\n",
2829 			cache->last_transaction->id);
2830 	}
2831 
2832 	cache_transaction* transaction = new(std::nothrow) cache_transaction;
2833 	if (transaction == NULL)
2834 		return B_NO_MEMORY;
2835 
2836 	transaction->id = atomic_add(&cache->next_transaction_id, 1);
2837 	cache->last_transaction = transaction;
2838 
2839 	TRACE(("cache_start_transaction(): id %" B_PRId32 " started\n", transaction->id));
2840 	T(Action("start", cache, transaction));
2841 
2842 	cache->transaction_hash->Insert(transaction);
2843 
2844 	return transaction->id;
2845 }
2846 
2847 
2848 status_t
2849 cache_sync_transaction(void* _cache, int32 id)
2850 {
2851 	block_cache* cache = (block_cache*)_cache;
2852 	bool hadBusy;
2853 
2854 	TRACE(("cache_sync_transaction(id %" B_PRId32 ")\n", id));
2855 
2856 	do {
2857 		TransactionLocker locker(cache);
2858 		hadBusy = false;
2859 
2860 		BlockWriter writer(cache);
2861 		TransactionTable::Iterator iterator(cache->transaction_hash);
2862 
2863 		while (iterator.HasNext()) {
2864 			// close all earlier transactions which haven't been closed yet
2865 			cache_transaction* transaction = iterator.Next();
2866 
2867 			if (transaction->busy_writing_count != 0) {
2868 				hadBusy = true;
2869 				continue;
2870 			}
2871 			if (transaction->id <= id && !transaction->open) {
2872 				// write back all of their remaining dirty blocks
2873 				T(Action("sync", cache, transaction));
2874 
2875 				bool hasLeftOvers;
2876 				writer.Add(transaction, hasLeftOvers);
2877 
2878 				if (hasLeftOvers) {
2879 					// This transaction contains blocks that a previous
2880 					// transaction is trying to write back in this write run
2881 					hadBusy = true;
2882 				}
2883 			}
2884 		}
2885 
2886 		status_t status = writer.Write();
2887 		if (status != B_OK)
2888 			return status;
2889 	} while (hadBusy);
2890 
2891 	wait_for_notifications(cache);
2892 		// make sure that all pending TRANSACTION_WRITTEN notifications
2893 		// are handled after we return
2894 	return B_OK;
2895 }
2896 
2897 
2898 status_t
2899 cache_end_transaction(void* _cache, int32 id,
2900 	transaction_notification_hook hook, void* data)
2901 {
2902 	block_cache* cache = (block_cache*)_cache;
2903 	TransactionLocker locker(cache);
2904 
2905 	TRACE(("cache_end_transaction(id = %" B_PRId32 ")\n", id));
2906 
2907 	cache_transaction* transaction = lookup_transaction(cache, id);
2908 	if (transaction == NULL) {
2909 		panic("cache_end_transaction(): invalid transaction ID\n");
2910 		return B_BAD_VALUE;
2911 	}
2912 
2913 	// Write back all pending transaction blocks
2914 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
2915 	if (status != B_OK)
2916 		return status;
2917 
2918 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
2919 
2920 	if (hook != NULL
2921 		&& add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN,
2922 			hook, data) != B_OK) {
2923 		return B_NO_MEMORY;
2924 	}
2925 
2926 	T(Action("end", cache, transaction));
2927 
2928 	// iterate through all blocks and free the unchanged original contents
2929 
2930 	cached_block* next;
2931 	for (cached_block* block = transaction->first_block; block != NULL;
2932 			block = next) {
2933 		next = block->transaction_next;
2934 		ASSERT(block->previous_transaction == NULL);
2935 
2936 		if (block->discard) {
2937 			// This block has been discarded in the transaction
2938 			cache->DiscardBlock(block);
2939 			transaction->num_blocks--;
2940 			continue;
2941 		}
2942 
2943 		if (block->original_data != NULL) {
2944 			cache->Free(block->original_data);
2945 			block->original_data = NULL;
2946 		}
2947 		if (block->parent_data != NULL) {
2948 			ASSERT(transaction->has_sub_transaction);
2949 			cache->FreeBlockParentData(block);
2950 		}
2951 
2952 		// move the block to the previous transaction list
2953 		transaction->blocks.Add(block);
2954 
2955 		block->previous_transaction = transaction;
2956 		block->transaction_next = NULL;
2957 		block->transaction = NULL;
2958 	}
2959 
2960 	transaction->open = false;
2961 	return B_OK;
2962 }
2963 
2964 
2965 status_t
2966 cache_abort_transaction(void* _cache, int32 id)
2967 {
2968 	block_cache* cache = (block_cache*)_cache;
2969 	TransactionLocker locker(cache);
2970 
2971 	TRACE(("cache_abort_transaction(id = %" B_PRId32 ")\n", id));
2972 
2973 	cache_transaction* transaction = lookup_transaction(cache, id);
2974 	if (transaction == NULL) {
2975 		panic("cache_abort_transaction(): invalid transaction ID\n");
2976 		return B_BAD_VALUE;
2977 	}
2978 
2979 	T(Abort(cache, transaction));
2980 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
2981 
2982 	// iterate through all blocks and restore their original contents
2983 
2984 	cached_block* block = transaction->first_block;
2985 	cached_block* next;
2986 	for (; block != NULL; block = next) {
2987 		next = block->transaction_next;
2988 
2989 		if (block->original_data != NULL) {
2990 			TRACE(("cache_abort_transaction(id = %" B_PRId32 "): restored contents of "
2991 				"block %" B_PRIdOFF "\n", transaction->id, block->block_number));
2992 			memcpy(block->current_data, block->original_data,
2993 				cache->block_size);
2994 			cache->Free(block->original_data);
2995 			block->original_data = NULL;
2996 		}
2997 		if (transaction->has_sub_transaction && block->parent_data != NULL)
2998 			cache->FreeBlockParentData(block);
2999 
3000 		block->transaction_next = NULL;
3001 		block->transaction = NULL;
3002 		block->discard = false;
3003 		if (block->previous_transaction == NULL)
3004 			block->is_dirty = false;
3005 	}
3006 
3007 	cache->transaction_hash->Remove(transaction);
3008 	delete_transaction(cache, transaction);
3009 	return B_OK;
3010 }
3011 
3012 
3013 /*!	Acknowledges the current parent transaction, and starts a new transaction
3014 	from its sub transaction.
3015 	The new transaction also gets a new transaction ID.
3016 */
3017 int32
3018 cache_detach_sub_transaction(void* _cache, int32 id,
3019 	transaction_notification_hook hook, void* data)
3020 {
3021 	block_cache* cache = (block_cache*)_cache;
3022 	TransactionLocker locker(cache);
3023 
3024 	TRACE(("cache_detach_sub_transaction(id = %" B_PRId32 ")\n", id));
3025 
3026 	cache_transaction* transaction = lookup_transaction(cache, id);
3027 	if (transaction == NULL) {
3028 		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
3029 		return B_BAD_VALUE;
3030 	}
3031 	if (!transaction->has_sub_transaction)
3032 		return B_BAD_VALUE;
3033 
3034 	// iterate through all blocks and free the unchanged original contents
3035 
3036 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
3037 	if (status != B_OK)
3038 		return status;
3039 
3040 	// create a new transaction for the sub transaction
3041 	cache_transaction* newTransaction = new(std::nothrow) cache_transaction;
3042 	if (newTransaction == NULL)
3043 		return B_NO_MEMORY;
3044 
3045 	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
3046 	T(Detach(cache, transaction, newTransaction));
3047 
3048 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3049 
3050 	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
3051 			data) != B_OK) {
3052 		delete newTransaction;
3053 		return B_NO_MEMORY;
3054 	}
3055 
3056 	cached_block* last = NULL;
3057 	cached_block* next;
3058 	for (cached_block* block = transaction->first_block; block != NULL;
3059 			block = next) {
3060 		next = block->transaction_next;
3061 		ASSERT(block->previous_transaction == NULL);
3062 
3063 		if (block->discard) {
3064 			cache->DiscardBlock(block);
3065 			transaction->main_num_blocks--;
3066 			continue;
3067 		}
3068 
3069 		if (block->parent_data != NULL) {
3070 			// The block changed in the parent - free the original data, since
3071 			// they will be replaced by what is in current.
3072 			ASSERT(block->original_data != NULL);
3073 			cache->Free(block->original_data);
3074 
3075 			if (block->parent_data != block->current_data) {
3076 				// The block had been changed in both transactions
3077 				block->original_data = block->parent_data;
3078 			} else {
3079 				// The block has only been changed in the parent
3080 				block->original_data = NULL;
3081 			}
3082 			block->parent_data = NULL;
3083 
3084 			// move the block to the previous transaction list
3085 			transaction->blocks.Add(block);
3086 			block->previous_transaction = transaction;
3087 		}
3088 
3089 		if (block->original_data != NULL) {
3090 			// This block had been changed in the current sub transaction,
3091 			// we need to move this block over to the new transaction.
3092 			ASSERT(block->parent_data == NULL);
3093 
3094 			if (last == NULL)
3095 				newTransaction->first_block = block;
3096 			else
3097 				last->transaction_next = block;
3098 
3099 			block->transaction = newTransaction;
3100 			last = block;
3101 		} else
3102 			block->transaction = NULL;
3103 
3104 		block->transaction_next = NULL;
3105 	}
3106 
3107 	newTransaction->num_blocks = transaction->sub_num_blocks;
3108 
3109 	transaction->open = false;
3110 	transaction->has_sub_transaction = false;
3111 	transaction->num_blocks = transaction->main_num_blocks;
3112 	transaction->sub_num_blocks = 0;
3113 
3114 	cache->transaction_hash->Insert(newTransaction);
3115 	cache->last_transaction = newTransaction;
3116 
3117 	return newTransaction->id;
3118 }
3119 
3120 
3121 status_t
3122 cache_abort_sub_transaction(void* _cache, int32 id)
3123 {
3124 	block_cache* cache = (block_cache*)_cache;
3125 	TransactionLocker locker(cache);
3126 
3127 	TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 ")\n", id));
3128 
3129 	cache_transaction* transaction = lookup_transaction(cache, id);
3130 	if (transaction == NULL) {
3131 		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
3132 		return B_BAD_VALUE;
3133 	}
3134 	if (!transaction->has_sub_transaction)
3135 		return B_BAD_VALUE;
3136 
3137 	T(Abort(cache, transaction));
3138 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3139 
3140 	// revert all changes back to the version of the parent
3141 
3142 	cached_block* block = transaction->first_block;
3143 	cached_block* last = NULL;
3144 	cached_block* next;
3145 	for (; block != NULL; block = next) {
3146 		next = block->transaction_next;
3147 
3148 		if (block->parent_data == NULL) {
3149 			// The parent transaction didn't change the block, but the sub
3150 			// transaction did - we need to revert to the original data.
3151 			// The block is no longer part of the transaction
3152 			if (block->original_data != NULL) {
3153 				// The block might not have original data if was empty
3154 				memcpy(block->current_data, block->original_data,
3155 					cache->block_size);
3156 			}
3157 
3158 			if (last != NULL)
3159 				last->transaction_next = next;
3160 			else
3161 				transaction->first_block = next;
3162 
3163 			block->transaction_next = NULL;
3164 			block->transaction = NULL;
3165 			transaction->num_blocks--;
3166 
3167 			if (block->previous_transaction == NULL) {
3168 				cache->Free(block->original_data);
3169 				block->original_data = NULL;
3170 				block->is_dirty = false;
3171 
3172 				if (block->ref_count == 0) {
3173 					// Move the block into the unused list if possible
3174 					block->unused = true;
3175 					cache->unused_blocks.Add(block);
3176 					cache->unused_block_count++;
3177 				}
3178 			}
3179 		} else {
3180 			if (block->parent_data != block->current_data) {
3181 				// The block has been changed and must be restored - the block
3182 				// is still dirty and part of the transaction
3183 				TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 "): "
3184 					"restored contents of block %" B_PRIdOFF "\n",
3185 					transaction->id, block->block_number));
3186 				memcpy(block->current_data, block->parent_data,
3187 					cache->block_size);
3188 				cache->Free(block->parent_data);
3189 				// The block stays dirty
3190 			}
3191 			block->parent_data = NULL;
3192 			last = block;
3193 		}
3194 
3195 		block->discard = false;
3196 	}
3197 
3198 	// all subsequent changes will go into the main transaction
3199 	transaction->has_sub_transaction = false;
3200 	transaction->sub_num_blocks = 0;
3201 
3202 	return B_OK;
3203 }
3204 
3205 
3206 status_t
3207 cache_start_sub_transaction(void* _cache, int32 id)
3208 {
3209 	block_cache* cache = (block_cache*)_cache;
3210 	TransactionLocker locker(cache);
3211 
3212 	TRACE(("cache_start_sub_transaction(id = %" B_PRId32 ")\n", id));
3213 
3214 	cache_transaction* transaction = lookup_transaction(cache, id);
3215 	if (transaction == NULL) {
3216 		panic("cache_start_sub_transaction(): invalid transaction ID %" B_PRId32 "\n",
3217 			id);
3218 		return B_BAD_VALUE;
3219 	}
3220 
3221 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3222 
3223 	// move all changed blocks up to the parent
3224 
3225 	cached_block* block = transaction->first_block;
3226 	cached_block* next;
3227 	for (; block != NULL; block = next) {
3228 		next = block->transaction_next;
3229 
3230 		if (block->parent_data != NULL) {
3231 			// There already is an older sub transaction - we acknowledge
3232 			// its changes and move its blocks up to the parent
3233 			ASSERT(transaction->has_sub_transaction);
3234 			cache->FreeBlockParentData(block);
3235 		}
3236 		if (block->discard) {
3237 			// This block has been discarded in the parent transaction.
3238 			// Just throw away any changes made in this transaction, so that
3239 			// it can still be reverted to its original contents if needed
3240 			ASSERT(block->previous_transaction == NULL);
3241 			if (block->original_data != NULL) {
3242 				memcpy(block->current_data, block->original_data,
3243 					cache->block_size);
3244 
3245 				cache->Free(block->original_data);
3246 				block->original_data = NULL;
3247 			}
3248 			continue;
3249 		}
3250 
3251 		// we "allocate" the parent data lazily, that means, we don't copy
3252 		// the data (and allocate memory for it) until we need to
3253 		block->parent_data = block->current_data;
3254 	}
3255 
3256 	// all subsequent changes will go into the sub transaction
3257 	transaction->has_sub_transaction = true;
3258 	transaction->main_num_blocks = transaction->num_blocks;
3259 	transaction->sub_num_blocks = 0;
3260 	T(Action("start-sub", cache, transaction));
3261 
3262 	return B_OK;
3263 }
3264 
3265 
3266 /*!	Adds a transaction listener that gets notified when the transaction
3267 	is ended, aborted, written, or idle as specified by \a events.
3268 	The listener gets automatically removed when the transaction ends.
3269 */
3270 status_t
3271 cache_add_transaction_listener(void* _cache, int32 id, int32 events,
3272 	transaction_notification_hook hook, void* data)
3273 {
3274 	block_cache* cache = (block_cache*)_cache;
3275 	TransactionLocker locker(cache);
3276 
3277 	cache_transaction* transaction = lookup_transaction(cache, id);
3278 	if (transaction == NULL)
3279 		return B_BAD_VALUE;
3280 
3281 	return add_transaction_listener(cache, transaction, events, hook, data);
3282 }
3283 
3284 
3285 status_t
3286 cache_remove_transaction_listener(void* _cache, int32 id,
3287 	transaction_notification_hook hookFunction, void* data)
3288 {
3289 	block_cache* cache = (block_cache*)_cache;
3290 	TransactionLocker locker(cache);
3291 
3292 	cache_transaction* transaction = lookup_transaction(cache, id);
3293 	if (transaction == NULL)
3294 		return B_BAD_VALUE;
3295 
3296 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
3297 	while (iterator.HasNext()) {
3298 		cache_listener* listener = iterator.Next();
3299 		if (listener->data == data && listener->hook == hookFunction) {
3300 			iterator.Remove();
3301 
3302 			if (listener->events_pending != 0) {
3303 				MutexLocker _(sNotificationsLock);
3304 				if (listener->events_pending != 0)
3305 					cache->pending_notifications.Remove(listener);
3306 			}
3307 			delete listener;
3308 			return B_OK;
3309 		}
3310 	}
3311 
3312 	return B_ENTRY_NOT_FOUND;
3313 }
3314 
3315 
3316 status_t
3317 cache_next_block_in_transaction(void* _cache, int32 id, bool mainOnly,
3318 	long* _cookie, off_t* _blockNumber, void** _data, void** _unchangedData)
3319 {
3320 	cached_block* block = (cached_block*)*_cookie;
3321 	block_cache* cache = (block_cache*)_cache;
3322 	TransactionLocker locker(cache);
3323 
3324 	cache_transaction* transaction = lookup_transaction(cache, id);
3325 	if (transaction == NULL || !transaction->open)
3326 		return B_BAD_VALUE;
3327 
3328 	if (block == NULL)
3329 		block = transaction->first_block;
3330 	else
3331 		block = block->transaction_next;
3332 
3333 	if (transaction->has_sub_transaction) {
3334 		if (mainOnly) {
3335 			// find next block that the parent changed
3336 			while (block != NULL && block->parent_data == NULL)
3337 				block = block->transaction_next;
3338 		} else {
3339 			// find next non-discarded block
3340 			while (block != NULL && block->discard)
3341 				block = block->transaction_next;
3342 		}
3343 	}
3344 
3345 	if (block == NULL)
3346 		return B_ENTRY_NOT_FOUND;
3347 
3348 	if (_blockNumber)
3349 		*_blockNumber = block->block_number;
3350 	if (_data)
3351 		*_data = mainOnly ? block->parent_data : block->current_data;
3352 	if (_unchangedData)
3353 		*_unchangedData = block->original_data;
3354 
3355 	*_cookie = (addr_t)block;
3356 	return B_OK;
3357 }
3358 
3359 
3360 int32
3361 cache_blocks_in_transaction(void* _cache, int32 id)
3362 {
3363 	block_cache* cache = (block_cache*)_cache;
3364 	TransactionLocker locker(cache);
3365 
3366 	cache_transaction* transaction = lookup_transaction(cache, id);
3367 	if (transaction == NULL)
3368 		return B_BAD_VALUE;
3369 
3370 	return transaction->num_blocks;
3371 }
3372 
3373 
3374 /*!	Returns the number of blocks that are part of the main transaction. If this
3375 	transaction does not have a sub transaction yet, this is the same value as
3376 	cache_blocks_in_transaction() would return.
3377 */
3378 int32
3379 cache_blocks_in_main_transaction(void* _cache, int32 id)
3380 {
3381 	block_cache* cache = (block_cache*)_cache;
3382 	TransactionLocker locker(cache);
3383 
3384 	cache_transaction* transaction = lookup_transaction(cache, id);
3385 	if (transaction == NULL)
3386 		return B_BAD_VALUE;
3387 
3388 	if (transaction->has_sub_transaction)
3389 		return transaction->main_num_blocks;
3390 
3391 	return transaction->num_blocks;
3392 }
3393 
3394 
3395 int32
3396 cache_blocks_in_sub_transaction(void* _cache, int32 id)
3397 {
3398 	block_cache* cache = (block_cache*)_cache;
3399 	TransactionLocker locker(cache);
3400 
3401 	cache_transaction* transaction = lookup_transaction(cache, id);
3402 	if (transaction == NULL)
3403 		return B_BAD_VALUE;
3404 
3405 	return transaction->sub_num_blocks;
3406 }
3407 
3408 
3409 /*!	Check if block is in transaction
3410 */
3411 bool
3412 cache_has_block_in_transaction(void* _cache, int32 id, off_t blockNumber)
3413 {
3414 	block_cache* cache = (block_cache*)_cache;
3415 	TransactionLocker locker(cache);
3416 
3417 	cached_block* block = cache->hash->Lookup(blockNumber);
3418 
3419 	return (block != NULL && block->transaction != NULL
3420 		&& block->transaction->id == id);
3421 }
3422 
3423 
3424 //	#pragma mark - public block cache API
3425 
3426 
3427 void
3428 block_cache_delete(void* _cache, bool allowWrites)
3429 {
3430 	block_cache* cache = (block_cache*)_cache;
3431 
3432 	if (allowWrites)
3433 		block_cache_sync(cache);
3434 
3435 	mutex_lock(&sCachesLock);
3436 	sCaches.Remove(cache);
3437 	mutex_unlock(&sCachesLock);
3438 
3439 	mutex_lock(&cache->lock);
3440 
3441 	// wait for all blocks to become unbusy
3442 	wait_for_busy_reading_blocks(cache);
3443 	wait_for_busy_writing_blocks(cache);
3444 
3445 	// free all blocks
3446 
3447 	cached_block* block = cache->hash->Clear(true);
3448 	while (block != NULL) {
3449 		cached_block* next = block->next;
3450 		cache->FreeBlock(block);
3451 		block = next;
3452 	}
3453 
3454 	// free all transactions (they will all be aborted)
3455 
3456 	cache_transaction* transaction = cache->transaction_hash->Clear(true);
3457 	while (transaction != NULL) {
3458 		cache_transaction* next = transaction->next;
3459 		delete transaction;
3460 		transaction = next;
3461 	}
3462 
3463 	delete cache;
3464 }
3465 
3466 
3467 void*
3468 block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
3469 {
3470 	block_cache* cache = new(std::nothrow) block_cache(fd, numBlocks, blockSize,
3471 		readOnly);
3472 	if (cache == NULL)
3473 		return NULL;
3474 
3475 	if (cache->Init() != B_OK) {
3476 		delete cache;
3477 		return NULL;
3478 	}
3479 
3480 	MutexLocker _(sCachesLock);
3481 	sCaches.Add(cache);
3482 
3483 	return cache;
3484 }
3485 
3486 
3487 status_t
3488 block_cache_sync(void* _cache)
3489 {
3490 	block_cache* cache = (block_cache*)_cache;
3491 
3492 	// We will sync all dirty blocks to disk that have a completed
3493 	// transaction or no transaction only
3494 
3495 	MutexLocker locker(&cache->lock);
3496 
3497 	BlockWriter writer(cache);
3498 	BlockTable::Iterator iterator(cache->hash);
3499 
3500 	while (iterator.HasNext()) {
3501 		cached_block* block = iterator.Next();
3502 		if (block->CanBeWritten())
3503 			writer.Add(block);
3504 	}
3505 
3506 	status_t status = writer.Write();
3507 
3508 	locker.Unlock();
3509 
3510 	wait_for_notifications(cache);
3511 		// make sure that all pending TRANSACTION_WRITTEN notifications
3512 		// are handled after we return
3513 	return status;
3514 }
3515 
3516 
3517 status_t
3518 block_cache_sync_etc(void* _cache, off_t blockNumber, size_t numBlocks)
3519 {
3520 	block_cache* cache = (block_cache*)_cache;
3521 
3522 	// We will sync all dirty blocks to disk that have a completed
3523 	// transaction or no transaction only
3524 
3525 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
3526 		panic("block_cache_sync_etc: invalid block number %" B_PRIdOFF
3527 			" (max %" B_PRIdOFF ")",
3528 			blockNumber, cache->max_blocks - 1);
3529 		return B_BAD_VALUE;
3530 	}
3531 
3532 	MutexLocker locker(&cache->lock);
3533 	BlockWriter writer(cache);
3534 
3535 	for (; numBlocks > 0; numBlocks--, blockNumber++) {
3536 		cached_block* block = cache->hash->Lookup(blockNumber);
3537 		if (block == NULL)
3538 			continue;
3539 
3540 		if (block->CanBeWritten())
3541 			writer.Add(block);
3542 	}
3543 
3544 	status_t status = writer.Write();
3545 
3546 	locker.Unlock();
3547 
3548 	wait_for_notifications(cache);
3549 		// make sure that all pending TRANSACTION_WRITTEN notifications
3550 		// are handled after we return
3551 	return status;
3552 }
3553 
3554 
3555 /*!	Discards a block from the current transaction or from the cache.
3556 	You have to call this function when you no longer use a block, ie. when it
3557 	might be reclaimed by the file cache in order to make sure they won't
3558 	interfere.
3559 */
3560 void
3561 block_cache_discard(void* _cache, off_t blockNumber, size_t numBlocks)
3562 {
3563 	// TODO: this could be a nice place to issue the ATA trim command
3564 	block_cache* cache = (block_cache*)_cache;
3565 	TransactionLocker locker(cache);
3566 
3567 	BlockWriter writer(cache);
3568 
3569 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3570 		cached_block* block = cache->hash->Lookup(blockNumber);
3571 		if (block != NULL && block->previous_transaction != NULL)
3572 			writer.Add(block);
3573 	}
3574 
3575 	writer.Write();
3576 		// TODO: this can fail, too!
3577 
3578 	blockNumber -= numBlocks;
3579 		// reset blockNumber to its original value
3580 
3581 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3582 		cached_block* block = cache->hash->Lookup(blockNumber);
3583 		if (block == NULL)
3584 			continue;
3585 
3586 		ASSERT(block->previous_transaction == NULL);
3587 
3588 		if (block->unused) {
3589 			cache->unused_blocks.Remove(block);
3590 			cache->unused_block_count--;
3591 			cache->RemoveBlock(block);
3592 		} else {
3593 			if (block->transaction != NULL && block->parent_data != NULL
3594 				&& block->parent_data != block->current_data) {
3595 				panic("Discarded block %" B_PRIdOFF " has already been changed in this "
3596 					"transaction!", blockNumber);
3597 			}
3598 
3599 			// mark it as discarded (in the current transaction only, if any)
3600 			block->discard = true;
3601 		}
3602 	}
3603 }
3604 
3605 
3606 status_t
3607 block_cache_make_writable(void* _cache, off_t blockNumber, int32 transaction)
3608 {
3609 	block_cache* cache = (block_cache*)_cache;
3610 	MutexLocker locker(&cache->lock);
3611 
3612 	if (cache->read_only) {
3613 		panic("tried to make block writable on a read-only cache!");
3614 		return B_ERROR;
3615 	}
3616 
3617 	// TODO: this can be done better!
3618 	void* block;
3619 	status_t status = get_writable_cached_block(cache, blockNumber,
3620 		blockNumber, 1, transaction, false, &block);
3621 	if (status == B_OK) {
3622 		put_cached_block((block_cache*)_cache, blockNumber);
3623 		return B_OK;
3624 	}
3625 
3626 	return status;
3627 }
3628 
3629 
3630 status_t
3631 block_cache_get_writable_etc(void* _cache, off_t blockNumber, off_t base,
3632 	off_t length, int32 transaction, void** _block)
3633 {
3634 	block_cache* cache = (block_cache*)_cache;
3635 	MutexLocker locker(&cache->lock);
3636 
3637 	TRACE(("block_cache_get_writable_etc(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3638 		blockNumber, transaction));
3639 	if (cache->read_only)
3640 		panic("tried to get writable block on a read-only cache!");
3641 
3642 	return get_writable_cached_block(cache, blockNumber, base, length,
3643 		transaction, false, _block);
3644 }
3645 
3646 
3647 void*
3648 block_cache_get_writable(void* _cache, off_t blockNumber, int32 transaction)
3649 {
3650 	void* block;
3651 	if (block_cache_get_writable_etc(_cache, blockNumber,
3652 			blockNumber, 1, transaction, &block) == B_OK)
3653 		return block;
3654 
3655 	return NULL;
3656 }
3657 
3658 
3659 void*
3660 block_cache_get_empty(void* _cache, off_t blockNumber, int32 transaction)
3661 {
3662 	block_cache* cache = (block_cache*)_cache;
3663 	MutexLocker locker(&cache->lock);
3664 
3665 	TRACE(("block_cache_get_empty(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3666 		blockNumber, transaction));
3667 	if (cache->read_only)
3668 		panic("tried to get empty writable block on a read-only cache!");
3669 
3670 	void* block;
3671 	if (get_writable_cached_block((block_cache*)_cache, blockNumber,
3672 			blockNumber, 1, transaction, true, &block) == B_OK)
3673 		return block;
3674 
3675 	return NULL;
3676 }
3677 
3678 
3679 status_t
3680 block_cache_get_etc(void* _cache, off_t blockNumber, off_t base, off_t length,
3681 	const void** _block)
3682 {
3683 	block_cache* cache = (block_cache*)_cache;
3684 	MutexLocker locker(&cache->lock);
3685 	bool allocated;
3686 
3687 	cached_block* block;
3688 	status_t status = get_cached_block(cache, blockNumber, &allocated, true,
3689 		&block);
3690 	if (status != B_OK)
3691 		return status;
3692 
3693 #if BLOCK_CACHE_DEBUG_CHANGED
3694 	if (block->compare == NULL)
3695 		block->compare = cache->Allocate();
3696 	if (block->compare != NULL)
3697 		memcpy(block->compare, block->current_data, cache->block_size);
3698 #endif
3699 	TB(Get(cache, block));
3700 
3701 	*_block = block->current_data;
3702 	return B_OK;
3703 }
3704 
3705 
3706 const void*
3707 block_cache_get(void* _cache, off_t blockNumber)
3708 {
3709 	const void* block;
3710 	if (block_cache_get_etc(_cache, blockNumber, blockNumber, 1, &block)
3711 			== B_OK)
3712 		return block;
3713 
3714 	return NULL;
3715 }
3716 
3717 
3718 /*!	Changes the internal status of a writable block to \a dirty. This can be
3719 	helpful in case you realize you don't need to change that block anymore
3720 	for whatever reason.
3721 
3722 	Note, you must only use this function on blocks that were acquired
3723 	writable!
3724 */
3725 status_t
3726 block_cache_set_dirty(void* _cache, off_t blockNumber, bool dirty,
3727 	int32 transaction)
3728 {
3729 	block_cache* cache = (block_cache*)_cache;
3730 	MutexLocker locker(&cache->lock);
3731 
3732 	cached_block* block = cache->hash->Lookup(blockNumber);
3733 	if (block == NULL)
3734 		return B_BAD_VALUE;
3735 	if (block->is_dirty == dirty) {
3736 		// there is nothing to do for us
3737 		return B_OK;
3738 	}
3739 
3740 	// TODO: not yet implemented
3741 	if (dirty)
3742 		panic("block_cache_set_dirty(): not yet implemented that way!\n");
3743 
3744 	return B_OK;
3745 }
3746 
3747 
3748 void
3749 block_cache_put(void* _cache, off_t blockNumber)
3750 {
3751 	block_cache* cache = (block_cache*)_cache;
3752 	MutexLocker locker(&cache->lock);
3753 
3754 	put_cached_block(cache, blockNumber);
3755 }
3756 
3757