xref: /haiku/src/system/kernel/cache/block_cache.cpp (revision 344ded80d400028c8f561b4b876257b94c12db4a)
1 /*
2  * Copyright 2004-2020, Axel Dörfler, axeld@pinc-software.de.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include <block_cache.h>
8 
9 #include <unistd.h>
10 #include <stdlib.h>
11 #include <string.h>
12 #include <errno.h>
13 #include <sys/uio.h>
14 
15 #include <KernelExport.h>
16 #include <fs_cache.h>
17 
18 #include <condition_variable.h>
19 #include <lock.h>
20 #include <low_resource_manager.h>
21 #include <slab/Slab.h>
22 #include <tracing.h>
23 #include <util/kernel_cpp.h>
24 #include <util/DoublyLinkedList.h>
25 #include <util/AutoLock.h>
26 #include <StackOrHeapArray.h>
27 #include <vm/vm_page.h>
28 
29 #ifndef BUILDING_USERLAND_FS_SERVER
30 #include "IORequest.h"
31 #endif // !BUILDING_USERLAND_FS_SERVER
32 #include "kernel_debug_config.h"
33 
34 
35 // TODO: this is a naive but growing implementation to test the API:
36 //	block reading/writing is not at all optimized for speed, it will
37 //	just read and write single blocks.
38 // TODO: the retrieval/copy of the original data could be delayed until the
39 //		new data must be written, ie. in low memory situations.
40 
41 #ifdef _KERNEL_MODE
42 #	define TRACE_ALWAYS(x...) dprintf(x)
43 #else
44 #	define TRACE_ALWAYS(x...) printf(x)
45 #endif
46 
47 //#define TRACE_BLOCK_CACHE
48 #ifdef TRACE_BLOCK_CACHE
49 #	define TRACE(x)	TRACE_ALWAYS(x)
50 #else
51 #	define TRACE(x) ;
52 #endif
53 
54 
55 // This macro is used for fatal situations that are acceptable in a running
56 // system, like out of memory situations - should only panic for debugging.
57 #define FATAL(x) panic x
58 
59 static const bigtime_t kTransactionIdleTime = 2000000LL;
60 	// a transaction is considered idle after 2 seconds of inactivity
61 
62 
63 namespace {
64 
65 struct cache_transaction;
66 struct cached_block;
67 struct block_cache;
68 typedef DoublyLinkedListLink<cached_block> block_link;
69 
70 struct cached_block {
71 	cached_block*	next;			// next in hash
72 	cached_block*	transaction_next;
73 	block_link		link;
74 	off_t			block_number;
75 	void*			current_data;
76 		// The data that is seen by everyone using the API; this one is always
77 		// present.
78 	void*			original_data;
79 		// When in a transaction, this contains the original data from before
80 		// the transaction.
81 	void*			parent_data;
82 		// This is a lazily alloced buffer that represents the contents of the
83 		// block in the parent transaction. It may point to current_data if the
84 		// contents have been changed only in the parent transaction, or, if the
85 		// block has been changed in the current sub transaction already, to a
86 		// new block containing the contents changed in the parent transaction.
87 		// If this is NULL, the block has not been changed in the parent
88 		// transaction at all.
89 #if BLOCK_CACHE_DEBUG_CHANGED
90 	void*			compare;
91 #endif
92 	int32			ref_count;
93 	int32			last_accessed;
94 	bool			busy_reading : 1;
95 	bool			busy_writing : 1;
96 	bool			is_writing : 1;
97 		// Block has been checked out for writing without transactions, and
98 		// cannot be written back if set
99 	bool			is_dirty : 1;
100 	bool			unused : 1;
101 	bool			discard : 1;
102 	bool			busy_reading_waiters : 1;
103 	bool			busy_writing_waiters : 1;
104 	cache_transaction* transaction;
105 		// This is the current active transaction, if any, the block is
106 		// currently in (meaning was changed as a part of it).
107 	cache_transaction* previous_transaction;
108 		// This is set to the last transaction that was ended containing this
109 		// block. In this case, the block has not yet written back yet, and
110 		// the changed data is either in current_data, or original_data -- the
111 		// latter if the block is already being part of another transaction.
112 		// There can only be one previous transaction, so when the active
113 		// transaction ends, the changes of the previous transaction have to
114 		// be written back before that transaction becomes the next previous
115 		// transaction.
116 
117 	bool CanBeWritten() const;
118 	int32 LastAccess() const
119 		{ return system_time() / 1000000L - last_accessed; }
120 };
121 
122 typedef DoublyLinkedList<cached_block,
123 	DoublyLinkedListMemberGetLink<cached_block,
124 		&cached_block::link> > block_list;
125 
126 struct cache_notification : DoublyLinkedListLinkImpl<cache_notification> {
127 	static inline void* operator new(size_t size);
128 	static inline void operator delete(void* block);
129 
130 	int32			transaction_id;
131 	int32			events_pending;
132 	int32			events;
133 	transaction_notification_hook hook;
134 	void*			data;
135 	bool			delete_after_event;
136 };
137 
138 typedef DoublyLinkedList<cache_notification> NotificationList;
139 
140 static object_cache* sCacheNotificationCache;
141 
142 struct cache_listener;
143 typedef DoublyLinkedListLink<cache_listener> listener_link;
144 
145 struct cache_listener : cache_notification {
146 	listener_link	link;
147 };
148 
149 typedef DoublyLinkedList<cache_listener,
150 	DoublyLinkedListMemberGetLink<cache_listener,
151 		&cache_listener::link> > ListenerList;
152 
153 void*
154 cache_notification::operator new(size_t size)
155 {
156 	// We can't really know whether something is a cache_notification or a
157 	// cache_listener at runtime, so we just use one object_cache for both
158 	// with the size set to that of the (slightly larger) cache_listener.
159 	// In practice, the vast majority of cache_notifications are really
160 	// cache_listeners, so this is a more than acceptable trade-off.
161 	ASSERT(size <= sizeof(cache_listener));
162 	return object_cache_alloc(sCacheNotificationCache, 0);
163 }
164 
165 void
166 cache_notification::operator delete(void* block)
167 {
168 	object_cache_free(sCacheNotificationCache, block, 0);
169 }
170 
171 
172 struct BlockHash {
173 	typedef off_t			KeyType;
174 	typedef	cached_block	ValueType;
175 
176 	size_t HashKey(KeyType key) const
177 	{
178 		return key;
179 	}
180 
181 	size_t Hash(ValueType* block) const
182 	{
183 		return block->block_number;
184 	}
185 
186 	bool Compare(KeyType key, ValueType* block) const
187 	{
188 		return block->block_number == key;
189 	}
190 
191 	ValueType*& GetLink(ValueType* value) const
192 	{
193 		return value->next;
194 	}
195 };
196 
197 typedef BOpenHashTable<BlockHash> BlockTable;
198 
199 
200 struct TransactionHash {
201 	typedef int32				KeyType;
202 	typedef	cache_transaction	ValueType;
203 
204 	size_t HashKey(KeyType key) const
205 	{
206 		return key;
207 	}
208 
209 	size_t Hash(ValueType* transaction) const;
210 	bool Compare(KeyType key, ValueType* transaction) const;
211 	ValueType*& GetLink(ValueType* value) const;
212 };
213 
214 typedef BOpenHashTable<TransactionHash> TransactionTable;
215 
216 
217 struct block_cache : DoublyLinkedListLinkImpl<block_cache> {
218 	BlockTable*		hash;
219 	mutex			lock;
220 	int				fd;
221 	off_t			max_blocks;
222 	size_t			block_size;
223 	int32			next_transaction_id;
224 	cache_transaction* last_transaction;
225 	TransactionTable* transaction_hash;
226 
227 	object_cache*	buffer_cache;
228 	block_list		unused_blocks;
229 	uint32			unused_block_count;
230 
231 	ConditionVariable busy_reading_condition;
232 	uint32			busy_reading_count;
233 	bool			busy_reading_waiters;
234 
235 	ConditionVariable busy_writing_condition;
236 	uint32			busy_writing_count;
237 	bool			busy_writing_waiters;
238 
239 	bigtime_t		last_block_write;
240 	bigtime_t		last_block_write_duration;
241 
242 	uint32			num_dirty_blocks;
243 	bool			read_only;
244 
245 	NotificationList pending_notifications;
246 	ConditionVariable condition_variable;
247 
248 					block_cache(int fd, off_t numBlocks, size_t blockSize,
249 						bool readOnly);
250 					~block_cache();
251 
252 	status_t		Init();
253 
254 	void			Free(void* buffer);
255 	void*			Allocate();
256 	void			FreeBlock(cached_block* block);
257 	cached_block*	NewBlock(off_t blockNumber);
258 	void			FreeBlockParentData(cached_block* block);
259 
260 	void			RemoveUnusedBlocks(int32 count, int32 minSecondsOld = 0);
261 	void			RemoveBlock(cached_block* block);
262 	void			DiscardBlock(cached_block* block);
263 
264 private:
265 	static void		_LowMemoryHandler(void* data, uint32 resources,
266 						int32 level);
267 	cached_block*	_GetUnusedBlock();
268 };
269 
270 struct cache_transaction {
271 	cache_transaction();
272 
273 	cache_transaction* next;
274 	int32			id;
275 	int32			num_blocks;
276 	int32			main_num_blocks;
277 	int32			sub_num_blocks;
278 	cached_block*	first_block;
279 	block_list		blocks;
280 	ListenerList	listeners;
281 	bool			open;
282 	bool			has_sub_transaction;
283 	bigtime_t		last_used;
284 	int32			busy_writing_count;
285 };
286 
287 
288 class BlockWriter {
289 public:
290 								BlockWriter(block_cache* cache,
291 									size_t max = SIZE_MAX);
292 								~BlockWriter();
293 
294 			bool				Add(cached_block* block,
295 									cache_transaction* transaction = NULL);
296 			bool				Add(cache_transaction* transaction,
297 									bool& hasLeftOvers);
298 
299 			status_t			Write(cache_transaction* transaction = NULL,
300 									bool canUnlock = true);
301 
302 			bool				DeletedTransaction() const
303 									{ return fDeletedTransaction; }
304 
305 	static	status_t			WriteBlock(block_cache* cache,
306 									cached_block* block);
307 
308 private:
309 			void*				_Data(cached_block* block) const;
310 			status_t			_WriteBlocks(cached_block** blocks, uint32 count);
311 			void				_BlockDone(cached_block* block,
312 									cache_transaction* transaction);
313 			void				_UnmarkWriting(cached_block* block);
314 
315 	static	int					_CompareBlocks(const void* _blockA,
316 									const void* _blockB);
317 
318 private:
319 	static	const size_t		kBufferSize = 64;
320 
321 			block_cache*		fCache;
322 			cached_block*		fBuffer[kBufferSize];
323 			cached_block**		fBlocks;
324 			size_t				fCount;
325 			size_t				fTotal;
326 			size_t				fCapacity;
327 			size_t				fMax;
328 			status_t			fStatus;
329 			bool				fDeletedTransaction;
330 };
331 
332 
333 #ifndef BUILDING_USERLAND_FS_SERVER
334 class BlockPrefetcher {
335 public:
336 								BlockPrefetcher(block_cache* cache, off_t fBlockNumber,
337 									size_t numBlocks);
338 								~BlockPrefetcher();
339 
340 			status_t			Allocate();
341 			status_t			ReadAsync();
342 
343 	static	status_t			IterativeIOGetVecsHook(void* cookie, io_request* request,
344 									off_t offset, size_t size, struct file_io_vec* vecs,
345 									size_t* _count);
346 	static	status_t			IterativeIOFinishedHook(void* cookie, io_request* request,
347 									status_t status, bool partialTransfer,
348 									size_t bytesTransferred);
349 
350 			size_t				NumAllocated() { return fNumAllocated; }
351 
352 private:
353 			void				_RemoveAllocated(size_t unbusyCount, size_t removeCount);
354 
355 private:
356 			block_cache* 		fCache;
357 			off_t				fBlockNumber;
358 			size_t				fNumRequested;
359 			size_t				fNumAllocated;
360 			cached_block** 		fBlocks;
361 			generic_io_vec* 	fDestVecs;
362 };
363 #endif // !BUILDING_USERLAND_FS_SERVER
364 
365 
366 class TransactionLocking {
367 public:
368 	inline bool Lock(block_cache* cache)
369 	{
370 		mutex_lock(&cache->lock);
371 
372 		while (cache->busy_writing_count != 0) {
373 			// wait for all blocks to be written
374 			ConditionVariableEntry entry;
375 			cache->busy_writing_condition.Add(&entry);
376 			cache->busy_writing_waiters = true;
377 
378 			mutex_unlock(&cache->lock);
379 
380 			entry.Wait();
381 
382 			mutex_lock(&cache->lock);
383 		}
384 
385 		return true;
386 	}
387 
388 	inline void Unlock(block_cache* cache)
389 	{
390 		mutex_unlock(&cache->lock);
391 	}
392 };
393 
394 typedef AutoLocker<block_cache, TransactionLocking> TransactionLocker;
395 
396 } // namespace
397 
398 
399 #if BLOCK_CACHE_BLOCK_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
400 namespace BlockTracing {
401 
402 class Action : public AbstractTraceEntry {
403 public:
404 	Action(block_cache* cache, cached_block* block)
405 		:
406 		fCache(cache),
407 		fBlockNumber(block->block_number),
408 		fIsDirty(block->is_dirty),
409 		fHasOriginal(block->original_data != NULL),
410 		fHasParent(block->parent_data != NULL),
411 		fTransactionID(-1),
412 		fPreviousID(-1)
413 	{
414 		if (block->transaction != NULL)
415 			fTransactionID = block->transaction->id;
416 		if (block->previous_transaction != NULL)
417 			fPreviousID = block->previous_transaction->id;
418 	}
419 
420 	virtual void AddDump(TraceOutput& out)
421 	{
422 		out.Print("block cache %p, %s %" B_PRIu64 ", %c%c%c transaction %" B_PRId32
423 			" (previous id %" B_PRId32 ")\n", fCache, _Action(), fBlockNumber,
424 			fIsDirty ? 'd' : '-', fHasOriginal ? 'o' : '-',
425 			fHasParent ? 'p' : '-', fTransactionID, fPreviousID);
426 	}
427 
428 	virtual const char* _Action() const = 0;
429 
430 private:
431 	block_cache*		fCache;
432 	uint64				fBlockNumber;
433 	bool				fIsDirty;
434 	bool				fHasOriginal;
435 	bool				fHasParent;
436 	int32				fTransactionID;
437 	int32				fPreviousID;
438 };
439 
440 class Get : public Action {
441 public:
442 	Get(block_cache* cache, cached_block* block)
443 		:
444 		Action(cache, block)
445 	{
446 		Initialized();
447 	}
448 
449 	virtual const char* _Action() const { return "get"; }
450 };
451 
452 class Put : public Action {
453 public:
454 	Put(block_cache* cache, cached_block* block)
455 		:
456 		Action(cache, block)
457 	{
458 		Initialized();
459 	}
460 
461 	virtual const char* _Action() const { return "put"; }
462 };
463 
464 class Read : public Action {
465 public:
466 	Read(block_cache* cache, cached_block* block)
467 		:
468 		Action(cache, block)
469 	{
470 		Initialized();
471 	}
472 
473 	virtual const char* _Action() const { return "read"; }
474 };
475 
476 class Write : public Action {
477 public:
478 	Write(block_cache* cache, cached_block* block)
479 		:
480 		Action(cache, block)
481 	{
482 		Initialized();
483 	}
484 
485 	virtual const char* _Action() const { return "write"; }
486 };
487 
488 class Flush : public Action {
489 public:
490 	Flush(block_cache* cache, cached_block* block, bool getUnused = false)
491 		:
492 		Action(cache, block),
493 		fGetUnused(getUnused)
494 	{
495 		Initialized();
496 	}
497 
498 	virtual const char* _Action() const
499 		{ return fGetUnused ? "get-unused" : "flush"; }
500 
501 private:
502 	bool	fGetUnused;
503 };
504 
505 class Error : public AbstractTraceEntry {
506 public:
507 	Error(block_cache* cache, uint64 blockNumber, const char* message,
508 			status_t status = B_OK)
509 		:
510 		fCache(cache),
511 		fBlockNumber(blockNumber),
512 		fMessage(message),
513 		fStatus(status)
514 	{
515 		Initialized();
516 	}
517 
518 	virtual void AddDump(TraceOutput& out)
519 	{
520 		out.Print("block cache %p, error %" B_PRIu64 ", %s%s%s",
521 			fCache, fBlockNumber, fMessage, fStatus != B_OK ? ": " : "",
522 			fStatus != B_OK ? strerror(fStatus) : "");
523 	}
524 
525 private:
526 	block_cache*	fCache;
527 	uint64			fBlockNumber;
528 	const char*		fMessage;
529 	status_t		fStatus;
530 };
531 
532 #if BLOCK_CACHE_BLOCK_TRACING >= 2
533 class BlockData : public AbstractTraceEntry {
534 public:
535 	enum {
536 		kCurrent	= 0x01,
537 		kParent		= 0x02,
538 		kOriginal	= 0x04
539 	};
540 
541 	BlockData(block_cache* cache, cached_block* block, const char* message)
542 		:
543 		fCache(cache),
544 		fSize(cache->block_size),
545 		fBlockNumber(block->block_number),
546 		fMessage(message)
547 	{
548 		_Allocate(fCurrent, block->current_data);
549 		_Allocate(fParent, block->parent_data);
550 		_Allocate(fOriginal, block->original_data);
551 
552 #if KTRACE_PRINTF_STACK_TRACE
553 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
554 			false);
555 #endif
556 
557 		Initialized();
558 	}
559 
560 	virtual void AddDump(TraceOutput& out)
561 	{
562 		out.Print("block cache %p, block %" B_PRIu64 ", data %c%c%c: %s",
563 			fCache, fBlockNumber, fCurrent != NULL ? 'c' : '-',
564 			fParent != NULL ? 'p' : '-', fOriginal != NULL ? 'o' : '-',
565 			fMessage);
566 	}
567 
568 #if KTRACE_PRINTF_STACK_TRACE
569 	virtual void DumpStackTrace(TraceOutput& out)
570 	{
571 		out.PrintStackTrace(fStackTrace);
572 	}
573 #endif
574 
575 	void DumpBlocks(uint32 which, uint32 offset, uint32 size)
576 	{
577 		if ((which & kCurrent) != 0)
578 			DumpBlock(kCurrent, offset, size);
579 		if ((which & kParent) != 0)
580 			DumpBlock(kParent, offset, size);
581 		if ((which & kOriginal) != 0)
582 			DumpBlock(kOriginal, offset, size);
583 	}
584 
585 	void DumpBlock(uint32 which, uint32 offset, uint32 size)
586 	{
587 		if (offset > fSize) {
588 			kprintf("invalid offset (block size %" B_PRIu32 ")\n", fSize);
589 			return;
590 		}
591 		if (offset + size > fSize)
592 			size = fSize - offset;
593 
594 		const char* label;
595 		uint8* data;
596 
597 		if ((which & kCurrent) != 0) {
598 			label = "current";
599 			data = fCurrent;
600 		} else if ((which & kParent) != 0) {
601 			label = "parent";
602 			data = fParent;
603 		} else if ((which & kOriginal) != 0) {
604 			label = "original";
605 			data = fOriginal;
606 		} else
607 			return;
608 
609 		kprintf("%s: offset %" B_PRIu32 ", %" B_PRIu32 " bytes\n", label, offset, size);
610 
611 		static const uint32 kBlockSize = 16;
612 		data += offset;
613 
614 		for (uint32 i = 0; i < size;) {
615 			int start = i;
616 
617 			kprintf("  %04" B_PRIx32 " ", i);
618 			for (; i < start + kBlockSize; i++) {
619 				if (!(i % 4))
620 					kprintf(" ");
621 
622 				if (i >= size)
623 					kprintf("  ");
624 				else
625 					kprintf("%02x", data[i]);
626 			}
627 
628 			kprintf("\n");
629 		}
630 	}
631 
632 private:
633 	void _Allocate(uint8*& target, void* source)
634 	{
635 		if (source == NULL) {
636 			target = NULL;
637 			return;
638 		}
639 
640 		target = alloc_tracing_buffer_memcpy(source, fSize, false);
641 	}
642 
643 	block_cache*	fCache;
644 	uint32			fSize;
645 	uint64			fBlockNumber;
646 	const char*		fMessage;
647 	uint8*			fCurrent;
648 	uint8*			fParent;
649 	uint8*			fOriginal;
650 #if KTRACE_PRINTF_STACK_TRACE
651 	tracing_stack_trace* fStackTrace;
652 #endif
653 };
654 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
655 
656 }	// namespace BlockTracing
657 
658 #	define TB(x) new(std::nothrow) BlockTracing::x;
659 #else
660 #	define TB(x) ;
661 #endif
662 
663 #if BLOCK_CACHE_BLOCK_TRACING >= 2
664 #	define TB2(x) new(std::nothrow) BlockTracing::x;
665 #else
666 #	define TB2(x) ;
667 #endif
668 
669 
670 #if BLOCK_CACHE_TRANSACTION_TRACING && !defined(BUILDING_USERLAND_FS_SERVER)
671 namespace TransactionTracing {
672 
673 class Action : public AbstractTraceEntry {
674 public:
675 	Action(const char* label, block_cache* cache,
676 			cache_transaction* transaction)
677 		:
678 		fCache(cache),
679 		fTransaction(transaction),
680 		fID(transaction->id),
681 		fSub(transaction->has_sub_transaction),
682 		fNumBlocks(transaction->num_blocks),
683 		fSubNumBlocks(transaction->sub_num_blocks)
684 	{
685 		strlcpy(fLabel, label, sizeof(fLabel));
686 		Initialized();
687 	}
688 
689 	virtual void AddDump(TraceOutput& out)
690 	{
691 		out.Print("block cache %p, %s transaction %p (id %" B_PRId32 ")%s"
692 			", %" B_PRId32 "/%" B_PRId32 " blocks", fCache, fLabel, fTransaction,
693 			fID, fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
694 	}
695 
696 private:
697 	char				fLabel[12];
698 	block_cache*		fCache;
699 	cache_transaction*	fTransaction;
700 	int32				fID;
701 	bool				fSub;
702 	int32				fNumBlocks;
703 	int32				fSubNumBlocks;
704 };
705 
706 class Detach : public AbstractTraceEntry {
707 public:
708 	Detach(block_cache* cache, cache_transaction* transaction,
709 			cache_transaction* newTransaction)
710 		:
711 		fCache(cache),
712 		fTransaction(transaction),
713 		fID(transaction->id),
714 		fSub(transaction->has_sub_transaction),
715 		fNewTransaction(newTransaction),
716 		fNewID(newTransaction->id)
717 	{
718 		Initialized();
719 	}
720 
721 	virtual void AddDump(TraceOutput& out)
722 	{
723 		out.Print("block cache %p, detach transaction %p (id %" B_PRId32 ")"
724 			"from transaction %p (id %" B_PRId32 ")%s",
725 			fCache, fNewTransaction, fNewID, fTransaction, fID,
726 			fSub ? " sub" : "");
727 	}
728 
729 private:
730 	block_cache*		fCache;
731 	cache_transaction*	fTransaction;
732 	int32				fID;
733 	bool				fSub;
734 	cache_transaction*	fNewTransaction;
735 	int32				fNewID;
736 };
737 
738 class Abort : public AbstractTraceEntry {
739 public:
740 	Abort(block_cache* cache, cache_transaction* transaction)
741 		:
742 		fCache(cache),
743 		fTransaction(transaction),
744 		fID(transaction->id),
745 		fNumBlocks(0)
746 	{
747 		bool isSub = transaction->has_sub_transaction;
748 		fNumBlocks = isSub ? transaction->sub_num_blocks
749 			: transaction->num_blocks;
750 		fBlocks = (off_t*)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
751 		if (fBlocks != NULL) {
752 			cached_block* block = transaction->first_block;
753 			for (int32 i = 0; block != NULL && i < fNumBlocks;
754 					block = block->transaction_next) {
755 				fBlocks[i++] = block->block_number;
756 			}
757 		} else
758 			fNumBlocks = 0;
759 
760 #if KTRACE_PRINTF_STACK_TRACE
761 		fStackTrace = capture_tracing_stack_trace(KTRACE_PRINTF_STACK_TRACE, 1,
762 			false);
763 #endif
764 
765 		Initialized();
766 	}
767 
768 	virtual void AddDump(TraceOutput& out)
769 	{
770 		out.Print("block cache %p, abort transaction "
771 			"%p (id %" B_PRId32 "), blocks", fCache, fTransaction, fID);
772 		for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
773 			out.Print(" %" B_PRIdOFF, fBlocks[i]);
774 	}
775 
776 #if KTRACE_PRINTF_STACK_TRACE
777 	virtual void DumpStackTrace(TraceOutput& out)
778 	{
779 		out.PrintStackTrace(fStackTrace);
780 	}
781 #endif
782 
783 private:
784 	block_cache*		fCache;
785 	cache_transaction*	fTransaction;
786 	int32				fID;
787 	off_t*				fBlocks;
788 	int32				fNumBlocks;
789 #if KTRACE_PRINTF_STACK_TRACE
790 	tracing_stack_trace* fStackTrace;
791 #endif
792 };
793 
794 }	// namespace TransactionTracing
795 
796 #	define T(x) new(std::nothrow) TransactionTracing::x;
797 #else
798 #	define T(x) ;
799 #endif
800 
801 
802 static DoublyLinkedList<block_cache> sCaches;
803 static mutex sCachesLock = MUTEX_INITIALIZER("block caches");
804 static mutex sCachesMemoryUseLock
805 	= MUTEX_INITIALIZER("block caches memory use");
806 static size_t sUsedMemory;
807 static sem_id sEventSemaphore;
808 static mutex sNotificationsLock
809 	= MUTEX_INITIALIZER("block cache notifications");
810 static thread_id sNotifierWriterThread;
811 static DoublyLinkedListLink<block_cache> sMarkCache;
812 	// TODO: this only works if the link is the first entry of block_cache
813 static object_cache* sBlockCache;
814 
815 
816 static void mark_block_busy_reading(block_cache* cache, cached_block* block);
817 static void mark_block_unbusy_reading(block_cache* cache, cached_block* block);
818 
819 
820 //	#pragma mark - notifications/listener
821 
822 
823 /*!	Checks whether or not this is an event that closes a transaction. */
824 static inline bool
825 is_closing_event(int32 event)
826 {
827 	return (event & (TRANSACTION_ABORTED | TRANSACTION_ENDED)) != 0;
828 }
829 
830 
831 static inline bool
832 is_written_event(int32 event)
833 {
834 	return (event & TRANSACTION_WRITTEN) != 0;
835 }
836 
837 
838 /*!	From the specified \a notification, it will remove the lowest pending
839 	event, and return that one in \a _event.
840 	If there is no pending event anymore, it will return \c false.
841 */
842 static bool
843 get_next_pending_event(cache_notification* notification, int32* _event)
844 {
845 	for (int32 eventMask = 1; eventMask <= TRANSACTION_IDLE; eventMask <<= 1) {
846 		int32 pending = atomic_and(&notification->events_pending,
847 			~eventMask);
848 
849 		bool more = (pending & ~eventMask) != 0;
850 
851 		if ((pending & eventMask) != 0) {
852 			*_event = eventMask;
853 			return more;
854 		}
855 	}
856 
857 	return false;
858 }
859 
860 
861 static void
862 flush_pending_notifications(block_cache* cache)
863 {
864 	ASSERT_LOCKED_MUTEX(&sCachesLock);
865 
866 	while (true) {
867 		MutexLocker locker(sNotificationsLock);
868 
869 		cache_notification* notification = cache->pending_notifications.Head();
870 		if (notification == NULL)
871 			return;
872 
873 		bool deleteAfterEvent = false;
874 		int32 event = -1;
875 		if (!get_next_pending_event(notification, &event)) {
876 			// remove the notification if this was the last pending event
877 			cache->pending_notifications.Remove(notification);
878 			deleteAfterEvent = notification->delete_after_event;
879 		}
880 
881 		if (event >= 0) {
882 			// Notify listener, we need to copy the notification, as it might
883 			// be removed when we unlock the list.
884 			cache_notification copy = *notification;
885 			locker.Unlock();
886 
887 			copy.hook(copy.transaction_id, event, copy.data);
888 
889 			locker.Lock();
890 		}
891 
892 		if (deleteAfterEvent)
893 			delete notification;
894 	}
895 }
896 
897 
898 /*!	Flushes all pending notifications by calling the appropriate hook
899 	functions.
900 	Must not be called with a cache lock held.
901 */
902 static void
903 flush_pending_notifications()
904 {
905 	MutexLocker _(sCachesLock);
906 
907 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
908 	while (iterator.HasNext()) {
909 		block_cache* cache = iterator.Next();
910 
911 		flush_pending_notifications(cache);
912 	}
913 }
914 
915 
916 /*!	Initializes the \a notification as specified. */
917 static void
918 set_notification(cache_transaction* transaction,
919 	cache_notification &notification, int32 events,
920 	transaction_notification_hook hook, void* data)
921 {
922 	notification.transaction_id = transaction != NULL ? transaction->id : -1;
923 	notification.events_pending = 0;
924 	notification.events = events;
925 	notification.hook = hook;
926 	notification.data = data;
927 	notification.delete_after_event = false;
928 }
929 
930 
931 /*!	Makes sure the notification is deleted. It either deletes it directly,
932 	when possible, or marks it for deletion if the notification is pending.
933 */
934 static void
935 delete_notification(cache_notification* notification)
936 {
937 	MutexLocker locker(sNotificationsLock);
938 
939 	if (notification->events_pending != 0)
940 		notification->delete_after_event = true;
941 	else
942 		delete notification;
943 }
944 
945 
946 /*!	Adds the notification to the pending notifications list, or, if it's
947 	already part of it, updates its events_pending field.
948 	Also marks the notification to be deleted if \a deleteNotification
949 	is \c true.
950 	Triggers the notifier thread to run.
951 */
952 static void
953 add_notification(block_cache* cache, cache_notification* notification,
954 	int32 event, bool deleteNotification)
955 {
956 	if (notification->hook == NULL)
957 		return;
958 
959 	int32 pending = atomic_or(&notification->events_pending, event);
960 	if (pending == 0) {
961 		// not yet part of the notification list
962 		MutexLocker locker(sNotificationsLock);
963 		if (deleteNotification)
964 			notification->delete_after_event = true;
965 		cache->pending_notifications.Add(notification);
966 	} else if (deleteNotification) {
967 		// we might need to delete it ourselves if we're late
968 		delete_notification(notification);
969 	}
970 
971 	release_sem_etc(sEventSemaphore, 1, B_DO_NOT_RESCHEDULE);
972 		// We're probably still holding some locks that makes rescheduling
973 		// not a good idea at this point.
974 }
975 
976 
977 /*!	Notifies all interested listeners of this transaction about the \a event.
978 	If \a event is a closing event (ie. TRANSACTION_ENDED, and
979 	TRANSACTION_ABORTED), all listeners except those listening to
980 	TRANSACTION_WRITTEN will be removed.
981 */
982 static void
983 notify_transaction_listeners(block_cache* cache, cache_transaction* transaction,
984 	int32 event)
985 {
986 	T(Action("notify", cache, transaction));
987 
988 	bool isClosing = is_closing_event(event);
989 	bool isWritten = is_written_event(event);
990 
991 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
992 	while (iterator.HasNext()) {
993 		cache_listener* listener = iterator.Next();
994 
995 		bool remove = (isClosing && !is_written_event(listener->events))
996 			|| (isWritten && is_written_event(listener->events));
997 		if (remove)
998 			iterator.Remove();
999 
1000 		if ((listener->events & event) != 0)
1001 			add_notification(cache, listener, event, remove);
1002 		else if (remove)
1003 			delete_notification(listener);
1004 	}
1005 }
1006 
1007 
1008 /*!	Removes and deletes all listeners that are still monitoring this
1009 	transaction.
1010 */
1011 static void
1012 remove_transaction_listeners(block_cache* cache, cache_transaction* transaction)
1013 {
1014 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
1015 	while (iterator.HasNext()) {
1016 		cache_listener* listener = iterator.Next();
1017 		iterator.Remove();
1018 
1019 		delete_notification(listener);
1020 	}
1021 }
1022 
1023 
1024 static status_t
1025 add_transaction_listener(block_cache* cache, cache_transaction* transaction,
1026 	int32 events, transaction_notification_hook hookFunction, void* data)
1027 {
1028 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
1029 	while (iterator.HasNext()) {
1030 		cache_listener* listener = iterator.Next();
1031 
1032 		if (listener->data == data && listener->hook == hookFunction) {
1033 			// this listener already exists, just update it
1034 			listener->events |= events;
1035 			return B_OK;
1036 		}
1037 	}
1038 
1039 	cache_listener* listener = new cache_listener;
1040 	if (listener == NULL)
1041 		return B_NO_MEMORY;
1042 
1043 	set_notification(transaction, *listener, events, hookFunction, data);
1044 	transaction->listeners.Add(listener);
1045 	return B_OK;
1046 }
1047 
1048 
1049 //	#pragma mark - private transaction
1050 
1051 
1052 cache_transaction::cache_transaction()
1053 {
1054 	num_blocks = 0;
1055 	main_num_blocks = 0;
1056 	sub_num_blocks = 0;
1057 	first_block = NULL;
1058 	open = true;
1059 	has_sub_transaction = false;
1060 	last_used = system_time();
1061 	busy_writing_count = 0;
1062 }
1063 
1064 
1065 static void
1066 delete_transaction(block_cache* cache, cache_transaction* transaction)
1067 {
1068 	if (cache->last_transaction == transaction)
1069 		cache->last_transaction = NULL;
1070 
1071 	remove_transaction_listeners(cache, transaction);
1072 	delete transaction;
1073 }
1074 
1075 
1076 static cache_transaction*
1077 lookup_transaction(block_cache* cache, int32 id)
1078 {
1079 	return cache->transaction_hash->Lookup(id);
1080 }
1081 
1082 
1083 size_t TransactionHash::Hash(cache_transaction* transaction) const
1084 {
1085 	return transaction->id;
1086 }
1087 
1088 
1089 bool TransactionHash::Compare(int32 key, cache_transaction* transaction) const
1090 {
1091 	return transaction->id == key;
1092 }
1093 
1094 
1095 cache_transaction*& TransactionHash::GetLink(cache_transaction* value) const
1096 {
1097 	return value->next;
1098 }
1099 
1100 
1101 /*!	Writes back any changes made to blocks in \a transaction that are still
1102 	part of a previous transacton.
1103 */
1104 static status_t
1105 write_blocks_in_previous_transaction(block_cache* cache,
1106 	cache_transaction* transaction)
1107 {
1108 	BlockWriter writer(cache);
1109 
1110 	cached_block* block = transaction->first_block;
1111 	for (; block != NULL; block = block->transaction_next) {
1112 		if (block->previous_transaction != NULL) {
1113 			// need to write back pending changes
1114 			writer.Add(block);
1115 		}
1116 	}
1117 
1118 	return writer.Write();
1119 }
1120 
1121 
1122 //	#pragma mark - cached_block
1123 
1124 
1125 bool
1126 cached_block::CanBeWritten() const
1127 {
1128 	return !busy_writing && !busy_reading
1129 		&& (previous_transaction != NULL
1130 			|| (transaction == NULL && is_dirty && !is_writing));
1131 }
1132 
1133 
1134 //	#pragma mark - BlockWriter
1135 
1136 
1137 BlockWriter::BlockWriter(block_cache* cache, size_t max)
1138 	:
1139 	fCache(cache),
1140 	fBlocks(fBuffer),
1141 	fCount(0),
1142 	fTotal(0),
1143 	fCapacity(kBufferSize),
1144 	fMax(max),
1145 	fStatus(B_OK),
1146 	fDeletedTransaction(false)
1147 {
1148 }
1149 
1150 
1151 BlockWriter::~BlockWriter()
1152 {
1153 	if (fBlocks != fBuffer)
1154 		free(fBlocks);
1155 }
1156 
1157 
1158 /*!	Adds the specified block to the to be written array. If no more blocks can
1159 	be added, false is returned, otherwise true.
1160 */
1161 bool
1162 BlockWriter::Add(cached_block* block, cache_transaction* transaction)
1163 {
1164 	ASSERT(block->CanBeWritten());
1165 
1166 	if (fTotal == fMax)
1167 		return false;
1168 
1169 	if (fCount >= fCapacity) {
1170 		// Enlarge array if necessary
1171 		cached_block** newBlocks;
1172 		size_t newCapacity = max_c(256, fCapacity * 2);
1173 		if (fBlocks == fBuffer)
1174 			newBlocks = (cached_block**)malloc(newCapacity * sizeof(void*));
1175 		else {
1176 			newBlocks = (cached_block**)realloc(fBlocks,
1177 				newCapacity * sizeof(void*));
1178 		}
1179 
1180 		if (newBlocks == NULL) {
1181 			// Allocating a larger array failed - we need to write back what
1182 			// we have synchronously now (this will also clear the array)
1183 			Write(transaction, false);
1184 		} else {
1185 			if (fBlocks == fBuffer)
1186 				memcpy(newBlocks, fBuffer, kBufferSize * sizeof(void*));
1187 
1188 			fBlocks = newBlocks;
1189 			fCapacity = newCapacity;
1190 		}
1191 	}
1192 
1193 	fBlocks[fCount++] = block;
1194 	fTotal++;
1195 	block->busy_writing = true;
1196 	fCache->busy_writing_count++;
1197 	if (block->previous_transaction != NULL)
1198 		block->previous_transaction->busy_writing_count++;
1199 
1200 	return true;
1201 }
1202 
1203 
1204 /*!	Adds all blocks of the specified transaction to the to be written array.
1205 	If no more blocks can be added, false is returned, otherwise true.
1206 */
1207 bool
1208 BlockWriter::Add(cache_transaction* transaction, bool& hasLeftOvers)
1209 {
1210 	ASSERT(!transaction->open);
1211 
1212 	if (transaction->busy_writing_count != 0) {
1213 		hasLeftOvers = true;
1214 		return true;
1215 	}
1216 
1217 	hasLeftOvers = false;
1218 
1219 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
1220 	while (cached_block* block = blockIterator.Next()) {
1221 		if (!block->CanBeWritten()) {
1222 			// This block was already part of a previous transaction within this
1223 			// writer
1224 			hasLeftOvers = true;
1225 			continue;
1226 		}
1227 		if (!Add(block, transaction))
1228 			return false;
1229 
1230 		if (DeletedTransaction())
1231 			break;
1232 	}
1233 
1234 	return true;
1235 }
1236 
1237 
1238 /*! Cache must be locked when calling this method, but it will be unlocked
1239 	while the blocks are written back.
1240 */
1241 status_t
1242 BlockWriter::Write(cache_transaction* transaction, bool canUnlock)
1243 {
1244 	if (fCount == 0)
1245 		return B_OK;
1246 
1247 	if (canUnlock)
1248 		mutex_unlock(&fCache->lock);
1249 
1250 	// Sort blocks in their on-disk order, so we can merge consecutive writes.
1251 	qsort(fBlocks, fCount, sizeof(void*), &_CompareBlocks);
1252 	fDeletedTransaction = false;
1253 
1254 	bigtime_t start = system_time();
1255 
1256 	for (uint32 i = 0; i < fCount; i++) {
1257 		uint32 blocks = 1;
1258 		for (; (i + blocks) < fCount && blocks < IOV_MAX; blocks++) {
1259 			const uint32 j = i + blocks;
1260 			if (fBlocks[j]->block_number != (fBlocks[j - 1]->block_number + 1))
1261 				break;
1262 		}
1263 
1264 		status_t status = _WriteBlocks(fBlocks + i, blocks);
1265 		if (status != B_OK) {
1266 			// propagate to global error handling
1267 			if (fStatus == B_OK)
1268 				fStatus = status;
1269 
1270 			for (uint32 j = i; j < (i + blocks); j++) {
1271 				_UnmarkWriting(fBlocks[j]);
1272 				fBlocks[j] = NULL;
1273 					// This block will not be marked clean
1274 			}
1275 		}
1276 
1277 		i += (blocks - 1);
1278 	}
1279 
1280 	bigtime_t finish = system_time();
1281 
1282 	if (canUnlock)
1283 		mutex_lock(&fCache->lock);
1284 
1285 	if (fStatus == B_OK && fCount >= 8) {
1286 		fCache->last_block_write = finish;
1287 		fCache->last_block_write_duration = (fCache->last_block_write - start)
1288 			/ fCount;
1289 	}
1290 
1291 	for (uint32 i = 0; i < fCount; i++)
1292 		_BlockDone(fBlocks[i], transaction);
1293 
1294 	fCount = 0;
1295 	return fStatus;
1296 }
1297 
1298 
1299 /*!	Writes the specified \a block back to disk. It will always only write back
1300 	the oldest change of the block if it is part of more than one transaction.
1301 	It will automatically send out TRANSACTION_WRITTEN notices, as well as
1302 	delete transactions when they are no longer used, and \a deleteTransaction
1303 	is \c true.
1304 */
1305 /*static*/ status_t
1306 BlockWriter::WriteBlock(block_cache* cache, cached_block* block)
1307 {
1308 	BlockWriter writer(cache);
1309 
1310 	writer.Add(block);
1311 	return writer.Write();
1312 }
1313 
1314 
1315 void*
1316 BlockWriter::_Data(cached_block* block) const
1317 {
1318 	return block->previous_transaction != NULL && block->original_data != NULL
1319 		? block->original_data : block->current_data;
1320 		// We first need to write back changes from previous transactions
1321 }
1322 
1323 
1324 status_t
1325 BlockWriter::_WriteBlocks(cached_block** blocks, uint32 count)
1326 {
1327 	const size_t blockSize = fCache->block_size;
1328 
1329 	BStackOrHeapArray<iovec, 8> vecs(count);
1330 	for (uint32 i = 0; i < count; i++) {
1331 		cached_block* block = blocks[i];
1332 		ASSERT(block->busy_writing);
1333 		ASSERT(i == 0 || block->block_number == (blocks[i - 1]->block_number + 1));
1334 
1335 		TRACE(("BlockWriter::_WriteBlocks(block %" B_PRIdOFF ", count %" B_PRIu32 ")\n",
1336 			block->block_number, count));
1337 		TB(Write(fCache, block));
1338 		TB2(BlockData(fCache, block, "before write"));
1339 
1340 		vecs[i].iov_base = _Data(block);
1341 		vecs[i].iov_len = blockSize;
1342 	}
1343 
1344 	ssize_t written = writev_pos(fCache->fd,
1345 		blocks[0]->block_number * blockSize, vecs, count);
1346 
1347 	if (written != (ssize_t)(blockSize * count)) {
1348 		TB(Error(fCache, block->block_number, "write failed", written));
1349 		TRACE_ALWAYS("could not write back %" B_PRIu32 " blocks (start block %" B_PRIdOFF "): %s\n",
1350 			count, blocks[0]->block_number, strerror(errno));
1351 		if (written < 0)
1352 			return errno;
1353 
1354 		return B_IO_ERROR;
1355 	}
1356 
1357 	return B_OK;
1358 }
1359 
1360 
1361 void
1362 BlockWriter::_BlockDone(cached_block* block,
1363 	cache_transaction* transaction)
1364 {
1365 	if (block == NULL) {
1366 		// An error occured when trying to write this block
1367 		return;
1368 	}
1369 
1370 	if (fCache->num_dirty_blocks > 0)
1371 		fCache->num_dirty_blocks--;
1372 
1373 	if (_Data(block) == block->current_data)
1374 		block->is_dirty = false;
1375 
1376 	_UnmarkWriting(block);
1377 
1378 	cache_transaction* previous = block->previous_transaction;
1379 	if (previous != NULL) {
1380 		previous->blocks.Remove(block);
1381 		block->previous_transaction = NULL;
1382 
1383 		if (block->original_data != NULL && block->transaction == NULL) {
1384 			// This block is not part of a transaction, so it does not need
1385 			// its original pointer anymore.
1386 			fCache->Free(block->original_data);
1387 			block->original_data = NULL;
1388 		}
1389 
1390 		// Has the previous transaction been finished with that write?
1391 		if (--previous->num_blocks == 0) {
1392 			TRACE(("cache transaction %" B_PRId32 " finished!\n", previous->id));
1393 			T(Action("written", fCache, previous));
1394 
1395 			notify_transaction_listeners(fCache, previous,
1396 				TRANSACTION_WRITTEN);
1397 
1398 			if (transaction != NULL) {
1399 				// This function is called while iterating transaction_hash. We
1400 				// use RemoveUnchecked so the iterator is still valid. A regular
1401 				// Remove can trigger a resize of the hash table which would
1402 				// result in the linked items in the table changing order.
1403 				fCache->transaction_hash->RemoveUnchecked(transaction);
1404 			} else
1405 				fCache->transaction_hash->Remove(previous);
1406 
1407 			delete_transaction(fCache, previous);
1408 			fDeletedTransaction = true;
1409 		}
1410 	}
1411 	if (block->transaction == NULL && block->ref_count == 0 && !block->unused) {
1412 		// the block is no longer used
1413 		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1414 		block->unused = true;
1415 		fCache->unused_blocks.Add(block);
1416 		fCache->unused_block_count++;
1417 	}
1418 
1419 	TB2(BlockData(fCache, block, "after write"));
1420 }
1421 
1422 
1423 void
1424 BlockWriter::_UnmarkWriting(cached_block* block)
1425 {
1426 	block->busy_writing = false;
1427 	if (block->previous_transaction != NULL)
1428 		block->previous_transaction->busy_writing_count--;
1429 	fCache->busy_writing_count--;
1430 
1431 	if ((fCache->busy_writing_waiters && fCache->busy_writing_count == 0)
1432 		|| block->busy_writing_waiters) {
1433 		fCache->busy_writing_waiters = false;
1434 		block->busy_writing_waiters = false;
1435 		fCache->busy_writing_condition.NotifyAll();
1436 	}
1437 }
1438 
1439 
1440 /*static*/ int
1441 BlockWriter::_CompareBlocks(const void* _blockA, const void* _blockB)
1442 {
1443 	cached_block* blockA = *(cached_block**)_blockA;
1444 	cached_block* blockB = *(cached_block**)_blockB;
1445 
1446 	off_t diff = blockA->block_number - blockB->block_number;
1447 	if (diff > 0)
1448 		return 1;
1449 
1450 	return diff < 0 ? -1 : 0;
1451 }
1452 
1453 
1454 #ifndef BUILDING_USERLAND_FS_SERVER
1455 //	#pragma mark - BlockPrefetcher
1456 
1457 
1458 BlockPrefetcher::BlockPrefetcher(block_cache* cache, off_t blockNumber, size_t numBlocks)
1459 	:
1460 	fCache(cache),
1461 	fBlockNumber(blockNumber),
1462 	fNumRequested(numBlocks),
1463 	fNumAllocated(0)
1464 {
1465 	fBlocks = new cached_block*[numBlocks];
1466 	fDestVecs = new generic_io_vec[numBlocks];
1467 }
1468 
1469 
1470 BlockPrefetcher::~BlockPrefetcher()
1471 {
1472 	delete[] fBlocks;
1473 	delete[] fDestVecs;
1474 }
1475 
1476 
1477 /*!	Allocates cached_block objects in preparation for prefetching.
1478 	@return If an error is returned, then no blocks have been allocated.
1479 	@post Blocks have been constructed (including allocating the current_data member)
1480 	but current_data is uninitialized.
1481 */
1482 status_t
1483 BlockPrefetcher::Allocate()
1484 {
1485 	TRACE(("BlockPrefetcher::Allocate: looking up %" B_PRIuSIZE " blocks, starting with %"
1486 		B_PRIdOFF "\n", fNumBlocks, fBlockNumber));
1487 
1488 	ASSERT_LOCKED_MUTEX(&fCache->lock);
1489 
1490 	size_t finalNumBlocks = fNumRequested;
1491 
1492 	// determine whether any requested blocks are already cached
1493 	for (size_t i = 0; i < fNumRequested; ++i) {
1494 		off_t blockNumIter = fBlockNumber + i;
1495 		if (blockNumIter < 0 || blockNumIter >= fCache->max_blocks) {
1496 			panic("BlockPrefetcher::Allocate: invalid block number %" B_PRIdOFF " (max %"
1497 				B_PRIdOFF ")", blockNumIter, fCache->max_blocks - 1);
1498 			return B_BAD_VALUE;
1499 		}
1500 		cached_block* block = fCache->hash->Lookup(blockNumIter);
1501 		if (block != NULL) {
1502 			// truncate the request
1503 			TRACE(("BlockPrefetcher::Allocate: found an existing block (%" B_PRIdOFF ")\n",
1504 				blockNumIter));
1505 			fBlocks[i] = NULL;
1506 			finalNumBlocks = i;
1507 			break;
1508 		}
1509 	}
1510 
1511 	// allocate the blocks
1512 	for (size_t i = 0; i < finalNumBlocks; ++i) {
1513 		cached_block* block = fCache->NewBlock(fBlockNumber + i);
1514 		if (block == NULL) {
1515 			_RemoveAllocated(0, i);
1516 			return B_NO_MEMORY;
1517 		}
1518 		fCache->hash->Insert(block);
1519 
1520 		block->unused = true;
1521 		fCache->unused_blocks.Add(block);
1522 		fCache->unused_block_count++;
1523 
1524 		fBlocks[i] = block;
1525 	}
1526 
1527 	fNumAllocated = finalNumBlocks;
1528 
1529 	return B_OK;
1530 }
1531 
1532 
1533 /*!	Schedules reads from disk to cache.
1534 	@return If an error is returned, then the previously allocated blocks have been cleaned up.
1535 	@post The calling object will eventually be deleted by IterativeIOFinishedHook.
1536 */
1537 status_t
1538 BlockPrefetcher::ReadAsync()
1539 {
1540 	TRACE(("BlockPrefetcher::Read: reading %" B_PRIuSIZE " blocks\n", fNumAllocated));
1541 
1542 	size_t blockSize = fCache->block_size;
1543 	generic_io_vec* vecs = fDestVecs;
1544 	for (size_t i = 0; i < fNumAllocated; ++i) {
1545 		vecs[i].base = reinterpret_cast<generic_addr_t>(fBlocks[i]->current_data);
1546 		vecs[i].length = blockSize;
1547 		mark_block_busy_reading(fCache, fBlocks[i]);
1548 	}
1549 
1550 	IORequest* request = new IORequest;
1551 	status_t status = request->Init(fBlockNumber * blockSize, vecs, fNumAllocated,
1552 		fNumAllocated * blockSize, false, B_DELETE_IO_REQUEST);
1553 	if (status != B_OK) {
1554 		TB(Error(fCache, fBlockNumber, "IORequest::Init starting here failed", status));
1555 		TRACE_ALWAYS("BlockPrefetcher::Read: failed to initialize IO request for %" B_PRIuSIZE
1556 			" blocks starting with %" B_PRIdOFF ": %s\n",
1557 			fNumAllocated, fBlockNumber, strerror(status));
1558 		_RemoveAllocated(fNumAllocated, fNumAllocated);
1559 		delete request;
1560 		return status;
1561 	}
1562 
1563 	return do_iterative_fd_io(fCache->fd, request, IterativeIOGetVecsHook, IterativeIOFinishedHook,
1564 		this);
1565 }
1566 
1567 
1568 /*static*/ status_t
1569 BlockPrefetcher::IterativeIOGetVecsHook(void* cookie, io_request* request, off_t offset,
1570 	size_t size, struct file_io_vec* vecs, size_t* _count)
1571 {
1572 	TRACE(("BlockPrefetcher::IterativeIOGetVecsHook: setting offset %" B_PRIdOFF " and length %"
1573 		B_PRIuSIZE "\n", offset, size));
1574 
1575 	if (*_count == 0)
1576 		return B_OK;
1577 
1578 	vecs[0].offset = offset;
1579 		// the requested offset was volume-relative to begin with
1580 	vecs[0].length = size;
1581 		// the request is always for a contiguous run of blocks
1582 	*_count = 1;
1583 
1584 	return B_OK;
1585 }
1586 
1587 
1588 /*static*/ status_t
1589 BlockPrefetcher::IterativeIOFinishedHook(void* cookie, io_request* request, status_t status,
1590 	bool partialTransfer, size_t bytesTransferred)
1591 {
1592 	TRACE(("BlockPrefetcher::IterativeIOFinishedHook: status %s, partial %d\n", strerror(status),
1593 		partialTransfer));
1594 
1595 	BlockPrefetcher* blockPrefetcher = reinterpret_cast<BlockPrefetcher*>(cookie);
1596 	block_cache* cache = blockPrefetcher->fCache;
1597 	cached_block** blocks = blockPrefetcher->fBlocks;
1598 	size_t blockSize = cache->block_size;
1599 	off_t blockNumber = blockPrefetcher->fBlockNumber;
1600 	size_t numBlocks = blockPrefetcher->fNumAllocated;
1601 
1602 	MutexLocker locker(&cache->lock);
1603 
1604 	if (bytesTransferred < numBlocks * blockSize) {
1605 		blockPrefetcher->_RemoveAllocated(numBlocks, numBlocks);
1606 		TB(Error(cache, blockNumber, "prefetch starting here failed", status));
1607 		TRACE_ALWAYS("prefetch_iterative_io_finished_hook: transferred only %" B_PRIuSIZE
1608 			" bytes in attempt to read %" B_PRIuSIZE " blocks (start block %" B_PRIdOFF "): %s\n",
1609 			bytesTransferred, numBlocks, blockNumber, strerror(status));
1610 	} else {
1611 		for (size_t i = 0; i < numBlocks; ++i) {
1612 			TB(Read(cache, blockNumber + i));
1613 			mark_block_unbusy_reading(cache, blocks[i]);
1614 			blocks[i]->last_accessed = system_time() / 1000000L;
1615 		}
1616 	}
1617 
1618 	delete blockPrefetcher;
1619 
1620 	return status;
1621 }
1622 
1623 
1624 /*!	Cleans up blocks that were allocated for prefetching when an in-progress prefetch
1625 	is cancelled.
1626 */
1627 void
1628 BlockPrefetcher::_RemoveAllocated(size_t unbusyCount, size_t removeCount)
1629 {
1630 	TRACE(("BlockPrefetcher::_RemoveAllocated:  unbusy %" B_PRIuSIZE " and remove %" B_PRIuSIZE
1631 		" starting with %" B_PRIdOFF "\n", unbusyCount, removeCount, (*fBlocks)->block_number));
1632 
1633 	ASSERT_LOCKED_MUTEX(&fCache->lock);
1634 
1635 	for (size_t i = 0; i < unbusyCount; ++i)
1636 		mark_block_unbusy_reading(fCache, fBlocks[i]);
1637 
1638 	for (size_t i = 0; i < removeCount; ++i) {
1639 		ASSERT(fBlocks[i]->is_dirty == false && fBlocks[i]->unused == true);
1640 
1641 		fCache->unused_blocks.Remove(fBlocks[i]);
1642 		fCache->unused_block_count--;
1643 
1644 		fCache->RemoveBlock(fBlocks[i]);
1645 		fBlocks[i] = NULL;
1646 	}
1647 
1648 	fNumAllocated = 0;
1649 
1650 	return;
1651 }
1652 #endif // !BUILDING_USERLAND_FS_SERVER
1653 
1654 
1655 //	#pragma mark - block_cache
1656 
1657 
1658 block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
1659 		bool readOnly)
1660 	:
1661 	hash(NULL),
1662 	fd(_fd),
1663 	max_blocks(numBlocks),
1664 	block_size(blockSize),
1665 	next_transaction_id(1),
1666 	last_transaction(NULL),
1667 	transaction_hash(NULL),
1668 	buffer_cache(NULL),
1669 	unused_block_count(0),
1670 	busy_reading_count(0),
1671 	busy_reading_waiters(false),
1672 	busy_writing_count(0),
1673 	busy_writing_waiters(0),
1674 	last_block_write(0),
1675 	last_block_write_duration(0),
1676 	num_dirty_blocks(0),
1677 	read_only(readOnly)
1678 {
1679 }
1680 
1681 
1682 /*! Should be called with the cache's lock held. */
1683 block_cache::~block_cache()
1684 {
1685 	unregister_low_resource_handler(&_LowMemoryHandler, this);
1686 
1687 	delete transaction_hash;
1688 	delete hash;
1689 
1690 	delete_object_cache(buffer_cache);
1691 
1692 	mutex_destroy(&lock);
1693 }
1694 
1695 
1696 status_t
1697 block_cache::Init()
1698 {
1699 	busy_reading_condition.Init(this, "cache block busy_reading");
1700 	busy_writing_condition.Init(this, "cache block busy writing");
1701 	condition_variable.Init(this, "cache transaction sync");
1702 	mutex_init(&lock, "block cache");
1703 
1704 	buffer_cache = create_object_cache_etc("block cache buffers", block_size,
1705 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1706 	if (buffer_cache == NULL)
1707 		return B_NO_MEMORY;
1708 
1709 	hash = new BlockTable();
1710 	if (hash == NULL || hash->Init(1024) != B_OK)
1711 		return B_NO_MEMORY;
1712 
1713 	transaction_hash = new(std::nothrow) TransactionTable();
1714 	if (transaction_hash == NULL || transaction_hash->Init(16) != B_OK)
1715 		return B_NO_MEMORY;
1716 
1717 	return register_low_resource_handler(&_LowMemoryHandler, this,
1718 		B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1719 			| B_KERNEL_RESOURCE_ADDRESS_SPACE, 0);
1720 }
1721 
1722 
1723 void
1724 block_cache::Free(void* buffer)
1725 {
1726 	if (buffer != NULL)
1727 		object_cache_free(buffer_cache, buffer, 0);
1728 }
1729 
1730 
1731 void*
1732 block_cache::Allocate()
1733 {
1734 	void* block = object_cache_alloc(buffer_cache, 0);
1735 	if (block != NULL)
1736 		return block;
1737 
1738 	// recycle existing before allocating a new one
1739 	RemoveUnusedBlocks(100);
1740 
1741 	return object_cache_alloc(buffer_cache, 0);
1742 }
1743 
1744 
1745 void
1746 block_cache::FreeBlock(cached_block* block)
1747 {
1748 	Free(block->current_data);
1749 
1750 	if (block->original_data != NULL || block->parent_data != NULL) {
1751 		panic("block_cache::FreeBlock(): %" B_PRIdOFF ", original %p, parent %p\n",
1752 			block->block_number, block->original_data, block->parent_data);
1753 	}
1754 
1755 #if BLOCK_CACHE_DEBUG_CHANGED
1756 	Free(block->compare);
1757 #endif
1758 
1759 	object_cache_free(sBlockCache, block, 0);
1760 }
1761 
1762 
1763 /*! Allocates a new block for \a blockNumber, ready for use */
1764 cached_block*
1765 block_cache::NewBlock(off_t blockNumber)
1766 {
1767 	cached_block* block = NULL;
1768 
1769 	if (low_resource_state(B_KERNEL_RESOURCE_PAGES | B_KERNEL_RESOURCE_MEMORY
1770 			| B_KERNEL_RESOURCE_ADDRESS_SPACE) != B_NO_LOW_RESOURCE) {
1771 		// recycle existing instead of allocating a new one
1772 		block = _GetUnusedBlock();
1773 	}
1774 	if (block == NULL) {
1775 		block = (cached_block*)object_cache_alloc(sBlockCache, 0);
1776 		if (block != NULL) {
1777 			block->current_data = Allocate();
1778 			if (block->current_data == NULL) {
1779 				object_cache_free(sBlockCache, block, 0);
1780 				return NULL;
1781 			}
1782 		} else {
1783 			TB(Error(this, blockNumber, "allocation failed"));
1784 			TRACE_ALWAYS("block allocation failed, unused list is %sempty.\n",
1785 				unused_blocks.IsEmpty() ? "" : "not ");
1786 
1787 			// allocation failed, try to reuse an unused block
1788 			block = _GetUnusedBlock();
1789 			if (block == NULL) {
1790 				TB(Error(this, blockNumber, "get unused failed"));
1791 				FATAL(("could not allocate block!\n"));
1792 				return NULL;
1793 			}
1794 		}
1795 	}
1796 
1797 	block->block_number = blockNumber;
1798 	block->ref_count = 0;
1799 	block->last_accessed = 0;
1800 	block->transaction_next = NULL;
1801 	block->transaction = block->previous_transaction = NULL;
1802 	block->original_data = NULL;
1803 	block->parent_data = NULL;
1804 	block->busy_reading = false;
1805 	block->busy_writing = false;
1806 	block->is_writing = false;
1807 	block->is_dirty = false;
1808 	block->unused = false;
1809 	block->discard = false;
1810 	block->busy_reading_waiters = false;
1811 	block->busy_writing_waiters = false;
1812 #if BLOCK_CACHE_DEBUG_CHANGED
1813 	block->compare = NULL;
1814 #endif
1815 
1816 	return block;
1817 }
1818 
1819 
1820 void
1821 block_cache::FreeBlockParentData(cached_block* block)
1822 {
1823 	ASSERT(block->parent_data != NULL);
1824 	if (block->parent_data != block->current_data)
1825 		Free(block->parent_data);
1826 	block->parent_data = NULL;
1827 }
1828 
1829 
1830 void
1831 block_cache::RemoveUnusedBlocks(int32 count, int32 minSecondsOld)
1832 {
1833 	TRACE(("block_cache: remove up to %" B_PRId32 " unused blocks\n", count));
1834 
1835 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1836 			cached_block* block = iterator.Next();) {
1837 		if (minSecondsOld >= block->LastAccess()) {
1838 			// The list is sorted by last access
1839 			break;
1840 		}
1841 		if (block->busy_reading || block->busy_writing)
1842 			continue;
1843 
1844 		TB(Flush(this, block));
1845 		TRACE(("  remove block %" B_PRIdOFF ", last accessed %" B_PRId32 "\n",
1846 			block->block_number, block->last_accessed));
1847 
1848 		// this can only happen if no transactions are used
1849 		if (block->is_dirty && !block->discard) {
1850 			if (block->busy_writing)
1851 				continue;
1852 
1853 			BlockWriter::WriteBlock(this, block);
1854 		}
1855 
1856 		// remove block from lists
1857 		iterator.Remove();
1858 		unused_block_count--;
1859 		RemoveBlock(block);
1860 
1861 		if (--count <= 0)
1862 			break;
1863 	}
1864 }
1865 
1866 
1867 void
1868 block_cache::RemoveBlock(cached_block* block)
1869 {
1870 	hash->Remove(block);
1871 	FreeBlock(block);
1872 }
1873 
1874 
1875 /*!	Discards the block from a transaction (this method must not be called
1876 	for blocks not part of a transaction).
1877 */
1878 void
1879 block_cache::DiscardBlock(cached_block* block)
1880 {
1881 	ASSERT(block->discard);
1882 	ASSERT(block->previous_transaction == NULL);
1883 
1884 	if (block->parent_data != NULL)
1885 		FreeBlockParentData(block);
1886 
1887 	if (block->original_data != NULL) {
1888 		Free(block->original_data);
1889 		block->original_data = NULL;
1890 	}
1891 
1892 	RemoveBlock(block);
1893 }
1894 
1895 
1896 void
1897 block_cache::_LowMemoryHandler(void* data, uint32 resources, int32 level)
1898 {
1899 	TRACE(("block_cache: low memory handler called with level %" B_PRId32 "\n", level));
1900 
1901 	// free some blocks according to the low memory state
1902 	// (if there is enough memory left, we don't free any)
1903 
1904 	block_cache* cache = (block_cache*)data;
1905 	if (cache->unused_block_count <= 1)
1906 		return;
1907 
1908 	int32 free = 0;
1909 	int32 secondsOld = 0;
1910 	switch (level) {
1911 		case B_NO_LOW_RESOURCE:
1912 			return;
1913 		case B_LOW_RESOURCE_NOTE:
1914 			free = cache->unused_block_count / 4;
1915 			secondsOld = 120;
1916 			break;
1917 		case B_LOW_RESOURCE_WARNING:
1918 			free = cache->unused_block_count / 2;
1919 			secondsOld = 10;
1920 			break;
1921 		case B_LOW_RESOURCE_CRITICAL:
1922 			free = cache->unused_block_count - 1;
1923 			secondsOld = 0;
1924 			break;
1925 	}
1926 
1927 	MutexLocker locker(&cache->lock);
1928 
1929 	if (!locker.IsLocked()) {
1930 		// If our block_cache were deleted, it could be that we had
1931 		// been called before that deletion went through, therefore,
1932 		// acquiring its lock might fail.
1933 		return;
1934 	}
1935 
1936 #ifdef TRACE_BLOCK_CACHE
1937 	uint32 oldUnused = cache->unused_block_count;
1938 #endif
1939 
1940 	cache->RemoveUnusedBlocks(free, secondsOld);
1941 
1942 	TRACE(("block_cache::_LowMemoryHandler(): %p: unused: %" B_PRIu32 " -> %" B_PRIu32 "\n",
1943 		cache, oldUnused, cache->unused_block_count));
1944 }
1945 
1946 
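/*!	Detaches a block from the unused list and the hash table so that its
	buffer can be reused. A dirty block is written back before it is handed
	out. Returns \c NULL if no unused block is available.
	The cache must be locked when calling this method.
*/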
1947 cached_block*
1948 block_cache::_GetUnusedBlock()
1949 {
1950 	TRACE(("block_cache: get unused block\n"));
1951 
1952 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
1953 			cached_block* block = iterator.Next();) {
1954 		TB(Flush(this, block, true));
1955 		// this can only happen if no transactions are used
1956 		if (block->is_dirty && !block->busy_writing && !block->discard)
1957 			BlockWriter::WriteBlock(this, block);
1958 
1959 		// remove block from lists
1960 		iterator.Remove();
1961 		unused_block_count--;
1962 		hash->Remove(block);
1963 
1964 		ASSERT(block->original_data == NULL && block->parent_data == NULL);
1965 		block->unused = false;
1966 
1967 		// TODO: see if compare data is handled correctly here!
1968 #if BLOCK_CACHE_DEBUG_CHANGED
1969 		if (block->compare != NULL)
1970 			Free(block->compare);
1971 #endif
1972 		return block;
1973 	}
1974 
1975 	return NULL;
1976 }
1977 
1978 
1979 //	#pragma mark - private block functions
1980 
1981 
1982 /*!	Cache must be locked.
1983 */
1984 static void
1985 mark_block_busy_reading(block_cache* cache, cached_block* block)
1986 {
1987 	block->busy_reading = true;
1988 	cache->busy_reading_count++;
1989 }
1990 
1991 
1992 /*!	Cache must be locked.
1993 */
1994 static void
1995 mark_block_unbusy_reading(block_cache* cache, cached_block* block)
1996 {
1997 	block->busy_reading = false;
1998 	cache->busy_reading_count--;
1999 
2000 	if ((cache->busy_reading_waiters && cache->busy_reading_count == 0)
2001 		|| block->busy_reading_waiters) {
2002 		cache->busy_reading_waiters = false;
2003 		block->busy_reading_waiters = false;
2004 		cache->busy_reading_condition.NotifyAll();
2005 	}
2006 }
2007 
2008 
2009 /*!	Cache must be locked.
2010 */
2011 static void
2012 wait_for_busy_reading_block(block_cache* cache, cached_block* block)
2013 {
2014 	while (block->busy_reading) {
2015 		// wait for at least the specified block to be read in
2016 		ConditionVariableEntry entry;
2017 		cache->busy_reading_condition.Add(&entry);
2018 		block->busy_reading_waiters = true;
2019 
2020 		mutex_unlock(&cache->lock);
2021 
2022 		entry.Wait();
2023 
2024 		mutex_lock(&cache->lock);
2025 	}
2026 }
2027 
2028 
2029 /*!	Cache must be locked.
2030 */
2031 static void
2032 wait_for_busy_reading_blocks(block_cache* cache)
2033 {
2034 	while (cache->busy_reading_count != 0) {
2035 		// wait for all blocks to be read in
2036 		ConditionVariableEntry entry;
2037 		cache->busy_reading_condition.Add(&entry);
2038 		cache->busy_reading_waiters = true;
2039 
2040 		mutex_unlock(&cache->lock);
2041 
2042 		entry.Wait();
2043 
2044 		mutex_lock(&cache->lock);
2045 	}
2046 }
2047 
2048 
2049 /*!	Cache must be locked.
2050 */
2051 static void
2052 wait_for_busy_writing_block(block_cache* cache, cached_block* block)
2053 {
2054 	while (block->busy_writing) {
2055 		// wait for the specified block to be written back
2056 		ConditionVariableEntry entry;
2057 		cache->busy_writing_condition.Add(&entry);
2058 		block->busy_writing_waiters = true;
2059 
2060 		mutex_unlock(&cache->lock);
2061 
2062 		entry.Wait();
2063 
2064 		mutex_lock(&cache->lock);
2065 	}
2066 }
2067 
2068 
2069 /*!	Cache must be locked.
2070 */
2071 static void
2072 wait_for_busy_writing_blocks(block_cache* cache)
2073 {
2074 	while (cache->busy_writing_count != 0) {
2075 		// wait for all blocks to be written back
2076 		ConditionVariableEntry entry;
2077 		cache->busy_writing_condition.Add(&entry);
2078 		cache->busy_writing_waiters = true;
2079 
2080 		mutex_unlock(&cache->lock);
2081 
2082 		entry.Wait();
2083 
2084 		mutex_lock(&cache->lock);
2085 	}
2086 }
2087 
2088 
2089 /*!	Removes a reference from the specified \a block. If this was the last
2090 	reference, and the block is not part of any transaction, it is moved into
2091 	the list of unused blocks, or removed from the cache right away if it has
2092 	been marked for discarding.
2093 */
2094 static void
2095 put_cached_block(block_cache* cache, cached_block* block)
2096 {
2097 #if BLOCK_CACHE_DEBUG_CHANGED
2098 	if (!block->is_dirty && block->compare != NULL
2099 		&& memcmp(block->current_data, block->compare, cache->block_size)) {
2100 		TRACE_ALWAYS("new block:\n");
2101 		dump_block((const char*)block->current_data, 256, "  ");
2102 		TRACE_ALWAYS("unchanged block:\n");
2103 		dump_block((const char*)block->compare, 256, "  ");
2104 		BlockWriter::WriteBlock(cache, block);
2105 		panic("block_cache: supposed to be clean block was changed!\n");
2106 
2107 		cache->Free(block->compare);
2108 		block->compare = NULL;
2109 	}
2110 #endif
2111 	TB(Put(cache, block));
2112 
2113 	if (block->ref_count < 1) {
2114 		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
2115 		return;
2116 	}
2117 
2118 	if (--block->ref_count == 0
2119 		&& block->transaction == NULL && block->previous_transaction == NULL) {
2120 		// This block is not used anymore, and not part of any transaction
2121 		block->is_writing = false;
2122 
2123 		if (block->discard) {
2124 			cache->RemoveBlock(block);
2125 		} else {
2126 			// put this block in the list of unused blocks
2127 			ASSERT(!block->unused);
2128 			block->unused = true;
2129 
2130 			ASSERT(block->original_data == NULL && block->parent_data == NULL);
2131 			cache->unused_blocks.Add(block);
2132 			cache->unused_block_count++;
2133 		}
2134 	}
2135 }
2136 
2137 
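/*!	Convenience variant of the above: looks up the block \a blockNumber in
	the hash table and releases one reference to it.
	The cache must be locked when calling this function.
*/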
2138 static void
2139 put_cached_block(block_cache* cache, off_t blockNumber)
2140 {
2141 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
2142 		panic("put_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
2143 			blockNumber, cache->max_blocks - 1);
2144 	}
2145 
2146 	cached_block* block = cache->hash->Lookup(blockNumber);
2147 	if (block != NULL)
2148 		put_cached_block(cache, block);
2149 	else {
2150 		TB(Error(cache, blockNumber, "put unknown"));
2151 	}
2152 }
2153 
2154 
2155 /*!	Retrieves the block \a blockNumber from the hash table, if it's already
2156 	there, or reads it from the disk.
2157 	You need to have the cache locked when calling this function.
2158 
2159 	\param _allocated tells you whether or not a new block has been allocated
2160 		to satisfy your request.
2161 	\param readBlock if \c false, the block will not be read in case it was
2162 		not already in the cache. The block you retrieve may contain random
2163 		data. If \c true, the cache will be temporarily unlocked while the
2164 		block is read in.
2165 */
2166 static status_t
2167 get_cached_block(block_cache* cache, off_t blockNumber, bool* _allocated,
2168 	bool readBlock, cached_block** _block)
2169 {
2170 	ASSERT_LOCKED_MUTEX(&cache->lock);
2171 
2172 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
2173 		panic("get_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
2174 			blockNumber, cache->max_blocks - 1);
2175 		return B_BAD_VALUE;
2176 	}
2177 
2178 retry:
2179 	cached_block* block = cache->hash->Lookup(blockNumber);
2180 	*_allocated = false;
2181 
2182 	if (block == NULL) {
2183 		// put block into cache
2184 		block = cache->NewBlock(blockNumber);
2185 		if (block == NULL)
2186 			return B_NO_MEMORY;
2187 
2188 		cache->hash->Insert(block);
2189 		*_allocated = true;
2190 	} else if (block->busy_reading) {
2191 		// The block is currently busy_reading - wait and try again later
2192 		wait_for_busy_reading_block(cache, block);
2193 		goto retry;
2194 	}
2195 
2196 	if (block->unused) {
2197 		//TRACE(("remove block %" B_PRIdOFF " from unused\n", blockNumber));
2198 		block->unused = false;
2199 		cache->unused_blocks.Remove(block);
2200 		cache->unused_block_count--;
2201 	}
2202 
2203 	if (*_allocated && readBlock) {
2204 		// read block into cache
2205 		int32 blockSize = cache->block_size;
2206 
2207 		mark_block_busy_reading(cache, block);
2208 		mutex_unlock(&cache->lock);
2209 
2210 		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
2211 			block->current_data, blockSize);
2212 
2213 		mutex_lock(&cache->lock);
2214 		if (bytesRead < blockSize) {
2215 			cache->RemoveBlock(block);
2216 			TB(Error(cache, blockNumber, "read failed", bytesRead));
2217 
2218 			TRACE_ALWAYS("could not read block %" B_PRIdOFF ": bytesRead: %zd,"
2219 				" error: %s\n", blockNumber, bytesRead, strerror(errno));
2220 			return errno;
2221 		}
2222 		TB(Read(cache, block));
2223 
2224 		mark_block_unbusy_reading(cache, block);
2225 	}
2226 
2227 	block->ref_count++;
2228 	block->last_accessed = system_time() / 1000000L;
2229 
2230 	*_block = block;
2231 	return B_OK;
2232 }
2233 
2234 
2235 /*!	Returns the writable block data for the requested blockNumber.
2236 	If \a cleared is true, the block is not read from disk; an empty block
2237 	is returned.
2238 
2239 	This is the only method to insert a block into a transaction. It makes
2240 	sure that the previous block contents are preserved in that case.
2241 */
2242 static status_t
2243 get_writable_cached_block(block_cache* cache, off_t blockNumber,
2244 	int32 transactionID, bool cleared, void** _block)
2245 {
2246 	TRACE(("get_writable_cached_block(blockNumber = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
2247 		blockNumber, transactionID));
2248 
2249 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
2250 		panic("get_writable_cached_block: invalid block number %" B_PRIdOFF " (max %" B_PRIdOFF ")",
2251 			blockNumber, cache->max_blocks - 1);
2252 		return B_BAD_VALUE;
2253 	}
2254 
2255 	bool allocated;
2256 	cached_block* block;
2257 	status_t status = get_cached_block(cache, blockNumber, &allocated,
2258 		!cleared, &block);
2259 	if (status != B_OK)
2260 		return status;
2261 
2262 	if (block->busy_writing)
2263 		wait_for_busy_writing_block(cache, block);
2264 
2265 	block->discard = false;
2266 
2267 	// if there is no transaction support, we just return the current block
2268 	if (transactionID == -1) {
2269 		if (cleared) {
2270 			mark_block_busy_reading(cache, block);
2271 			mutex_unlock(&cache->lock);
2272 
2273 			memset(block->current_data, 0, cache->block_size);
2274 
2275 			mutex_lock(&cache->lock);
2276 			mark_block_unbusy_reading(cache, block);
2277 		}
2278 
2279 		block->is_writing = true;
2280 
2281 		if (!block->is_dirty) {
2282 			cache->num_dirty_blocks++;
2283 			block->is_dirty = true;
2284 				// mark the block as dirty
2285 		}
2286 
2287 		TB(Get(cache, block));
2288 		*_block = block->current_data;
2289 		return B_OK;
2290 	}
2291 
2292 	cache_transaction* transaction = block->transaction;
2293 
2294 	if (transaction != NULL && transaction->id != transactionID) {
2295 		// TODO: we have to wait here until the other transaction is done.
2296 		//	Maybe we should even panic, since we can't prevent any deadlocks.
2297 		panic("get_writable_cached_block(): asked to get busy writable block "
2298 			"(transaction %" B_PRId32 ")\n", block->transaction->id);
2299 		put_cached_block(cache, block);
2300 		return B_BAD_VALUE;
2301 	}
2302 	if (transaction == NULL && transactionID != -1) {
2303 		// get new transaction
2304 		transaction = lookup_transaction(cache, transactionID);
2305 		if (transaction == NULL) {
2306 			panic("get_writable_cached_block(): invalid transaction %" B_PRId32 "!\n",
2307 				transactionID);
2308 			put_cached_block(cache, block);
2309 			return B_BAD_VALUE;
2310 		}
2311 		if (!transaction->open) {
2312 			panic("get_writable_cached_block(): transaction already done!\n");
2313 			put_cached_block(cache, block);
2314 			return B_BAD_VALUE;
2315 		}
2316 
2317 		block->transaction = transaction;
2318 
2319 		// attach the block to the transaction block list
2320 		block->transaction_next = transaction->first_block;
2321 		transaction->first_block = block;
2322 		transaction->num_blocks++;
2323 	}
2324 	if (transaction != NULL)
2325 		transaction->last_used = system_time();
2326 
2327 	bool wasUnchanged = block->original_data == NULL
2328 		|| block->previous_transaction != NULL;
2329 
2330 	if (!(allocated && cleared) && block->original_data == NULL) {
2331 		// we already have data, so we need to preserve it
2332 		block->original_data = cache->Allocate();
2333 		if (block->original_data == NULL) {
2334 			TB(Error(cache, blockNumber, "allocate original failed"));
2335 			FATAL(("could not allocate original_data\n"));
2336 			put_cached_block(cache, block);
2337 			return B_NO_MEMORY;
2338 		}
2339 
2340 		mark_block_busy_reading(cache, block);
2341 		mutex_unlock(&cache->lock);
2342 
2343 		memcpy(block->original_data, block->current_data, cache->block_size);
2344 
2345 		mutex_lock(&cache->lock);
2346 		mark_block_unbusy_reading(cache, block);
2347 	}
2348 	if (block->parent_data == block->current_data) {
2349 		// remember any previous contents for the parent transaction
2350 		block->parent_data = cache->Allocate();
2351 		if (block->parent_data == NULL) {
2352 			// TODO: maybe we should just continue the current transaction in
2353 			// this case...
2354 			TB(Error(cache, blockNumber, "allocate parent failed"));
2355 			FATAL(("could not allocate parent\n"));
2356 			put_cached_block(cache, block);
2357 			return B_NO_MEMORY;
2358 		}
2359 
2360 		mark_block_busy_reading(cache, block);
2361 		mutex_unlock(&cache->lock);
2362 
2363 		memcpy(block->parent_data, block->current_data, cache->block_size);
2364 
2365 		mutex_lock(&cache->lock);
2366 		mark_block_unbusy_reading(cache, block);
2367 
2368 		transaction->sub_num_blocks++;
2369 	} else if (transaction != NULL && transaction->has_sub_transaction
2370 		&& block->parent_data == NULL && wasUnchanged)
2371 		transaction->sub_num_blocks++;
2372 
2373 	if (cleared) {
2374 		mark_block_busy_reading(cache, block);
2375 		mutex_unlock(&cache->lock);
2376 
2377 		memset(block->current_data, 0, cache->block_size);
2378 
2379 		mutex_lock(&cache->lock);
2380 		mark_block_unbusy_reading(cache, block);
2381 	}
2382 
2383 	block->is_dirty = true;
2384 	TB(Get(cache, block));
2385 	TB2(BlockData(cache, block, "get writable"));
2386 
2387 	*_block = block->current_data;
2388 	return B_OK;
2389 }
2390 
2391 
2392 #if DEBUG_BLOCK_CACHE
2393 
2394 
2395 static void
2396 dump_block(cached_block* block)
2397 {
2398 	kprintf("%08lx %9" B_PRIdOFF " %08lx %08lx %08lx %5" B_PRId32 " %6" B_PRId32
2399 		" %c%c%c%c%c%c %08lx %08lx\n",
2400 		(addr_t)block, block->block_number,
2401 		(addr_t)block->current_data, (addr_t)block->original_data,
2402 		(addr_t)block->parent_data, block->ref_count, block->LastAccess(),
2403 		block->busy_reading ? 'r' : '-', block->busy_writing ? 'w' : '-',
2404 		block->is_writing ? 'W' : '-', block->is_dirty ? 'D' : '-',
2405 		block->unused ? 'U' : '-', block->discard ? 'D' : '-',
2406 		(addr_t)block->transaction,
2407 		(addr_t)block->previous_transaction);
2408 }
2409 
2410 
2411 static void
2412 dump_block_long(cached_block* block)
2413 {
2414 	kprintf("BLOCK %p\n", block);
2415 	kprintf(" current data:  %p\n", block->current_data);
2416 	kprintf(" original data: %p\n", block->original_data);
2417 	kprintf(" parent data:   %p\n", block->parent_data);
2418 #if BLOCK_CACHE_DEBUG_CHANGED
2419 	kprintf(" compare data:  %p\n", block->compare);
2420 #endif
2421 	kprintf(" ref_count:     %" B_PRId32 "\n", block->ref_count);
2422 	kprintf(" accessed:      %" B_PRId32 "\n", block->LastAccess());
2423 	kprintf(" flags:        ");
2424 	if (block->busy_reading)
2425 		kprintf(" busy_reading");
2426 	if (block->busy_writing)
2427 		kprintf(" busy_writing");
2428 	if (block->is_writing)
2429 		kprintf(" is-writing");
2430 	if (block->is_dirty)
2431 		kprintf(" is-dirty");
2432 	if (block->unused)
2433 		kprintf(" unused");
2434 	if (block->discard)
2435 		kprintf(" discard");
2436 	kprintf("\n");
2437 	if (block->transaction != NULL) {
2438 		kprintf(" transaction:   %p (%" B_PRId32 ")\n", block->transaction,
2439 			block->transaction->id);
2440 		if (block->transaction_next != NULL) {
2441 			kprintf(" next in transaction: %" B_PRIdOFF "\n",
2442 				block->transaction_next->block_number);
2443 		}
2444 	}
2445 	if (block->previous_transaction != NULL) {
2446 		kprintf(" previous transaction: %p (%" B_PRId32 ")\n",
2447 			block->previous_transaction,
2448 			block->previous_transaction->id);
2449 	}
2450 
2451 	set_debug_variable("_current", (addr_t)block->current_data);
2452 	set_debug_variable("_original", (addr_t)block->original_data);
2453 	set_debug_variable("_parent", (addr_t)block->parent_data);
2454 }
2455 
2456 
2457 static int
2458 dump_cached_block(int argc, char** argv)
2459 {
2460 	if (argc != 2) {
2461 		kprintf("usage: %s <block-address>\n", argv[0]);
2462 		return 0;
2463 	}
2464 
2465 	dump_block_long((struct cached_block*)(addr_t)parse_expression(argv[1]));
2466 	return 0;
2467 }
2468 
2469 
2470 static int
2471 dump_cache(int argc, char** argv)
2472 {
2473 	bool showTransactions = false;
2474 	bool showBlocks = false;
2475 	int32 i = 1;
2476 	while (argv[i] != NULL && argv[i][0] == '-') {
2477 		for (char* arg = &argv[i][1]; arg[0]; arg++) {
2478 			switch (arg[0]) {
2479 				case 'b':
2480 					showBlocks = true;
2481 					break;
2482 				case 't':
2483 					showTransactions = true;
2484 					break;
2485 				default:
2486 					print_debugger_command_usage(argv[0]);
2487 					return 0;
2488 			}
2489 		}
2490 		i++;
2491 	}
2492 
2493 	if (i >= argc) {
2494 		print_debugger_command_usage(argv[0]);
2495 		return 0;
2496 	}
2497 
2498 	block_cache* cache = (struct block_cache*)(addr_t)parse_expression(argv[i]);
2499 	if (cache == NULL) {
2500 		kprintf("invalid cache address\n");
2501 		return 0;
2502 	}
2503 
2504 	off_t blockNumber = -1;
2505 	if (i + 1 < argc) {
2506 		blockNumber = parse_expression(argv[i + 1]);
2507 		cached_block* block = cache->hash->Lookup(blockNumber);
2508 		if (block != NULL)
2509 			dump_block_long(block);
2510 		else
2511 			kprintf("block %" B_PRIdOFF " not found\n", blockNumber);
2512 		return 0;
2513 	}
2514 
2515 	kprintf("BLOCK CACHE: %p\n", cache);
2516 
2517 	kprintf(" fd:           %d\n", cache->fd);
2518 	kprintf(" max_blocks:   %" B_PRIdOFF "\n", cache->max_blocks);
2519 	kprintf(" block_size:   %zu\n", cache->block_size);
2520 	kprintf(" next_transaction_id: %" B_PRId32 "\n", cache->next_transaction_id);
2521 	kprintf(" buffer_cache: %p\n", cache->buffer_cache);
2522 	kprintf(" busy_reading: %" B_PRIu32 ", %s waiters\n", cache->busy_reading_count,
2523 		cache->busy_reading_waiters ? "has" : "no");
2524 	kprintf(" busy_writing: %" B_PRIu32 ", %s waiters\n", cache->busy_writing_count,
2525 		cache->busy_writing_waiters ? "has" : "no");
2526 
2527 	if (!cache->pending_notifications.IsEmpty()) {
2528 		kprintf(" pending notifications:\n");
2529 
2530 		NotificationList::Iterator iterator
2531 			= cache->pending_notifications.GetIterator();
2532 		while (iterator.HasNext()) {
2533 			cache_notification* notification = iterator.Next();
2534 
2535 			kprintf("  %p %5" B_PRIx32 " %p - %p\n", notification,
2536 				notification->events_pending, notification->hook,
2537 				notification->data);
2538 		}
2539 	}
2540 
2541 	if (showTransactions) {
2542 		kprintf(" transactions:\n");
2543 		kprintf("address       id state  blocks  main   sub\n");
2544 
2545 		TransactionTable::Iterator iterator(cache->transaction_hash);
2546 
2547 		while (iterator.HasNext()) {
2548 			cache_transaction* transaction = iterator.Next();
2549 			kprintf("%p %5" B_PRId32 " %-7s %5" B_PRId32 " %5" B_PRId32 " %5"
2550 				B_PRId32 "\n", transaction, transaction->id,
2551 				transaction->open ? "open" : "closed",
2552 				transaction->num_blocks, transaction->main_num_blocks,
2553 				transaction->sub_num_blocks);
2554 		}
2555 	}
2556 
2557 	if (showBlocks) {
2558 		kprintf(" blocks:\n");
2559 		kprintf("address  block no. current  original parent    refs access "
2560 			"flags transact prev. trans\n");
2561 	}
2562 
2563 	uint32 referenced = 0;
2564 	uint32 count = 0;
2565 	uint32 dirty = 0;
2566 	uint32 discarded = 0;
2567 	BlockTable::Iterator iterator(cache->hash);
2568 	while (iterator.HasNext()) {
2569 		cached_block* block = iterator.Next();
2570 		if (showBlocks)
2571 			dump_block(block);
2572 
2573 		if (block->is_dirty)
2574 			dirty++;
2575 		if (block->discard)
2576 			discarded++;
2577 		if (block->ref_count)
2578 			referenced++;
2579 		count++;
2580 	}
2581 
2582 	kprintf(" %" B_PRIu32 " blocks total, %" B_PRIu32 " dirty, %" B_PRIu32
2583 		" discarded, %" B_PRIu32 " referenced, %" B_PRIu32 " busy, %" B_PRIu32
2584 		" in unused.\n",
2585 		count, dirty, discarded, referenced, cache->busy_reading_count,
2586 		cache->unused_block_count);
2587 	return 0;
2588 }
2589 
2590 
2591 static int
2592 dump_transaction(int argc, char** argv)
2593 {
2594 	bool showBlocks = false;
2595 	int i = 1;
2596 	if (argc > 1 && !strcmp(argv[1], "-b")) {
2597 		showBlocks = true;
2598 		i++;
2599 	}
2600 
2601 	if (argc - i < 1 || argc - i > 2) {
2602 		print_debugger_command_usage(argv[0]);
2603 		return 0;
2604 	}
2605 
2606 	cache_transaction* transaction = NULL;
2607 
2608 	if (argc - i == 1) {
2609 		transaction = (cache_transaction*)(addr_t)parse_expression(argv[i]);
2610 	} else {
2611 		block_cache* cache = (block_cache*)(addr_t)parse_expression(argv[i]);
2612 		int32 id = parse_expression(argv[i + 1]);
2613 		transaction = lookup_transaction(cache, id);
2614 		if (transaction == NULL) {
2615 			kprintf("No transaction with ID %" B_PRId32 " found.\n", id);
2616 			return 0;
2617 		}
2618 	}
2619 
2620 	kprintf("TRANSACTION %p\n", transaction);
2621 
2622 	kprintf(" id:             %" B_PRId32 "\n", transaction->id);
2623 	kprintf(" num block:      %" B_PRId32 "\n", transaction->num_blocks);
2624 	kprintf(" main num block: %" B_PRId32 "\n", transaction->main_num_blocks);
2625 	kprintf(" sub num block:  %" B_PRId32 "\n", transaction->sub_num_blocks);
2626 	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
2627 	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
2628 	kprintf(" idle:           %" B_PRId64 " secs\n",
2629 		(system_time() - transaction->last_used) / 1000000);
2630 
2631 	kprintf(" listeners:\n");
2632 
2633 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
2634 	while (iterator.HasNext()) {
2635 		cache_listener* listener = iterator.Next();
2636 
2637 		kprintf("  %p %5" B_PRIx32 " %p - %p\n", listener, listener->events_pending,
2638 			listener->hook, listener->data);
2639 	}
2640 
2641 	if (!showBlocks)
2642 		return 0;
2643 
2644 	kprintf(" blocks:\n");
2645 	kprintf("address  block no. current  original parent    refs access "
2646 		"flags transact prev. trans\n");
2647 
2648 	cached_block* block = transaction->first_block;
2649 	while (block != NULL) {
2650 		dump_block(block);
2651 		block = block->transaction_next;
2652 	}
2653 
2654 	kprintf("--\n");
2655 
2656 	block_list::Iterator blockIterator = transaction->blocks.GetIterator();
2657 	while (blockIterator.HasNext()) {
2658 		block = blockIterator.Next();
2659 		dump_block(block);
2660 	}
2661 
2662 	return 0;
2663 }
2664 
2665 
2666 static int
2667 dump_caches(int argc, char** argv)
2668 {
2669 	kprintf("Block caches:\n");
2670 	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
2671 	while (i.HasNext()) {
2672 		block_cache* cache = i.Next();
2673 		if (cache == (block_cache*)&sMarkCache)
2674 			continue;
2675 
2676 		kprintf("  %p\n", cache);
2677 	}
2678 
2679 	return 0;
2680 }
2681 
2682 
2683 #if BLOCK_CACHE_BLOCK_TRACING >= 2
2684 static int
2685 dump_block_data(int argc, char** argv)
2686 {
2687 	using namespace BlockTracing;
2688 
2689 	// Determine which blocks to show
2690 
2691 	bool printStackTrace = true;
2692 	uint32 which = 0;
2693 	int32 i = 1;
2694 	while (i < argc && argv[i][0] == '-') {
2695 		char* arg = &argv[i][1];
2696 		while (arg[0]) {
2697 			switch (arg[0]) {
2698 				case 'c':
2699 					which |= BlockData::kCurrent;
2700 					break;
2701 				case 'p':
2702 					which |= BlockData::kParent;
2703 					break;
2704 				case 'o':
2705 					which |= BlockData::kOriginal;
2706 					break;
2707 
2708 				default:
2709 					kprintf("invalid block specifier (only o/c/p are "
2710 						"allowed).\n");
2711 					return 0;
2712 			}
2713 			arg++;
2714 		}
2715 
2716 		i++;
2717 	}
2718 	if (which == 0)
2719 		which = BlockData::kCurrent | BlockData::kParent | BlockData::kOriginal;
2720 
2721 	if (i == argc) {
2722 		print_debugger_command_usage(argv[0]);
2723 		return 0;
2724 	}
2725 
2726 	// Get the range of blocks to print
2727 
2728 	int64 from = parse_expression(argv[i]);
2729 	int64 to = from;
2730 	if (argc > i + 1)
2731 		to = parse_expression(argv[i + 1]);
2732 	if (to < from)
2733 		to = from;
2734 
2735 	uint32 offset = 0;
2736 	uint32 size = LONG_MAX;
2737 	if (argc > i + 2)
2738 		offset = parse_expression(argv[i + 2]);
2739 	if (argc > i + 3)
2740 		size = parse_expression(argv[i + 3]);
2741 
2742 	TraceEntryIterator iterator;
2743 	iterator.MoveTo(from - 1);
2744 
2745 	static char sBuffer[1024];
2746 	LazyTraceOutput out(sBuffer, sizeof(sBuffer), TRACE_OUTPUT_TEAM_ID);
2747 
2748 	while (TraceEntry* entry = iterator.Next()) {
2749 		int32 index = iterator.Index();
2750 		if (index > to)
2751 			break;
2752 
2753 		Action* action = dynamic_cast<Action*>(entry);
2754 		if (action != NULL) {
2755 			out.Clear();
2756 			out.DumpEntry(action);
2757 			continue;
2758 		}
2759 
2760 		BlockData* blockData = dynamic_cast<BlockData*>(entry);
2761 		if (blockData == NULL)
2762 			continue;
2763 
2764 		out.Clear();
2765 
2766 		const char* dump = out.DumpEntry(entry);
2767 		int length = strlen(dump);
2768 		if (length > 0 && dump[length - 1] == '\n')
2769 			length--;
2770 
2771 		kprintf("%5" B_PRId32 ". %.*s\n", index, length, dump);
2772 
2773 		if (printStackTrace) {
2774 			out.Clear();
2775 			entry->DumpStackTrace(out);
2776 			if (out.Size() > 0)
2777 				kputs(out.Buffer());
2778 		}
2779 
2780 		blockData->DumpBlocks(which, offset, size);
2781 	}
2782 
2783 	return 0;
2784 }
2785 #endif	// BLOCK_CACHE_BLOCK_TRACING >= 2
2786 
2787 
2788 #endif	// DEBUG_BLOCK_CACHE
2789 
2790 
2791 /*!	Traverses through the block_cache list, and returns one cache after the
2792 	other. The cache returned is automatically locked when you get it, and
2793 	unlocked with the next call to this function. Ignores caches that are in
2794 	deletion state.
2795 	Returns \c NULL when the end of the list is reached.
2796 */
2797 static block_cache*
2798 get_next_locked_block_cache(block_cache* last)
2799 {
2800 	MutexLocker _(sCachesLock);
2801 
2802 	block_cache* cache;
2803 	if (last != NULL) {
2804 		mutex_unlock(&last->lock);
2805 
2806 		cache = sCaches.GetNext((block_cache*)&sMarkCache);
2807 		sCaches.Remove((block_cache*)&sMarkCache);
2808 	} else
2809 		cache = sCaches.Head();
2810 
2811 	if (cache != NULL) {
2812 		mutex_lock(&cache->lock);
2813 		sCaches.InsertBefore(sCaches.GetNext(cache), (block_cache*)&sMarkCache);
2814 	}
2815 
2816 	return cache;
2817 }
2818 
2819 
2820 /*!	Background thread that continuously checks for pending notifications of
2821 	all caches.
2822 	Every two seconds, it will also write back up to 64 blocks per cache.
2823 */
2824 static status_t
2825 block_notifier_and_writer(void* /*data*/)
2826 {
2827 	const bigtime_t kDefaultTimeout = 2000000LL;
2828 	bigtime_t timeout = kDefaultTimeout;
2829 
2830 	while (true) {
2831 		bigtime_t start = system_time();
2832 
2833 		status_t status = acquire_sem_etc(sEventSemaphore, 1,
2834 			B_RELATIVE_TIMEOUT, timeout);
2835 		if (status == B_OK) {
2836 			flush_pending_notifications();
2837 			timeout -= system_time() - start;
2838 			continue;
2839 		}
2840 
2841 		// Write 64 blocks of each block_cache roughly every 2 seconds,
2842 		// potentially more or less depending on congestion and drive speeds
2843 		// (usually much less.) We do not want to queue everything at once
2844 		// because a future transaction might then get held up waiting for
2845 		// a specific block to be written.
2846 		timeout = kDefaultTimeout;
2847 		size_t usedMemory;
2848 		object_cache_get_usage(sBlockCache, &usedMemory);
2849 
2850 		block_cache* cache = NULL;
2851 		while ((cache = get_next_locked_block_cache(cache)) != NULL) {
2852 			// Give some breathing room: between writes, wait at least twice as
2853 			// long as a maximum-sized (64 block) write would take, and also skip
2854 			// this cache if more than 16 blocks are currently being written.
2855 			const bigtime_t next = cache->last_block_write
2856 					+ cache->last_block_write_duration * 2 * 64;
2857 			if (cache->busy_writing_count > 16 || system_time() < next) {
2858 				if (cache->last_block_write_duration > 0) {
2859 					timeout = min_c(timeout,
2860 						cache->last_block_write_duration * 2 * 64);
2861 				}
2862 				continue;
2863 			}
2864 
2865 			BlockWriter writer(cache, 64);
2866 			bool hasMoreBlocks = false;
2867 
2868 			size_t cacheUsedMemory;
2869 			object_cache_get_usage(cache->buffer_cache, &cacheUsedMemory);
2870 			usedMemory += cacheUsedMemory;
2871 
2872 			if (cache->num_dirty_blocks) {
2873 				// This cache is not using transactions, we'll scan the blocks
2874 				// directly
2875 				BlockTable::Iterator iterator(cache->hash);
2876 
2877 				while (iterator.HasNext()) {
2878 					cached_block* block = iterator.Next();
2879 					if (block->CanBeWritten() && !writer.Add(block)) {
2880 						hasMoreBlocks = true;
2881 						break;
2882 					}
2883 				}
2884 			} else {
2885 				TransactionTable::Iterator iterator(cache->transaction_hash);
2886 
2887 				while (iterator.HasNext()) {
2888 					cache_transaction* transaction = iterator.Next();
2889 					if (transaction->open) {
2890 						if (system_time() > transaction->last_used
2891 								+ kTransactionIdleTime) {
2892 							// Transaction is open but idle
2893 							notify_transaction_listeners(cache, transaction,
2894 								TRANSACTION_IDLE);
2895 						}
2896 						continue;
2897 					}
2898 
2899 					bool hasLeftOvers;
2900 						// we ignore this one
2901 					if (!writer.Add(transaction, hasLeftOvers)) {
2902 						hasMoreBlocks = true;
2903 						break;
2904 					}
2905 				}
2906 			}
2907 
2908 			writer.Write();
2909 
2910 			if (hasMoreBlocks && cache->last_block_write_duration > 0) {
2911 				// There are probably still more blocks that we could write, so
2912 				// see if we can decrease the timeout.
2913 				timeout = min_c(timeout,
2914 					cache->last_block_write_duration * 2 * 64);
2915 			}
2916 
2917 			if ((block_cache_used_memory() / B_PAGE_SIZE)
2918 					> vm_page_num_pages() / 2) {
2919 				// Try to reduce memory usage to half of the available
2920 				// RAM at maximum
2921 				cache->RemoveUnusedBlocks(1000, 10);
2922 			}
2923 		}
2924 
2925 		MutexLocker _(sCachesMemoryUseLock);
2926 		sUsedMemory = usedMemory;
2927 	}
2928 
2929 	// we can never get here
2930 	return B_OK;
2931 }
2932 
2933 
2934 /*!	Notify function for wait_for_notifications(). */
2935 static void
2936 notify_sync(int32 transactionID, int32 event, void* _cache)
2937 {
2938 	block_cache* cache = (block_cache*)_cache;
2939 
2940 	cache->condition_variable.NotifyOne();
2941 }
2942 
2943 
2944 /*!	Must be called with the sCachesLock held. */
2945 static bool
2946 is_valid_cache(block_cache* cache)
2947 {
2948 	ASSERT_LOCKED_MUTEX(&sCachesLock);
2949 
2950 	DoublyLinkedList<block_cache>::Iterator iterator = sCaches.GetIterator();
2951 	while (iterator.HasNext()) {
2952 		if (cache == iterator.Next())
2953 			return true;
2954 	}
2955 
2956 	return false;
2957 }
2958 
2959 
2960 /*!	Waits until all pending notifications are carried out.
2961 	Safe to be called from the block writer/notifier thread.
2962 	You must not hold the \a cache lock when calling this function.
2963 */
2964 static void
2965 wait_for_notifications(block_cache* cache)
2966 {
2967 	MutexLocker locker(sCachesLock);
2968 
2969 	if (find_thread(NULL) == sNotifierWriterThread) {
2970 		// We're the notifier thread, don't wait, but flush all pending
2971 		// notifications directly.
2972 		if (is_valid_cache(cache))
2973 			flush_pending_notifications(cache);
2974 		return;
2975 	}
2976 
2977 	// add sync notification
2978 	cache_notification notification;
2979 	set_notification(NULL, notification, TRANSACTION_WRITTEN, notify_sync,
2980 		cache);
2981 
2982 	ConditionVariableEntry entry;
2983 	cache->condition_variable.Add(&entry);
2984 
2985 	add_notification(cache, &notification, TRANSACTION_WRITTEN, false);
2986 	locker.Unlock();
2987 
2988 	// wait for notification hook to be called
2989 	entry.Wait();
2990 }
2991 
2992 
2993 status_t
2994 block_cache_init(void)
2995 {
2996 	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
2997 		8, 0, 0, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
2998 	if (sBlockCache == NULL)
2999 		return B_NO_MEMORY;
3000 
3001 	sCacheNotificationCache = create_object_cache("cache notifications",
3002 		sizeof(cache_listener), 8, NULL, NULL, NULL);
3003 	if (sCacheNotificationCache == NULL)
3004 		return B_NO_MEMORY;
3005 
3006 	new (&sCaches) DoublyLinkedList<block_cache>;
3007 		// manually call constructor
3008 
3009 	sEventSemaphore = create_sem(0, "block cache event");
3010 	if (sEventSemaphore < B_OK)
3011 		return sEventSemaphore;
3012 
3013 	sNotifierWriterThread = spawn_kernel_thread(&block_notifier_and_writer,
3014 		"block notifier/writer", B_LOW_PRIORITY, NULL);
3015 	if (sNotifierWriterThread >= B_OK)
3016 		resume_thread(sNotifierWriterThread);
3017 
3018 #if DEBUG_BLOCK_CACHE
3019 	add_debugger_command_etc("block_caches", &dump_caches,
3020 		"dumps all block caches", "\n", 0);
3021 	add_debugger_command_etc("block_cache", &dump_cache,
3022 		"dumps a specific block cache",
3023 		"[-bt] <cache-address> [block-number]\n"
3024 		"  -t lists the transactions\n"
3025 		"  -b lists all blocks\n", 0);
3026 	add_debugger_command("cached_block", &dump_cached_block,
3027 		"dumps the specified cached block");
3028 	add_debugger_command_etc("transaction", &dump_transaction,
3029 		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
3030 		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
3031 		"  -b lists all blocks that are part of this transaction\n", 0);
3032 #	if BLOCK_CACHE_BLOCK_TRACING >= 2
3033 	add_debugger_command_etc("block_cache_data", &dump_block_data,
3034 		"dumps the data blocks logged for the actions",
3035 		"[-cpo] <from> [<to> [<offset> [<size>]]]\n"
3036 		"If no data specifier is used, all blocks are shown by default.\n"
3037 		" -c       the current data is shown, if available.\n"
3038 		" -p       the parent data is shown, if available.\n"
3039 		" -o       the original data is shown, if available.\n"
3040 		" <from>   first index of tracing entries to show.\n"
3041 		" <to>     if given, the last entry. If not, only <from> is shown.\n"
3042 		" <offset> the offset of the block data.\n"
3043 		" <size>   the size of the block data that is dumped\n", 0);
3044 #	endif
3045 #endif	// DEBUG_BLOCK_CACHE
3046 
3047 	return B_OK;
3048 }
3049 
3050 
3051 size_t
3052 block_cache_used_memory(void)
3053 {
3054 	MutexLocker _(sCachesMemoryUseLock);
3055 	return sUsedMemory;
3056 }
3057 
3058 
3059 //	#pragma mark - public transaction API
3060 
3061 
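/*!	Starts a new transaction and returns its ID.
	Only one transaction can be open per cache at a time; it has to be ended
	via cache_end_transaction() or aborted before the next one is started.

	A minimal usage sketch (hypothetical caller; see fs_cache.h for the exact
	prototypes of the block accessors used below):

		int32 id = cache_start_transaction(cache);
		void* block = block_cache_get_writable(cache, blockNumber, id);
		// ... modify the block ...
		block_cache_put(cache, blockNumber);
		cache_end_transaction(cache, id, NULL, NULL);
*/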
3062 int32
3063 cache_start_transaction(void* _cache)
3064 {
3065 	block_cache* cache = (block_cache*)_cache;
3066 	TransactionLocker locker(cache);
3067 
3068 	if (cache->last_transaction && cache->last_transaction->open) {
3069 		panic("last transaction (%" B_PRId32 ") still open!\n",
3070 			cache->last_transaction->id);
3071 	}
3072 
3073 	cache_transaction* transaction = new(std::nothrow) cache_transaction;
3074 	if (transaction == NULL)
3075 		return B_NO_MEMORY;
3076 
3077 	transaction->id = atomic_add(&cache->next_transaction_id, 1);
3078 	cache->last_transaction = transaction;
3079 
3080 	TRACE(("cache_start_transaction(): id %" B_PRId32 " started\n", transaction->id));
3081 	T(Action("start", cache, transaction));
3082 
3083 	cache->transaction_hash->Insert(transaction);
3084 
3085 	return transaction->id;
3086 }
3087 
3088 
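/*!	Writes back the blocks of all closed transactions with an ID up to (and
	including) \a id, and waits until the writes as well as the corresponding
	TRANSACTION_WRITTEN notifications have completed.
*/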
3089 status_t
3090 cache_sync_transaction(void* _cache, int32 id)
3091 {
3092 	block_cache* cache = (block_cache*)_cache;
3093 	bool hadBusy;
3094 
3095 	TRACE(("cache_sync_transaction(id %" B_PRId32 ")\n", id));
3096 
3097 	do {
3098 		TransactionLocker locker(cache);
3099 		hadBusy = false;
3100 
3101 		BlockWriter writer(cache);
3102 		TransactionTable::Iterator iterator(cache->transaction_hash);
3103 
3104 		while (iterator.HasNext()) {
3105 			// close all earlier transactions which haven't been closed yet
3106 			cache_transaction* transaction = iterator.Next();
3107 
3108 			if (transaction->busy_writing_count != 0) {
3109 				hadBusy = true;
3110 				continue;
3111 			}
3112 			if (transaction->id <= id && !transaction->open) {
3113 				// write back all of their remaining dirty blocks
3114 				T(Action("sync", cache, transaction));
3115 
3116 				bool hasLeftOvers;
3117 				writer.Add(transaction, hasLeftOvers);
3118 
3119 				if (hasLeftOvers) {
3120 					// This transaction contains blocks that a previous
3121 					// transaction is trying to write back in this write run
3122 					hadBusy = true;
3123 				}
3124 			}
3125 		}
3126 
3127 		status_t status = writer.Write();
3128 		if (status != B_OK)
3129 			return status;
3130 	} while (hadBusy);
3131 
3132 	wait_for_notifications(cache);
3133 		// make sure that all pending TRANSACTION_WRITTEN notifications
3134 		// are handled after we return
3135 	return B_OK;
3136 }
3137 
3138 
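/*!	Ends (closes) the transaction \a id: the preserved original block
	contents are released, and the blocks are moved to the transaction's
	write-back list. If a \a hook is given, it is called once the transaction
	has been completely written back to disk (TRANSACTION_WRITTEN).
*/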
3139 status_t
3140 cache_end_transaction(void* _cache, int32 id,
3141 	transaction_notification_hook hook, void* data)
3142 {
3143 	block_cache* cache = (block_cache*)_cache;
3144 	TransactionLocker locker(cache);
3145 
3146 	TRACE(("cache_end_transaction(id = %" B_PRId32 ")\n", id));
3147 
3148 	cache_transaction* transaction = lookup_transaction(cache, id);
3149 	if (transaction == NULL) {
3150 		panic("cache_end_transaction(): invalid transaction ID\n");
3151 		return B_BAD_VALUE;
3152 	}
3153 
3154 	// Write back all pending transaction blocks
3155 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
3156 	if (status != B_OK)
3157 		return status;
3158 
3159 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3160 
3161 	if (hook != NULL
3162 		&& add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN,
3163 			hook, data) != B_OK) {
3164 		return B_NO_MEMORY;
3165 	}
3166 
3167 	T(Action("end", cache, transaction));
3168 
3169 	// iterate through all blocks and free the unchanged original contents
3170 
3171 	cached_block* next;
3172 	for (cached_block* block = transaction->first_block; block != NULL;
3173 			block = next) {
3174 		next = block->transaction_next;
3175 		ASSERT(block->previous_transaction == NULL);
3176 
3177 		if (block->discard) {
3178 			// This block has been discarded in the transaction
3179 			cache->DiscardBlock(block);
3180 			transaction->num_blocks--;
3181 			continue;
3182 		}
3183 
3184 		if (block->original_data != NULL) {
3185 			cache->Free(block->original_data);
3186 			block->original_data = NULL;
3187 		}
3188 		if (block->parent_data != NULL) {
3189 			ASSERT(transaction->has_sub_transaction);
3190 			cache->FreeBlockParentData(block);
3191 		}
3192 
3193 		// move the block to the previous transaction list
3194 		transaction->blocks.Add(block);
3195 
3196 		block->previous_transaction = transaction;
3197 		block->transaction_next = NULL;
3198 		block->transaction = NULL;
3199 	}
3200 
3201 	transaction->open = false;
3202 	return B_OK;
3203 }
3204 
3205 
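/*!	Aborts the open transaction \a id: all of its blocks are restored to
	their original contents and detached from the transaction, which is then
	deleted.
*/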
3206 status_t
3207 cache_abort_transaction(void* _cache, int32 id)
3208 {
3209 	block_cache* cache = (block_cache*)_cache;
3210 	TransactionLocker locker(cache);
3211 
3212 	TRACE(("cache_abort_transaction(id = %" B_PRId32 ")\n", id));
3213 
3214 	cache_transaction* transaction = lookup_transaction(cache, id);
3215 	if (transaction == NULL) {
3216 		panic("cache_abort_transaction(): invalid transaction ID\n");
3217 		return B_BAD_VALUE;
3218 	}
3219 
3220 	T(Abort(cache, transaction));
3221 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3222 
3223 	// iterate through all blocks and restore their original contents
3224 
3225 	cached_block* block = transaction->first_block;
3226 	cached_block* next;
3227 	for (; block != NULL; block = next) {
3228 		next = block->transaction_next;
3229 
3230 		if (block->original_data != NULL) {
3231 			TRACE(("cache_abort_transaction(id = %" B_PRId32 "): restored contents of "
3232 				"block %" B_PRIdOFF "\n", transaction->id, block->block_number));
3233 			memcpy(block->current_data, block->original_data,
3234 				cache->block_size);
3235 			cache->Free(block->original_data);
3236 			block->original_data = NULL;
3237 		}
3238 		if (transaction->has_sub_transaction && block->parent_data != NULL)
3239 			cache->FreeBlockParentData(block);
3240 
3241 		block->transaction_next = NULL;
3242 		block->transaction = NULL;
3243 		block->discard = false;
3244 		if (block->previous_transaction == NULL)
3245 			block->is_dirty = false;
3246 	}
3247 
3248 	cache->transaction_hash->Remove(transaction);
3249 	delete_transaction(cache, transaction);
3250 	return B_OK;
3251 }
3252 
3253 
3254 /*!	Acknowledges the current parent transaction, and starts a new transaction
3255 	from its sub transaction.
3256 	The new transaction also gets a new transaction ID.
3257 */
3258 int32
3259 cache_detach_sub_transaction(void* _cache, int32 id,
3260 	transaction_notification_hook hook, void* data)
3261 {
3262 	block_cache* cache = (block_cache*)_cache;
3263 	TransactionLocker locker(cache);
3264 
3265 	TRACE(("cache_detach_sub_transaction(id = %" B_PRId32 ")\n", id));
3266 
3267 	cache_transaction* transaction = lookup_transaction(cache, id);
3268 	if (transaction == NULL) {
3269 		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
3270 		return B_BAD_VALUE;
3271 	}
3272 	if (!transaction->has_sub_transaction)
3273 		return B_BAD_VALUE;
3274 
3275 	// iterate through all blocks and free the unchanged original contents
3276 
3277 	status_t status = write_blocks_in_previous_transaction(cache, transaction);
3278 	if (status != B_OK)
3279 		return status;
3280 
3281 	// create a new transaction for the sub transaction
3282 	cache_transaction* newTransaction = new(std::nothrow) cache_transaction;
3283 	if (newTransaction == NULL)
3284 		return B_NO_MEMORY;
3285 
3286 	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
3287 	T(Detach(cache, transaction, newTransaction));
3288 
3289 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3290 
3291 	if (add_transaction_listener(cache, transaction, TRANSACTION_WRITTEN, hook,
3292 			data) != B_OK) {
3293 		delete newTransaction;
3294 		return B_NO_MEMORY;
3295 	}
3296 
3297 	cached_block* last = NULL;
3298 	cached_block* next;
3299 	for (cached_block* block = transaction->first_block; block != NULL;
3300 			block = next) {
3301 		next = block->transaction_next;
3302 		ASSERT(block->previous_transaction == NULL);
3303 
3304 		if (block->discard) {
3305 			cache->DiscardBlock(block);
3306 			transaction->main_num_blocks--;
3307 			continue;
3308 		}
3309 
3310 		if (block->parent_data != NULL) {
3311 			// The block changed in the parent - free the original data, since
3312 			// it will be replaced by the current contents.
3313 			ASSERT(block->original_data != NULL);
3314 			cache->Free(block->original_data);
3315 
3316 			if (block->parent_data != block->current_data) {
3317 				// The block had been changed in both transactions
3318 				block->original_data = block->parent_data;
3319 			} else {
3320 				// The block has only been changed in the parent
3321 				block->original_data = NULL;
3322 			}
3323 			block->parent_data = NULL;
3324 
3325 			// move the block to the previous transaction list
3326 			transaction->blocks.Add(block);
3327 			block->previous_transaction = transaction;
3328 		}
3329 
3330 		if (block->original_data != NULL) {
3331 			// This block had been changed in the current sub transaction,
3332 			// we need to move this block over to the new transaction.
3333 			ASSERT(block->parent_data == NULL);
3334 
3335 			if (last == NULL)
3336 				newTransaction->first_block = block;
3337 			else
3338 				last->transaction_next = block;
3339 
3340 			block->transaction = newTransaction;
3341 			last = block;
3342 		} else
3343 			block->transaction = NULL;
3344 
3345 		block->transaction_next = NULL;
3346 	}
3347 
3348 	newTransaction->num_blocks = transaction->sub_num_blocks;
3349 
3350 	transaction->open = false;
3351 	transaction->has_sub_transaction = false;
3352 	transaction->num_blocks = transaction->main_num_blocks;
3353 	transaction->sub_num_blocks = 0;
3354 
3355 	cache->transaction_hash->Insert(newTransaction);
3356 	cache->last_transaction = newTransaction;
3357 
3358 	return newTransaction->id;
3359 }
3360 
3361 
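/*!	Aborts only the current sub transaction of \a id: blocks changed in the
	sub transaction are reverted to the contents they had in the parent
	transaction, and subsequent changes will go into the main transaction
	again.
*/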
3362 status_t
3363 cache_abort_sub_transaction(void* _cache, int32 id)
3364 {
3365 	block_cache* cache = (block_cache*)_cache;
3366 	TransactionLocker locker(cache);
3367 
3368 	TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 ")\n", id));
3369 
3370 	cache_transaction* transaction = lookup_transaction(cache, id);
3371 	if (transaction == NULL) {
3372 		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
3373 		return B_BAD_VALUE;
3374 	}
3375 	if (!transaction->has_sub_transaction)
3376 		return B_BAD_VALUE;
3377 
3378 	T(Abort(cache, transaction));
3379 	notify_transaction_listeners(cache, transaction, TRANSACTION_ABORTED);
3380 
3381 	// revert all changes back to the version of the parent
3382 
3383 	cached_block* block = transaction->first_block;
3384 	cached_block* last = NULL;
3385 	cached_block* next;
3386 	for (; block != NULL; block = next) {
3387 		next = block->transaction_next;
3388 
3389 		if (block->parent_data == NULL) {
3390 			// The parent transaction didn't change the block, but the sub
3391 			// transaction did - we need to revert to the original data.
3392 			// The block is no longer part of the transaction
3393 			if (block->original_data != NULL) {
3394 				// The block might not have original data if it was empty
3395 				memcpy(block->current_data, block->original_data,
3396 					cache->block_size);
3397 			}
3398 
3399 			if (last != NULL)
3400 				last->transaction_next = next;
3401 			else
3402 				transaction->first_block = next;
3403 
3404 			block->transaction_next = NULL;
3405 			block->transaction = NULL;
3406 			transaction->num_blocks--;
3407 
3408 			if (block->previous_transaction == NULL) {
3409 				cache->Free(block->original_data);
3410 				block->original_data = NULL;
3411 				block->is_dirty = false;
3412 
3413 				if (block->ref_count == 0) {
3414 					// Move the block into the unused list if possible
3415 					block->unused = true;
3416 					cache->unused_blocks.Add(block);
3417 					cache->unused_block_count++;
3418 				}
3419 			}
3420 		} else {
3421 			if (block->parent_data != block->current_data) {
3422 				// The block has been changed and must be restored - the block
3423 				// is still dirty and part of the transaction
3424 				TRACE(("cache_abort_sub_transaction(id = %" B_PRId32 "): "
3425 					"restored contents of block %" B_PRIdOFF "\n",
3426 					transaction->id, block->block_number));
3427 				memcpy(block->current_data, block->parent_data,
3428 					cache->block_size);
3429 				cache->Free(block->parent_data);
3430 				// The block stays dirty
3431 			}
3432 			block->parent_data = NULL;
3433 			last = block;
3434 		}
3435 
3436 		block->discard = false;
3437 	}
3438 
3439 	// all subsequent changes will go into the main transaction
3440 	transaction->has_sub_transaction = false;
3441 	transaction->sub_num_blocks = 0;
3442 
3443 	return B_OK;
3444 }
3445 
3446 
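/*!	Starts a sub transaction within the open transaction \a id. The changes
	of an earlier sub transaction are acknowledged and moved up to the parent;
	all subsequent changes can later be detached via
	cache_detach_sub_transaction() or reverted via
	cache_abort_sub_transaction() without touching the parent transaction.
*/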
3447 status_t
3448 cache_start_sub_transaction(void* _cache, int32 id)
3449 {
3450 	block_cache* cache = (block_cache*)_cache;
3451 	TransactionLocker locker(cache);
3452 
3453 	TRACE(("cache_start_sub_transaction(id = %" B_PRId32 ")\n", id));
3454 
3455 	cache_transaction* transaction = lookup_transaction(cache, id);
3456 	if (transaction == NULL) {
3457 		panic("cache_start_sub_transaction(): invalid transaction ID %" B_PRId32 "\n",
3458 			id);
3459 		return B_BAD_VALUE;
3460 	}
3461 
3462 	notify_transaction_listeners(cache, transaction, TRANSACTION_ENDED);
3463 
3464 	// move all changed blocks up to the parent
3465 
3466 	cached_block* block = transaction->first_block;
3467 	cached_block* next;
3468 	for (; block != NULL; block = next) {
3469 		next = block->transaction_next;
3470 
3471 		if (block->parent_data != NULL) {
3472 			// There already is an older sub transaction - we acknowledge
3473 			// its changes and move its blocks up to the parent
3474 			ASSERT(transaction->has_sub_transaction);
3475 			cache->FreeBlockParentData(block);
3476 		}
3477 		if (block->discard) {
3478 			// This block has been discarded in the parent transaction.
3479 			// Just throw away any changes made in this transaction, so that
3480 			// it can still be reverted to its original contents if needed
3481 			ASSERT(block->previous_transaction == NULL);
3482 			if (block->original_data != NULL) {
3483 				memcpy(block->current_data, block->original_data,
3484 					cache->block_size);
3485 
3486 				cache->Free(block->original_data);
3487 				block->original_data = NULL;
3488 			}
3489 			continue;
3490 		}
3491 
3492 		// we "allocate" the parent data lazily, which means we don't copy the
3493 		// data (and allocate memory for it) until it is actually needed
3494 		block->parent_data = block->current_data;
3495 	}
3496 
3497 	// all subsequent changes will go into the sub transaction
3498 	transaction->has_sub_transaction = true;
3499 	transaction->main_num_blocks = transaction->num_blocks;
3500 	transaction->sub_num_blocks = 0;
3501 	T(Action("start-sub", cache, transaction));
3502 
3503 	return B_OK;
3504 }
3505 
3506 
3507 /*!	Adds a transaction listener that gets notified when the transaction
3508 	is ended, aborted, written, or idle as specified by \a events.
3509 	The listener gets automatically removed when the transaction ends.
3510 */
3511 status_t
3512 cache_add_transaction_listener(void* _cache, int32 id, int32 events,
3513 	transaction_notification_hook hook, void* data)
3514 {
3515 	block_cache* cache = (block_cache*)_cache;
3516 	TransactionLocker locker(cache);
3517 
3518 	cache_transaction* transaction = lookup_transaction(cache, id);
3519 	if (transaction == NULL)
3520 		return B_BAD_VALUE;
3521 
3522 	return add_transaction_listener(cache, transaction, events, hook, data);
3523 }
3524 
3525 
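/*!	Removes the transaction listener that was installed with the given
	\a hookFunction and \a data for the transaction \a id.
*/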
3526 status_t
3527 cache_remove_transaction_listener(void* _cache, int32 id,
3528 	transaction_notification_hook hookFunction, void* data)
3529 {
3530 	block_cache* cache = (block_cache*)_cache;
3531 	TransactionLocker locker(cache);
3532 
3533 	cache_transaction* transaction = lookup_transaction(cache, id);
3534 	if (transaction == NULL)
3535 		return B_BAD_VALUE;
3536 
3537 	ListenerList::Iterator iterator = transaction->listeners.GetIterator();
3538 	while (iterator.HasNext()) {
3539 		cache_listener* listener = iterator.Next();
3540 		if (listener->data == data && listener->hook == hookFunction) {
3541 			iterator.Remove();
3542 
3543 			if (listener->events_pending != 0) {
3544 				MutexLocker _(sNotificationsLock);
3545 				if (listener->events_pending != 0)
3546 					cache->pending_notifications.Remove(listener);
3547 			}
3548 			delete listener;
3549 			return B_OK;
3550 		}
3551 	}
3552 
3553 	return B_ENTRY_NOT_FOUND;
3554 }
3555 
3556 
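/*!	Iterates over the blocks that are part of the open transaction \a id.
	\a _cookie must point to a value initialized to zero for the first call.
	If \a mainOnly is \c true and a sub transaction exists, only blocks that
	were changed in the main transaction are returned.
	Returns \c B_ENTRY_NOT_FOUND once all blocks have been visited.
*/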
3557 status_t
3558 cache_next_block_in_transaction(void* _cache, int32 id, bool mainOnly,
3559 	long* _cookie, off_t* _blockNumber, void** _data, void** _unchangedData)
3560 {
3561 	cached_block* block = (cached_block*)*_cookie;
3562 	block_cache* cache = (block_cache*)_cache;
3563 	TransactionLocker locker(cache);
3564 
3565 	cache_transaction* transaction = lookup_transaction(cache, id);
3566 	if (transaction == NULL || !transaction->open)
3567 		return B_BAD_VALUE;
3568 
3569 	if (block == NULL)
3570 		block = transaction->first_block;
3571 	else
3572 		block = block->transaction_next;
3573 
3574 	if (transaction->has_sub_transaction) {
3575 		if (mainOnly) {
3576 			// find next block that the parent changed
3577 			while (block != NULL && block->parent_data == NULL)
3578 				block = block->transaction_next;
3579 		} else {
3580 			// find next non-discarded block
3581 			while (block != NULL && block->discard)
3582 				block = block->transaction_next;
3583 		}
3584 	}
3585 
3586 	if (block == NULL)
3587 		return B_ENTRY_NOT_FOUND;
3588 
3589 	if (_blockNumber)
3590 		*_blockNumber = block->block_number;
3591 	if (_data)
3592 		*_data = mainOnly ? block->parent_data : block->current_data;
3593 	if (_unchangedData)
3594 		*_unchangedData = block->original_data;
3595 
3596 	*_cookie = (addr_t)block;
3597 	return B_OK;
3598 }
3599 
3600 
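/*!	Returns the total number of blocks that are part of the transaction
	\a id, or \c B_BAD_VALUE if there is no such transaction.
*/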
3601 int32
3602 cache_blocks_in_transaction(void* _cache, int32 id)
3603 {
3604 	block_cache* cache = (block_cache*)_cache;
3605 	TransactionLocker locker(cache);
3606 
3607 	cache_transaction* transaction = lookup_transaction(cache, id);
3608 	if (transaction == NULL)
3609 		return B_BAD_VALUE;
3610 
3611 	return transaction->num_blocks;
3612 }
3613 
3614 
3615 /*!	Returns the number of blocks that are part of the main transaction. If this
3616 	transaction does not have a sub transaction yet, this is the same value as
3617 	cache_blocks_in_transaction() would return.
3618 */
3619 int32
3620 cache_blocks_in_main_transaction(void* _cache, int32 id)
3621 {
3622 	block_cache* cache = (block_cache*)_cache;
3623 	TransactionLocker locker(cache);
3624 
3625 	cache_transaction* transaction = lookup_transaction(cache, id);
3626 	if (transaction == NULL)
3627 		return B_BAD_VALUE;
3628 
3629 	if (transaction->has_sub_transaction)
3630 		return transaction->main_num_blocks;
3631 
3632 	return transaction->num_blocks;
3633 }
3634 
3635 
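/*!	Returns the number of blocks that have been changed in the current sub
	transaction of \a id.
*/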
3636 int32
3637 cache_blocks_in_sub_transaction(void* _cache, int32 id)
3638 {
3639 	block_cache* cache = (block_cache*)_cache;
3640 	TransactionLocker locker(cache);
3641 
3642 	cache_transaction* transaction = lookup_transaction(cache, id);
3643 	if (transaction == NULL)
3644 		return B_BAD_VALUE;
3645 
3646 	return transaction->sub_num_blocks;
3647 }
3648 
3649 
3650 /*!	Checks whether the block \a blockNumber is part of transaction \a id.
3651 */
3652 bool
3653 cache_has_block_in_transaction(void* _cache, int32 id, off_t blockNumber)
3654 {
3655 	block_cache* cache = (block_cache*)_cache;
3656 	TransactionLocker locker(cache);
3657 
3658 	cached_block* block = cache->hash->Lookup(blockNumber);
3659 
3660 	return (block != NULL && block->transaction != NULL
3661 		&& block->transaction->id == id);
3662 }
3663 
3664 
3665 //	#pragma mark - public block cache API
3666 
3667 
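/*!	Deletes the block cache \a _cache. If \a allowWrites is \c true, all
	dirty blocks that can be written are flushed to disk first; any remaining
	transactions are discarded.
*/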
3668 void
3669 block_cache_delete(void* _cache, bool allowWrites)
3670 {
3671 	block_cache* cache = (block_cache*)_cache;
3672 
3673 	if (allowWrites)
3674 		block_cache_sync(cache);
3675 
3676 	mutex_lock(&sCachesLock);
3677 	sCaches.Remove(cache);
3678 	mutex_unlock(&sCachesLock);
3679 
3680 	mutex_lock(&cache->lock);
3681 
3682 	// wait for all blocks to become unbusy
3683 	wait_for_busy_reading_blocks(cache);
3684 	wait_for_busy_writing_blocks(cache);
3685 
3686 	// free all blocks
3687 
3688 	cached_block* block = cache->hash->Clear(true);
3689 	while (block != NULL) {
3690 		cached_block* next = block->next;
3691 		cache->FreeBlock(block);
3692 		block = next;
3693 	}
3694 
3695 	// free all transactions (they will all be aborted)
3696 
3697 	cache_transaction* transaction = cache->transaction_hash->Clear(true);
3698 	while (transaction != NULL) {
3699 		cache_transaction* next = transaction->next;
3700 		delete transaction;
3701 		transaction = next;
3702 	}
3703 
3704 	delete cache;
3705 }
3706 
3707 
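/*!	Creates a new block cache for the device or file opened at \a fd that
	covers \a numBlocks blocks of \a blockSize bytes each.
	Returns an opaque handle to be passed to the other block cache functions,
	or \c NULL if the cache could not be created.

	A minimal life cycle sketch (assuming \c fd refers to an already opened
	device):

		void* cache = block_cache_create(fd, numBlocks, blockSize, false);
		// ... use the cache ...
		block_cache_sync(cache);
		block_cache_delete(cache, true);
*/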
3708 void*
3709 block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
3710 {
3711 	block_cache* cache = new(std::nothrow) block_cache(fd, numBlocks, blockSize,
3712 		readOnly);
3713 	if (cache == NULL)
3714 		return NULL;
3715 
3716 	if (cache->Init() != B_OK) {
3717 		delete cache;
3718 		return NULL;
3719 	}
3720 
3721 	MutexLocker _(sCachesLock);
3722 	sCaches.Add(cache);
3723 
3724 	return cache;
3725 }
3726 
3727 
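/*!	Writes back all dirty blocks of the cache that belong to a closed
	transaction or to no transaction at all, and waits until the pending
	TRANSACTION_WRITTEN notifications have been delivered.
*/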
3728 status_t
3729 block_cache_sync(void* _cache)
3730 {
3731 	block_cache* cache = (block_cache*)_cache;
3732 
3733 	// We will sync all dirty blocks to disk that have a completed
3734 	// transaction or no transaction only
3735 
3736 	MutexLocker locker(&cache->lock);
3737 
3738 	BlockWriter writer(cache);
3739 	BlockTable::Iterator iterator(cache->hash);
3740 
3741 	while (iterator.HasNext()) {
3742 		cached_block* block = iterator.Next();
3743 		if (block->CanBeWritten())
3744 			writer.Add(block);
3745 	}
3746 
3747 	status_t status = writer.Write();
3748 
3749 	locker.Unlock();
3750 
3751 	wait_for_notifications(cache);
3752 		// make sure that all pending TRANSACTION_WRITTEN notifications
3753 		// are handled after we return
3754 	return status;
3755 }
3756 
3757 
3758 status_t
3759 block_cache_sync_etc(void* _cache, off_t blockNumber, size_t numBlocks)
3760 {
3761 	block_cache* cache = (block_cache*)_cache;
3762 
3763 	// We will sync all dirty blocks to disk that have a completed
3764 	// transaction or no transaction only
3765 
3766 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
3767 		panic("block_cache_sync_etc: invalid block number %" B_PRIdOFF
3768 			" (max %" B_PRIdOFF ")",
3769 			blockNumber, cache->max_blocks - 1);
3770 		return B_BAD_VALUE;
3771 	}
3772 
3773 	MutexLocker locker(&cache->lock);
3774 	BlockWriter writer(cache);
3775 
3776 	for (; numBlocks > 0; numBlocks--, blockNumber++) {
3777 		cached_block* block = cache->hash->Lookup(blockNumber);
3778 		if (block == NULL)
3779 			continue;
3780 
3781 		if (block->CanBeWritten())
3782 			writer.Add(block);
3783 	}
3784 
3785 	status_t status = writer.Write();
3786 
3787 	locker.Unlock();
3788 
3789 	wait_for_notifications(cache);
3790 		// make sure that all pending TRANSACTION_WRITTEN notifications
3791 		// are handled after we return
3792 	return status;
3793 }
3794 
3795 
3796 /*!	Discards a block from the current transaction or from the cache.
3797 	You have to call this function when you no longer use a block, ie. when it
3798 	might be reclaimed by the file cache in order to make sure they won't
3799 	interfere.
3800 */
3801 void
3802 block_cache_discard(void* _cache, off_t blockNumber, size_t numBlocks)
3803 {
3804 	// TODO: this could be a nice place to issue the ATA trim command
3805 	block_cache* cache = (block_cache*)_cache;
3806 	TransactionLocker locker(cache);
3807 
3808 	BlockWriter writer(cache);
3809 
3810 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3811 		cached_block* block = cache->hash->Lookup(blockNumber);
3812 		if (block != NULL && block->previous_transaction != NULL)
3813 			writer.Add(block);
3814 	}
3815 
3816 	writer.Write();
3817 		// TODO: this can fail, too!
3818 
3819 	blockNumber -= numBlocks;
3820 		// reset blockNumber to its original value
3821 
3822 	for (size_t i = 0; i < numBlocks; i++, blockNumber++) {
3823 		cached_block* block = cache->hash->Lookup(blockNumber);
3824 		if (block == NULL)
3825 			continue;
3826 
3827 		ASSERT(block->previous_transaction == NULL);
3828 
3829 		if (block->unused) {
3830 			cache->unused_blocks.Remove(block);
3831 			cache->unused_block_count--;
3832 			cache->RemoveBlock(block);
3833 		} else {
3834 			if (block->transaction != NULL && block->parent_data != NULL
3835 				&& block->parent_data != block->current_data) {
3836 				panic("Discarded block %" B_PRIdOFF " has already been changed in this "
3837 					"transaction!", blockNumber);
3838 			}
3839 
3840 			// mark it as discarded (in the current transaction only, if any)
3841 			block->discard = true;
3842 		}
3843 	}
3844 }
3845 
3846 
3847 status_t
3848 block_cache_make_writable(void* _cache, off_t blockNumber, int32 transaction)
3849 {
3850 	block_cache* cache = (block_cache*)_cache;
3851 	MutexLocker locker(&cache->lock);
3852 
3853 	if (cache->read_only) {
3854 		panic("tried to make block writable on a read-only cache!");
3855 		return B_ERROR;
3856 	}
3857 
3858 	// TODO: this can be done better!
3859 	void* block;
3860 	status_t status = get_writable_cached_block(cache, blockNumber,
3861 		transaction, false, &block);
3862 	if (status == B_OK) {
3863 		put_cached_block((block_cache*)_cache, blockNumber);
3864 		return B_OK;
3865 	}
3866 
3867 	return status;
3868 }
3869 
3870 
3871 status_t
3872 block_cache_get_writable_etc(void* _cache, off_t blockNumber,
3873 	int32 transaction, void** _block)
3874 {
3875 	block_cache* cache = (block_cache*)_cache;
3876 	MutexLocker locker(&cache->lock);
3877 
3878 	TRACE(("block_cache_get_writable_etc(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3879 		blockNumber, transaction));
3880 	if (cache->read_only)
3881 		panic("tried to get writable block on a read-only cache!");
3882 
3883 	return get_writable_cached_block(cache, blockNumber,
3884 		transaction, false, _block);
3885 }
3886 
3887 
3888 void*
3889 block_cache_get_writable(void* _cache, off_t blockNumber, int32 transaction)
3890 {
3891 	void* block;
3892 	if (block_cache_get_writable_etc(_cache, blockNumber,
3893 			transaction, &block) == B_OK)
3894 		return block;
3895 
3896 	return NULL;
3897 }
3898 
3899 
3900 void*
3901 block_cache_get_empty(void* _cache, off_t blockNumber, int32 transaction)
3902 {
3903 	block_cache* cache = (block_cache*)_cache;
3904 	MutexLocker locker(&cache->lock);
3905 
3906 	TRACE(("block_cache_get_empty(block = %" B_PRIdOFF ", transaction = %" B_PRId32 ")\n",
3907 		blockNumber, transaction));
3908 	if (cache->read_only)
3909 		panic("tried to get empty writable block on a read-only cache!");
3910 
3911 	void* block;
3912 	if (get_writable_cached_block((block_cache*)_cache, blockNumber,
3913 			transaction, true, &block) == B_OK)
3914 		return block;
3915 
3916 	return NULL;
3917 }
3918 
3919 
3920 status_t
3921 block_cache_get_etc(void* _cache, off_t blockNumber, const void** _block)
3922 {
3923 	block_cache* cache = (block_cache*)_cache;
3924 	MutexLocker locker(&cache->lock);
3925 	bool allocated;
3926 
3927 	cached_block* block;
3928 	status_t status = get_cached_block(cache, blockNumber, &allocated, true,
3929 		&block);
3930 	if (status != B_OK)
3931 		return status;
3932 
3933 #if BLOCK_CACHE_DEBUG_CHANGED
3934 	if (block->compare == NULL)
3935 		block->compare = cache->Allocate();
3936 	if (block->compare != NULL)
3937 		memcpy(block->compare, block->current_data, cache->block_size);
3938 #endif
3939 	TB(Get(cache, block));
3940 
3941 	*_block = block->current_data;
3942 	return B_OK;
3943 }
3944 
3945 
3946 const void*
3947 block_cache_get(void* _cache, off_t blockNumber)
3948 {
3949 	const void* block;
3950 	if (block_cache_get_etc(_cache, blockNumber, &block) == B_OK)
3951 		return block;
3952 
3953 	return NULL;
3954 }
3955 
3956 
3957 /*!	Changes the internal status of a writable block to \a dirty. This can be
3958 	helpful in case you realize you don't need to change that block anymore
3959 	for whatever reason.
3960 
3961 	Note, you must only use this function on blocks that were acquired
3962 	writable!
3963 */
3964 status_t
3965 block_cache_set_dirty(void* _cache, off_t blockNumber, bool dirty,
3966 	int32 transaction)
3967 {
3968 	block_cache* cache = (block_cache*)_cache;
3969 	MutexLocker locker(&cache->lock);
3970 
3971 	cached_block* block = cache->hash->Lookup(blockNumber);
3972 	if (block == NULL)
3973 		return B_BAD_VALUE;
3974 	if (block->is_dirty == dirty) {
3975 		// there is nothing to do for us
3976 		return B_OK;
3977 	}
3978 
3979 	// TODO: not yet implemented
3980 	if (dirty)
3981 		panic("block_cache_set_dirty(): not yet implemented that way!\n");
3982 
3983 	return B_OK;
3984 }
3985 
3986 
3987 void
3988 block_cache_put(void* _cache, off_t blockNumber)
3989 {
3990 	block_cache* cache = (block_cache*)_cache;
3991 	MutexLocker locker(&cache->lock);
3992 
3993 	put_cached_block(cache, blockNumber);
3994 }
3995 
3996 
3997 /*! Allocates blocks and schedules them to be read from disk, but does not get references to the
3998 	blocks.
3999 	@param blockNumber The index of the first requested block.
4000 	@param _numBlocks As input, the number of blocks requested. As output, the number of
4001 	blocks actually scheduled.  Prefetching will stop short if the requested range includes a
4002 	block that is already cached.
4003 */
4004 status_t
4005 block_cache_prefetch(void* _cache, off_t blockNumber, size_t* _numBlocks)
4006 {
4007 #ifndef BUILDING_USERLAND_FS_SERVER
4008 	TRACE(("block_cache_prefetch: fetching %" B_PRIuSIZE " blocks starting with %" B_PRIdOFF "\n",
4009 		*_numBlocks, blockNumber));
4010 
4011 	block_cache* cache = reinterpret_cast<block_cache*>(_cache);
4012 	MutexLocker locker(&cache->lock);
4013 
4014 	size_t numBlocks = *_numBlocks;
4015 	*_numBlocks = 0;
4016 
4017 	BlockPrefetcher* blockPrefetcher = new BlockPrefetcher(cache, blockNumber, numBlocks);
4018 
4019 	status_t status = blockPrefetcher->Allocate();
4020 	if (status != B_OK || blockPrefetcher->NumAllocated() == 0) {
4021 		TRACE(("block_cache_prefetch returning early (%s): allocated %" B_PRIuSIZE "\n",
4022 			strerror(status), blockPrefetcher->NumAllocated()));
4023 		delete blockPrefetcher;
4024 		return status;
4025 	}
4026 
4027 	numBlocks = blockPrefetcher->NumAllocated();
4028 
4029 	status = blockPrefetcher->ReadAsync();
4030 
4031 	if (status == B_OK)
4032 		*_numBlocks = numBlocks;
4033 
4034 	return status;
4035 
4036 #else // BUILDING_USERLAND_FS_SERVER
4037 	*_numBlocks = 0;
4038 	return B_UNSUPPORTED;
4039 #endif // !BUILDING_USERLAND_FS_SERVER
4040 }
4041