xref: /haiku/src/system/kernel/cache/block_cache.cpp (revision 020cbad9d40235a2c50a81a42d69912a5ff8fbc4)
1 /*
2  * Copyright 2004-2008, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include "block_cache_private.h"
8 
9 #include <unistd.h>
10 #include <signal.h>
11 #include <stdlib.h>
12 #include <string.h>
13 #include <errno.h>
14 
15 #include <KernelExport.h>
16 #include <fs_cache.h>
17 
18 #include <block_cache.h>
19 #include <lock.h>
20 #include <vm_low_memory.h>
21 #include <tracing.h>
22 #include <util/kernel_cpp.h>
23 #include <util/DoublyLinkedList.h>
24 #include <util/AutoLock.h>
25 #include <util/khash.h>
26 
27 
28 // TODO: this is a naive but growing implementation to test the API:
29 //	1) block reading/writing is not at all optimized for speed, it will
30 //	   just read and write single blocks.
31 //	2) the locking could be improved; getting a block should not need to
32 //	   wait for blocks to be written
33 //	3) dirty blocks are only written back if asked for
34 // TODO: the retrieval/copy of the original data could be delayed until the
35 //		new data must be written, ie. in low memory situations.
36 
37 //#define TRACE_BLOCK_CACHE
38 #ifdef TRACE_BLOCK_CACHE
39 #	define TRACE(x)	dprintf x
40 #else
41 #	define TRACE(x) ;
42 #endif
43 
44 #define DEBUG_BLOCK_CACHE
45 //#define DEBUG_CHANGED
46 
47 // This macro is used for fatal situations that are acceptable in a running
48 // system, like out of memory situations - should only panic for debugging.
49 #define FATAL(x) panic x
50 
51 
52 struct cache_hook : DoublyLinkedListLinkImpl<cache_hook> {
53 	transaction_notification_hook	hook;
54 	void							*data;
55 };
56 
57 typedef DoublyLinkedList<cache_hook> HookList;
58 
59 struct cache_transaction {
60 	cache_transaction();
61 
62 	cache_transaction *next;
63 	int32			id;
64 	int32			num_blocks;
65 	int32			main_num_blocks;
66 	int32			sub_num_blocks;
67 	cached_block	*first_block;
68 	block_list		blocks;
69 	transaction_notification_hook notification_hook;
70 	void			*notification_data;
71 	HookList		listeners;
72 	bool			open;
73 	bool			has_sub_transaction;
74 };
75 
76 #ifdef BLOCK_CACHE_TRANSACTION_TRACING
77 namespace TransactionTracing {
78 
79 class Action : public AbstractTraceEntry {
80 	public:
81 		Action(const char *label, block_cache *cache,
82 				cache_transaction *transaction)
83 			:
84 			fCache(cache),
85 			fTransaction(transaction),
86 			fID(transaction->id),
87 			fSub(transaction->has_sub_transaction),
88 			fNumBlocks(transaction->num_blocks),
89 			fSubNumBlocks(transaction->sub_num_blocks)
90 		{
91 			strlcpy(fLabel, label, sizeof(label));
92 			Initialized();
93 		}
94 
95 		virtual void AddDump(TraceOutput& out)
96 		{
97 			out.Print("cache %p, %s transaction %p (id %ld)%s"
98 				", %ld/%ld blocks", fCache, fLabel, fTransaction, fID,
99 				fSub ? " sub" : "", fNumBlocks, fSubNumBlocks);
100 		}
101 
102 	private:
103 		char				fLabel[12];
104 		block_cache			*fCache;
105 		cache_transaction	*fTransaction;
106 		int32				fID;
107 		bool				fSub;
108 		int32				fNumBlocks;
109 		int32				fSubNumBlocks;
110 };
111 
112 class Detach : public AbstractTraceEntry {
113 	public:
114 		Detach(block_cache *cache, cache_transaction *transaction,
115 				cache_transaction *newTransaction)
116 			:
117 			fCache(cache),
118 			fTransaction(transaction),
119 			fID(transaction->id),
120 			fSub(transaction->has_sub_transaction),
121 			fNewTransaction(newTransaction),
122 			fNewID(newTransaction->id)
123 		{
124 			Initialized();
125 		}
126 
127 		virtual void AddDump(TraceOutput& out)
128 		{
129 			out.Print("cache %p, detach transaction %p (id %ld)"
130 				"from transaction %p (id %ld)%s",
131 				fCache, fNewTransaction, fNewID, fTransaction, fID,
132 				fSub ? " sub" : "");
133 		}
134 
135 	private:
136 		block_cache			*fCache;
137 		cache_transaction	*fTransaction;
138 		int32				fID;
139 		bool				fSub;
140 		cache_transaction	*fNewTransaction;
141 		int32				fNewID;
142 };
143 
144 class Abort : public AbstractTraceEntry {
145 	public:
146 		Abort(block_cache *cache, cache_transaction *transaction)
147 			:
148 			fCache(cache),
149 			fTransaction(transaction),
150 			fID(transaction->id),
151 			fNumBlocks(0)
152 		{
153 			bool isSub = transaction->has_sub_transaction;
154 			fNumBlocks = isSub ? transaction->sub_num_blocks
155 				: transaction->num_blocks;
156 			fBlocks = (off_t *)alloc_tracing_buffer(fNumBlocks * sizeof(off_t));
157 			if (fBlocks != NULL) {
158 				cached_block *block = transaction->first_block;
159 				for (int32 i = 0; block != NULL && i < fNumBlocks;
160 						block = block->transaction_next) {
161 					fBlocks[i++] = block->block_number;
162 				}
163 			} else
164 				fNumBlocks = 0;
165 			Initialized();
166 		}
167 
168 		virtual void AddDump(TraceOutput& out)
169 		{
170 			out.Print("cache %p, abort transaction "
171 				"%p (id %ld), blocks", fCache, fTransaction, fID);
172 			for (int32 i = 0; i < fNumBlocks && !out.IsFull(); i++)
173 				out.Print(" %Ld", fBlocks[i]);
174 		}
175 
176 	private:
177 		block_cache			*fCache;
178 		cache_transaction	*fTransaction;
179 		int32				fID;
180 		off_t				*fBlocks;
181 		int32				fNumBlocks;
182 };
183 
184 }	// namespace TransactionTracing
185 
186 #	define T(x) new(std::nothrow) TransactionTracing::x;
187 #else
188 #	define T(x) ;
189 #endif
190 
191 
192 static status_t write_cached_block(block_cache *cache, cached_block *block,
193 	bool deleteTransaction = true);
194 
195 
196 static DoublyLinkedList<block_cache> sCaches;
197 static mutex sCachesLock;
198 static DoublyLinkedListLink<block_cache> sMarkCache;
199 	// TODO: this only works if the link is the first entry of block_cache
200 static object_cache *sBlockCache;
201 
202 
203 //	#pragma mark - private transaction
204 
205 
206 cache_transaction::cache_transaction()
207 {
208 	num_blocks = 0;
209 	main_num_blocks = 0;
210 	sub_num_blocks = 0;
211 	first_block = NULL;
212 	notification_hook = NULL;
213 	notification_data = NULL;
214 	open = true;
215 }
216 
217 
218 static int
219 transaction_compare(void *_transaction, const void *_id)
220 {
221 	cache_transaction *transaction = (cache_transaction *)_transaction;
222 	const int32 *id = (const int32 *)_id;
223 
224 	return transaction->id - *id;
225 }
226 
227 
228 static uint32
229 transaction_hash(void *_transaction, const void *_id, uint32 range)
230 {
231 	cache_transaction *transaction = (cache_transaction *)_transaction;
232 	const int32 *id = (const int32 *)_id;
233 
234 	if (transaction != NULL)
235 		return transaction->id % range;
236 
237 	return (uint32)*id % range;
238 }
239 
240 
241 /*!	Notifies all listeners of this transaction, and removes them
242 	afterwards.
243 */
244 static void
245 notify_transaction_listeners(cache_transaction *transaction, int32 event)
246 {
247 	HookList::Iterator iterator = transaction->listeners.GetIterator();
248 	while (iterator.HasNext()) {
249 		cache_hook *hook = iterator.Next();
250 
251 		hook->hook(transaction->id, event, hook->data);
252 
253 		iterator.Remove();
254 		delete hook;
255 	}
256 }
257 
258 
259 static void
260 delete_transaction(block_cache *cache, cache_transaction *transaction)
261 {
262 	if (cache->last_transaction == transaction)
263 		cache->last_transaction = NULL;
264 
265 	delete transaction;
266 }
267 
268 
269 static cache_transaction *
270 lookup_transaction(block_cache *cache, int32 id)
271 {
272 	return (cache_transaction *)hash_lookup(cache->transaction_hash, &id);
273 }
274 
275 
276 //	#pragma mark - cached_block
277 
278 
279 int
280 compare_blocks(const void *_blockA, const void *_blockB)
281 {
282 	cached_block *blockA = *(cached_block **)_blockA;
283 	cached_block *blockB = *(cached_block **)_blockB;
284 
285 	off_t diff = blockA->block_number - blockB->block_number;
286 	if (diff > 0)
287 		return 1;
288 
289 	return diff < 0 ? -1 : 0;
290 }
291 
292 
293 /*static*/ int
294 cached_block::Compare(void *_cacheEntry, const void *_block)
295 {
296 	cached_block *cacheEntry = (cached_block *)_cacheEntry;
297 	const off_t *block = (const off_t *)_block;
298 
299 	return cacheEntry->block_number - *block;
300 }
301 
302 
303 
304 /*static*/ uint32
305 cached_block::Hash(void *_cacheEntry, const void *_block, uint32 range)
306 {
307 	cached_block *cacheEntry = (cached_block *)_cacheEntry;
308 	const off_t *block = (const off_t *)_block;
309 
310 	if (cacheEntry != NULL)
311 		return cacheEntry->block_number % range;
312 
313 	return (uint64)*block % range;
314 }
315 
316 
317 //	#pragma mark - block_cache
318 
319 
320 block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize,
321 		bool readOnly)
322 	:
323 	hash(NULL),
324 	fd(_fd),
325 	max_blocks(numBlocks),
326 	block_size(blockSize),
327 	next_transaction_id(1),
328 	last_transaction(NULL),
329 	transaction_hash(NULL),
330 	num_dirty_blocks(0),
331 	read_only(readOnly)
332 {
333 	mutex_lock(&sCachesLock);
334 	sCaches.Add(this);
335 	mutex_unlock(&sCachesLock);
336 
337 	buffer_cache = create_object_cache_etc("block cache buffers", blockSize,
338 		8, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
339 	if (buffer_cache == NULL)
340 		return;
341 
342 	hash = hash_init(1024, offsetof(cached_block, next), &cached_block::Compare,
343 		&cached_block::Hash);
344 	if (hash == NULL)
345 		return;
346 
347 	transaction_hash = hash_init(16, offsetof(cache_transaction, next),
348 		&transaction_compare, &::transaction_hash);
349 	if (transaction_hash == NULL)
350 		return;
351 
352 	if (benaphore_init(&lock, "block cache") < B_OK)
353 		return;
354 
355 	register_low_memory_handler(&block_cache::LowMemoryHandler, this, 0);
356 }
357 
358 
359 block_cache::~block_cache()
360 {
361 	mutex_lock(&sCachesLock);
362 	sCaches.Remove(this);
363 	mutex_unlock(&sCachesLock);
364 
365 	unregister_low_memory_handler(&block_cache::LowMemoryHandler, this);
366 
367 	benaphore_destroy(&lock);
368 
369 	hash_uninit(transaction_hash);
370 	hash_uninit(hash);
371 
372 	delete_object_cache(buffer_cache);
373 }
374 
375 
376 status_t
377 block_cache::InitCheck()
378 {
379 	if (lock.sem < B_OK)
380 		return lock.sem;
381 
382 	if (buffer_cache == NULL || hash == NULL || transaction_hash == NULL)
383 		return B_NO_MEMORY;
384 
385 	return B_OK;
386 }
387 
388 
389 void
390 block_cache::Free(void *buffer)
391 {
392 	if (buffer != NULL)
393 		object_cache_free(buffer_cache, buffer);
394 }
395 
396 
397 void *
398 block_cache::Allocate()
399 {
400 	return object_cache_alloc(buffer_cache, 0);
401 }
402 
403 
404 void
405 block_cache::FreeBlock(cached_block *block)
406 {
407 	Free(block->current_data);
408 
409 	if (block->original_data != NULL || block->parent_data != NULL) {
410 		panic("block_cache::FreeBlock(): %Ld, original %p, parent %p\n",
411 			block->block_number, block->original_data, block->parent_data);
412 	}
413 
414 #ifdef DEBUG_CHANGED
415 	Free(block->compare);
416 #endif
417 
418 	object_cache_free(sBlockCache, block);
419 }
420 
421 
422 cached_block *
423 block_cache::_GetUnusedBlock()
424 {
425 	TRACE(("block_cache: get unused block\n"));
426 
427 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
428 			cached_block *block = iterator.Next();) {
429 		// this can only happen if no transactions are used
430 		if (block->is_dirty)
431 			write_cached_block(this, block, false);
432 
433 		// remove block from lists
434 		iterator.Remove();
435 		hash_remove(hash, block);
436 
437 		// TODO: see if parent/compare data is handled correctly here!
438 		if (block->parent_data != NULL
439 			&& block->parent_data != block->original_data)
440 			Free(block->parent_data);
441 		if (block->original_data != NULL)
442 			Free(block->original_data);
443 
444 		return block;
445 	}
446 
447 	return NULL;
448 }
449 
450 
451 /*! Allocates a new block for \a blockNumber, ready for use */
452 cached_block *
453 block_cache::NewBlock(off_t blockNumber)
454 {
455 	cached_block *block = (cached_block *)object_cache_alloc(sBlockCache, 0);
456 	if (block == NULL) {
457 		dprintf("block allocation failed, unused list is %sempty.\n",
458 			unused_blocks.IsEmpty() ? "" : "not ");
459 
460 		// allocation failed, try to reuse an unused block
461 		block = _GetUnusedBlock();
462 		if (block == NULL) {
463 			FATAL(("could not allocate block!\n"));
464 			return NULL;
465 		}
466 	}
467 
468 	block->current_data = Allocate();
469 	if (block->current_data == NULL) {
470 		object_cache_free(sBlockCache, block);
471 		return NULL;
472 	}
473 
474 	block->block_number = blockNumber;
475 	block->ref_count = 0;
476 	block->accessed = 0;
477 	block->transaction_next = NULL;
478 	block->transaction = block->previous_transaction = NULL;
479 	block->original_data = NULL;
480 	block->parent_data = NULL;
481 	block->is_dirty = false;
482 	block->unused = false;
483 #ifdef DEBUG_CHANGED
484 	block->compare = NULL;
485 #endif
486 
487 	return block;
488 }
489 
490 
491 void
492 block_cache::RemoveUnusedBlocks(int32 maxAccessed, int32 count)
493 {
494 	TRACE(("block_cache: remove up to %ld unused blocks\n", count));
495 
496 	for (block_list::Iterator iterator = unused_blocks.GetIterator();
497 			cached_block *block = iterator.Next();) {
498 		if (maxAccessed < block->accessed)
499 			continue;
500 
501 		TRACE(("  remove block %Ld, accessed %ld times\n",
502 			block->block_number, block->accessed));
503 
504 		// this can only happen if no transactions are used
505 		if (block->is_dirty)
506 			write_cached_block(this, block, false);
507 
508 		// remove block from lists
509 		iterator.Remove();
510 		hash_remove(hash, block);
511 
512 		FreeBlock(block);
513 
514 		if (--count <= 0)
515 			break;
516 	}
517 }
518 
519 
520 void
521 block_cache::LowMemoryHandler(void *data, int32 level)
522 {
523 	block_cache *cache = (block_cache *)data;
524 	BenaphoreLocker locker(&cache->lock);
525 
526 	if (!locker.IsLocked()) {
527 		// If our block_cache were deleted, it could be that we had
528 		// been called before that deletion went through, therefore,
529 		// acquiring its lock might fail.
530 		return;
531 	}
532 
533 	TRACE(("block_cache: low memory handler called with level %ld\n", level));
534 
535 	// free some blocks according to the low memory state
536 	// (if there is enough memory left, we don't free any)
537 
538 	int32 free = 1;
539 	int32 accessed = 1;
540 	switch (vm_low_memory_state()) {
541 		case B_NO_LOW_MEMORY:
542 			return;
543 		case B_LOW_MEMORY_NOTE:
544 			free = 50;
545 			accessed = 2;
546 			break;
547 		case B_LOW_MEMORY_WARNING:
548 			free = 200;
549 			accessed = 10;
550 			break;
551 		case B_LOW_MEMORY_CRITICAL:
552 			free = LONG_MAX;
553 			accessed = LONG_MAX;
554 			break;
555 	}
556 
557 	cache->RemoveUnusedBlocks(accessed, free);
558 }
559 
560 
561 //	#pragma mark - private block functions
562 
563 
564 static void
565 put_cached_block(block_cache *cache, cached_block *block)
566 {
567 #ifdef DEBUG_CHANGED
568 	if (!block->is_dirty && block->compare != NULL
569 		&& memcmp(block->current_data, block->compare, cache->block_size)) {
570 		dprintf("new block:\n");
571 		dump_block((const char *)block->current_data, 256, "  ");
572 		dprintf("unchanged block:\n");
573 		dump_block((const char *)block->compare, 256, "  ");
574 		write_cached_block(cache, block);
575 		panic("block_cache: supposed to be clean block was changed!\n");
576 
577 		cache->Free(block->compare);
578 		block->compare = NULL;
579 	}
580 #endif
581 
582 	if (block->ref_count < 1) {
583 		panic("Invalid ref_count for block %p, cache %p\n", block, cache);
584 		return;
585 	}
586 
587 	if (--block->ref_count == 0
588 		&& block->transaction == NULL
589 		&& block->previous_transaction == NULL) {
590 		// put this block in the list of unused blocks
591 		block->unused = true;
592 if (block->original_data != NULL || block->parent_data != NULL)
593 	panic("put_cached_block(): %p (%Ld): %p, %p\n", block, block->block_number, block->original_data, block->parent_data);
594 		cache->unused_blocks.Add(block);
595 //		block->current_data = cache->allocator->Release(block->current_data);
596 	}
597 
598 	// free some blocks according to the low memory state
599 	// (if there is enough memory left, we don't free any)
600 
601 	int32 free = 1;
602 	switch (vm_low_memory_state()) {
603 		case B_NO_LOW_MEMORY:
604 			return;
605 		case B_LOW_MEMORY_NOTE:
606 			free = 1;
607 			break;
608 		case B_LOW_MEMORY_WARNING:
609 			free = 5;
610 			break;
611 		case B_LOW_MEMORY_CRITICAL:
612 			free = 20;
613 			break;
614 	}
615 
616 	cache->RemoveUnusedBlocks(LONG_MAX, free);
617 }
618 
619 
620 static void
621 put_cached_block(block_cache *cache, off_t blockNumber)
622 {
623 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
624 		panic("put_cached_block: invalid block number %lld (max %lld)",
625 			blockNumber, cache->max_blocks - 1);
626 	}
627 
628 	cached_block *block = (cached_block *)hash_lookup(cache->hash, &blockNumber);
629 	if (block != NULL)
630 		put_cached_block(cache, block);
631 }
632 
633 
634 /*!
635 	Retrieves the block \a blockNumber from the hash table, if it's already
636 	there, or reads it from the disk.
637 
638 	\param _allocated tells you wether or not a new block has been allocated
639 		to satisfy your request.
640 	\param readBlock if \c false, the block will not be read in case it was
641 		not already in the cache. The block you retrieve may contain random
642 		data.
643 */
644 static cached_block *
645 get_cached_block(block_cache *cache, off_t blockNumber, bool *_allocated,
646 	bool readBlock = true)
647 {
648 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
649 		panic("get_cached_block: invalid block number %lld (max %lld)",
650 			blockNumber, cache->max_blocks - 1);
651 		return NULL;
652 	}
653 
654 	cached_block *block = (cached_block *)hash_lookup(cache->hash,
655 		&blockNumber);
656 	*_allocated = false;
657 
658 	if (block == NULL) {
659 		// read block into cache
660 		block = cache->NewBlock(blockNumber);
661 		if (block == NULL)
662 			return NULL;
663 
664 		hash_insert_grow(cache->hash, block);
665 		*_allocated = true;
666 	}
667 
668 	if (*_allocated && readBlock) {
669 		int32 blockSize = cache->block_size;
670 
671 		ssize_t bytesRead = read_pos(cache->fd, blockNumber * blockSize,
672 			block->current_data, blockSize);
673 		if (bytesRead < blockSize) {
674 			hash_remove(cache->hash, block);
675 			cache->FreeBlock(block);
676 			FATAL(("could not read block %Ld: bytesRead: %ld, error: %s\n",
677 				blockNumber, bytesRead, strerror(errno)));
678 			return NULL;
679 		}
680 	}
681 
682 	if (block->unused) {
683 		//TRACE(("remove block %Ld from unused\n", blockNumber));
684 		block->unused = false;
685 		cache->unused_blocks.Remove(block);
686 	}
687 
688 	block->ref_count++;
689 	block->accessed++;
690 
691 	return block;
692 }
693 
694 
695 /*!
696 	Returns the writable block data for the requested blockNumber.
697 	If \a cleared is true, the block is not read from disk; an empty block
698 	is returned.
699 
700 	This is the only method to insert a block into a transaction. It makes
701 	sure that the previous block contents are preserved in that case.
702 */
703 static void *
704 get_writable_cached_block(block_cache *cache, off_t blockNumber, off_t base,
705 	off_t length, int32 transactionID, bool cleared)
706 {
707 	TRACE(("get_writable_cached_block(blockNumber = %Ld, transaction = %ld)\n",
708 		blockNumber, transactionID));
709 
710 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
711 		panic("get_writable_cached_block: invalid block number %lld (max %lld)",
712 			blockNumber, cache->max_blocks - 1);
713 	}
714 
715 	bool allocated;
716 	cached_block *block = get_cached_block(cache, blockNumber, &allocated,
717 		!cleared);
718 	if (block == NULL)
719 		return NULL;
720 
721 	// if there is no transaction support, we just return the current block
722 	if (transactionID == -1) {
723 		if (cleared)
724 			memset(block->current_data, 0, cache->block_size);
725 
726 		if (!block->is_dirty)
727 			cache->num_dirty_blocks++;
728 
729 		block->is_dirty = true;
730 			// mark the block as dirty
731 
732 		return block->current_data;
733 	}
734 
735 	cache_transaction *transaction = block->transaction;
736 
737 	if (transaction != NULL && transaction->id != transactionID) {
738 		// ToDo: we have to wait here until the other transaction is done.
739 		//	Maybe we should even panic, since we can't prevent any deadlocks.
740 		panic("get_writable_cached_block(): asked to get busy writable block (transaction %ld)\n", block->transaction->id);
741 		put_cached_block(cache, block);
742 		return NULL;
743 	}
744 	if (transaction == NULL && transactionID != -1) {
745 		// get new transaction
746 		transaction = lookup_transaction(cache, transactionID);
747 		if (transaction == NULL) {
748 			panic("get_writable_cached_block(): invalid transaction %ld!\n",
749 				transactionID);
750 			put_cached_block(cache, block);
751 			return NULL;
752 		}
753 		if (!transaction->open) {
754 			panic("get_writable_cached_block(): transaction already done!\n");
755 			put_cached_block(cache, block);
756 			return NULL;
757 		}
758 
759 		block->transaction = transaction;
760 
761 		// attach the block to the transaction block list
762 		block->transaction_next = transaction->first_block;
763 		transaction->first_block = block;
764 		transaction->num_blocks++;
765 	}
766 
767 	bool wasUnchanged = block->original_data == NULL
768 		|| block->previous_transaction != NULL;
769 
770 	if (!(allocated && cleared) && block->original_data == NULL) {
771 		// we already have data, so we need to preserve it
772 		block->original_data = cache->Allocate();
773 		if (block->original_data == NULL) {
774 			FATAL(("could not allocate original_data\n"));
775 			put_cached_block(cache, block);
776 			return NULL;
777 		}
778 
779 		memcpy(block->original_data, block->current_data, cache->block_size);
780 	}
781 	if (block->parent_data == block->current_data) {
782 		// remember any previous contents for the parent transaction
783 		block->parent_data = cache->Allocate();
784 		if (block->parent_data == NULL) {
785 			// TODO: maybe we should just continue the current transaction in this case...
786 			FATAL(("could not allocate parent\n"));
787 			put_cached_block(cache, block);
788 			return NULL;
789 		}
790 
791 		memcpy(block->parent_data, block->current_data, cache->block_size);
792 		transaction->sub_num_blocks++;
793 	} else if (transaction != NULL && transaction->has_sub_transaction
794 		&& block->parent_data == NULL && wasUnchanged)
795 		transaction->sub_num_blocks++;
796 
797 	if (cleared)
798 		memset(block->current_data, 0, cache->block_size);
799 
800 	block->is_dirty = true;
801 
802 	return block->current_data;
803 }
804 
805 
806 static status_t
807 write_cached_block(block_cache *cache, cached_block *block,
808 	bool deleteTransaction)
809 {
810 	cache_transaction *previous = block->previous_transaction;
811 	int32 blockSize = cache->block_size;
812 
813 	void *data = previous && block->original_data
814 		? block->original_data : block->current_data;
815 		// we first need to write back changes from previous transactions
816 
817 	TRACE(("write_cached_block(block %Ld)\n", block->block_number));
818 
819 	ssize_t written = write_pos(cache->fd, block->block_number * blockSize,
820 		data, blockSize);
821 
822 	if (written < blockSize) {
823 		FATAL(("could not write back block %Ld (%s)\n", block->block_number,
824 			strerror(errno)));
825 		return B_IO_ERROR;
826 	}
827 
828 	if (cache->num_dirty_blocks > 0)
829 		cache->num_dirty_blocks--;
830 
831 	if (data == block->current_data)
832 		block->is_dirty = false;
833 
834 	if (previous != NULL) {
835 		previous->blocks.Remove(block);
836 		block->previous_transaction = NULL;
837 
838 		if (block->original_data != NULL && block->transaction == NULL) {
839 			// This block is not part of a transaction, so it does not need
840 			// its original pointer anymore.
841 			cache->Free(block->original_data);
842 			block->original_data = NULL;
843 		}
844 
845 		// Has the previous transation been finished with that write?
846 		if (--previous->num_blocks == 0) {
847 			TRACE(("cache transaction %ld finished!\n", previous->id));
848 
849 			if (previous->notification_hook != NULL) {
850 				previous->notification_hook(previous->id, TRANSACTION_WRITTEN,
851 					previous->notification_data);
852 			}
853 
854 			if (deleteTransaction) {
855 				hash_remove(cache->transaction_hash, previous);
856 				delete_transaction(cache, previous);
857 			}
858 		}
859 	}
860 
861 	return B_OK;
862 }
863 
864 
865 #ifdef DEBUG_BLOCK_CACHE
866 static void
867 dump_block(cached_block *block)
868 {
869 	kprintf("%08lx %9Ld %08lx %08lx %08lx %5ld %6ld %c%c%c%c  %08lx "
870 		"%08lx\n", (addr_t)block, block->block_number,
871 		(addr_t)block->current_data, (addr_t)block->original_data,
872 		(addr_t)block->parent_data, block->ref_count, block->accessed,
873 		block->busy ? 'B' : '-', block->is_writing ? 'W' : '-',
874 		block->is_dirty ? 'B' : '-', block->unused ? 'U' : '-',
875 		(addr_t)block->transaction,
876 		(addr_t)block->previous_transaction);
877 }
878 
879 
880 static int
881 dump_cache(int argc, char **argv)
882 {
883 	bool showTransactions = false;
884 	bool showBlocks = false;
885 	int32 i = 1;
886 	while (argv[i] != NULL && argv[i][0] == '-') {
887 		for (char *arg = &argv[i][1]; arg[0]; arg++) {
888 			switch (arg[0]) {
889 				case 'b':
890 					showBlocks = true;
891 					break;
892 				case 't':
893 					showTransactions = true;
894 					break;
895 				default:
896 					print_debugger_command_usage(argv[0]);
897 					return 0;
898 			}
899 		}
900 		i++;
901 	}
902 
903 	if (i >= argc) {
904 		print_debugger_command_usage(argv[0]);
905 		return 0;
906 	}
907 
908 	block_cache *cache = (struct block_cache *)parse_expression(argv[i]);
909 	if (cache == NULL) {
910 		kprintf("invalid cache address\n");
911 		return 0;
912 	}
913 
914 	off_t blockNumber = -1;
915 	if (i + 1 < argc) {
916 		blockNumber = strtoll(argv[i + 1], NULL, 0);
917 		cached_block *block = (cached_block *)hash_lookup(cache->hash,
918 			&blockNumber);
919 		if (cache != NULL) {
920 			kprintf("BLOCK %p\n", block);
921 			kprintf(" current data:  %p\n", block->current_data);
922 			kprintf(" original data: %p\n", block->original_data);
923 			kprintf(" parent data:   %p\n", block->parent_data);
924 			kprintf(" ref_count:     %ld\n", block->ref_count);
925 			kprintf(" accessed:      %ld\n", block->accessed);
926 			kprintf(" flags:        ");
927 			if (block->is_writing)
928 				kprintf(" is-writing");
929 			if (block->is_dirty)
930 				kprintf(" is-dirty");
931 			if (block->unused)
932 				kprintf(" unused");
933 			kprintf("\n");
934 			if (block->transaction != NULL) {
935 				kprintf(" transaction:   %p (%ld)\n", block->transaction,
936 					block->transaction->id);
937 				if (block->transaction_next != NULL) {
938 					kprintf(" next in transaction: %Ld\n",
939 						block->transaction_next->block_number);
940 				}
941 			}
942 			if (block->previous_transaction != NULL) {
943 				kprintf(" previous transaction: %p (%ld)\n",
944 					block->previous_transaction,
945 					block->previous_transaction->id);
946 			}
947 		} else
948 			kprintf("block %Ld not found\n", blockNumber);
949 		return 0;
950 	}
951 
952 	kprintf("BLOCK CACHE: %p\n", cache);
953 
954 	kprintf(" fd:         %d\n", cache->fd);
955 	kprintf(" max_blocks: %Ld\n", cache->max_blocks);
956 	kprintf(" block_size: %lu\n", cache->block_size);
957 	kprintf(" next_transaction_id: %ld\n", cache->next_transaction_id);
958 
959 	if (showTransactions) {
960 		kprintf(" transactions:\n");
961 		kprintf("address       id state  blocks  main   sub\n");
962 
963 		hash_iterator iterator;
964 		hash_open(cache->transaction_hash, &iterator);
965 
966 		cache_transaction *transaction;
967 		while ((transaction = (cache_transaction *)hash_next(
968 				cache->transaction_hash, &iterator)) != NULL) {
969 			kprintf("%p %5ld %-7s %5ld %5ld %5ld\n", transaction,
970 				transaction->id, transaction->open ? "open" : "closed",
971 				transaction->num_blocks, transaction->main_num_blocks,
972 				transaction->sub_num_blocks);
973 		}
974 	}
975 
976 	if (showBlocks) {
977 		kprintf(" blocks:\n");
978 		kprintf("address  block no. current  original parent    refs access "
979 			"flags transact prev. trans\n");
980 	}
981 
982 	uint32 referenced = 0;
983 	uint32 count = 0;
984 	uint32 dirty = 0;
985 	hash_iterator iterator;
986 	hash_open(cache->hash, &iterator);
987 	cached_block *block;
988 	while ((block = (cached_block *)hash_next(cache->hash, &iterator)) != NULL) {
989 		if (showBlocks)
990 			dump_block(block);
991 
992 		if (block->is_dirty)
993 			dirty++;
994 		if (block->ref_count)
995 			referenced++;
996 		count++;
997 	}
998 
999 	kprintf(" %ld blocks total, %ld dirty, %ld referenced.\n", count, dirty,
1000 		referenced);
1001 
1002 	hash_close(cache->hash, &iterator, false);
1003 	return 0;
1004 }
1005 
1006 
1007 static int
1008 dump_transaction(int argc, char **argv)
1009 {
1010 	bool showBlocks = false;
1011 	int i = 1;
1012 	if (argc > 1 && !strcmp(argv[1], "-b")) {
1013 		showBlocks = true;
1014 		i++;
1015 	}
1016 
1017 	if (argc - i < 1 || argc - i > 2) {
1018 		print_debugger_command_usage(argv[0]);
1019 		return 0;
1020 	}
1021 
1022 	cache_transaction *transaction = NULL;
1023 
1024 	if (argc - i == 1) {
1025 		transaction = (cache_transaction *)parse_expression(argv[i]);
1026 	} else {
1027 		block_cache *cache = (block_cache *)parse_expression(argv[i]);
1028 		int32 id = parse_expression(argv[i + 1]);
1029 		transaction = lookup_transaction(cache, id);
1030 		if (transaction == NULL) {
1031 			kprintf("No transaction with ID %ld found.\n", id);
1032 			return 0;
1033 		}
1034 	}
1035 
1036 	kprintf("TRANSACTION %p\n", transaction);
1037 
1038 	kprintf(" id:             %ld\n", transaction->id);
1039 	kprintf(" num block:      %ld\n", transaction->num_blocks);
1040 	kprintf(" main num block: %ld\n", transaction->main_num_blocks);
1041 	kprintf(" sub num block:  %ld\n", transaction->sub_num_blocks);
1042 	kprintf(" has sub:        %d\n", transaction->has_sub_transaction);
1043 	kprintf(" state:          %s\n", transaction->open ? "open" : "closed");
1044 
1045 	if (!showBlocks)
1046 		return 0;
1047 
1048 	kprintf(" blocks:\n");
1049 	kprintf("address  block no. current  original parent    refs access "
1050 		"flags transact prev. trans\n");
1051 
1052 	cached_block *block = transaction->first_block;
1053 	while (block != NULL) {
1054 		dump_block(block);
1055 		block = block->transaction_next;
1056 	}
1057 
1058 	kprintf("--\n");
1059 
1060 	block_list::Iterator iterator = transaction->blocks.GetIterator();
1061 	while (iterator.HasNext()) {
1062 		block = iterator.Next();
1063 		dump_block(block);
1064 	}
1065 
1066 	return 0;
1067 }
1068 
1069 
1070 static int
1071 dump_caches(int argc, char **argv)
1072 {
1073 	kprintf("Block caches:\n");
1074 	DoublyLinkedList<block_cache>::Iterator i = sCaches.GetIterator();
1075 	while (i.HasNext()) {
1076 		block_cache *cache = i.Next();
1077 		if (cache == (block_cache *)&sMarkCache)
1078 			continue;
1079 
1080 		kprintf("  %p\n", cache);
1081 	}
1082 
1083 	return 0;
1084 }
1085 #endif	// DEBUG_BLOCK_CACHE
1086 
1087 
1088 static block_cache *
1089 get_next_block_cache(block_cache *last)
1090 {
1091 	MutexLocker _(sCachesLock);
1092 	block_cache *cache;
1093 	if (last != NULL) {
1094 		cache = sCaches.GetNext((block_cache *)&sMarkCache);
1095 		sCaches.Remove((block_cache *)&sMarkCache);
1096 	} else
1097 		cache = sCaches.Head();
1098 
1099 	if (cache != NULL)
1100 		sCaches.Insert(sCaches.GetNext(cache), (block_cache *)&sMarkCache);
1101 
1102 	return cache;
1103 }
1104 
1105 
1106 static status_t
1107 block_writer(void *)
1108 {
1109 	while (true) {
1110 		// write 64 blocks of each block_cache every two seconds
1111 		// TODO: change this once we have an I/O scheduler
1112 		snooze(2000000LL);
1113 
1114 		block_cache *cache = NULL;
1115 		while ((cache = get_next_block_cache(cache)) != NULL) {
1116 			BenaphoreLocker locker(&cache->lock);
1117 			const uint32 kMaxCount = 64;
1118 			cached_block *blocks[kMaxCount];
1119 			uint32 count = 0;
1120 
1121 			if (cache->num_dirty_blocks) {
1122 				// This cache is not using transactions, we'll scan the blocks
1123 				// directly
1124 				hash_iterator iterator;
1125 				hash_open(cache->hash, &iterator);
1126 
1127 				cached_block *block;
1128 				while (count < kMaxCount
1129 					&& (block = (cached_block *)hash_next(cache->hash,
1130 							&iterator)) != NULL) {
1131 					if (block->is_dirty)
1132 						blocks[count++] = block;
1133 				}
1134 
1135 				hash_close(cache->hash, &iterator, false);
1136 			} else {
1137 				hash_iterator iterator;
1138 				hash_open(cache->transaction_hash, &iterator);
1139 
1140 				cache_transaction *transaction;
1141 				while ((transaction = (cache_transaction *)hash_next(
1142 						cache->transaction_hash, &iterator)) != NULL
1143 					&& count < kMaxCount) {
1144 					if (transaction->open)
1145 						continue;
1146 
1147 					// sort blocks to speed up writing them back
1148 					// TODO: ideally, this should be handled by the I/O scheduler
1149 					block_list::Iterator iterator
1150 						= transaction->blocks.GetIterator();
1151 
1152 					for (; count < kMaxCount && iterator.HasNext(); count++) {
1153 						blocks[count] = iterator.Next();
1154 					}
1155 				}
1156 
1157 				hash_close(cache->transaction_hash, &iterator, false);
1158 			}
1159 
1160 			qsort(blocks, count, sizeof(void *), &compare_blocks);
1161 
1162 			for (uint32 i = 0; i < count; i++) {
1163 				if (write_cached_block(cache, blocks[i], true) != B_OK)
1164 					break;
1165 			}
1166 		}
1167 	}
1168 }
1169 
1170 
1171 extern "C" status_t
1172 block_cache_init(void)
1173 {
1174 	sBlockCache = create_object_cache_etc("cached blocks", sizeof(cached_block),
1175 		8, 0, CACHE_LARGE_SLAB, NULL, NULL, NULL, NULL);
1176 	if (sBlockCache == NULL)
1177 		return B_ERROR;
1178 
1179 	mutex_init(&sCachesLock, "block caches");
1180 	new (&sCaches) DoublyLinkedList<block_cache>;
1181 		// manually call constructor
1182 
1183 	thread_id thread = spawn_kernel_thread(&block_writer, "block writer",
1184 		B_LOW_PRIORITY, NULL);
1185 	if (thread >= B_OK)
1186 		send_signal_etc(thread, SIGCONT, B_DO_NOT_RESCHEDULE);
1187 
1188 #ifdef DEBUG_BLOCK_CACHE
1189 	add_debugger_command_etc("block_caches", &dump_caches,
1190 		"dumps all block caches", "\n", 0);
1191 	add_debugger_command_etc("block_cache", &dump_cache,
1192 		"dumps a specific block cache",
1193 		"[-bt] <cache-address> [block-number]\n"
1194 		"  -t lists the transactions\n"
1195 		"  -b lists all blocks\n", 0);
1196 	add_debugger_command_etc("transaction", &dump_transaction,
1197 		"dumps a specific transaction", "[-b] ((<cache> <id>) | <transaction>)\n"
1198 		"Either use a block cache pointer and an ID or a pointer to the transaction.\n"
1199 		"  -b lists all blocks that are part of this transaction\n", 0);
1200 #endif
1201 
1202 	return B_OK;
1203 }
1204 
1205 
1206 //	#pragma mark - public transaction API
1207 
1208 
1209 extern "C" int32
1210 cache_start_transaction(void *_cache)
1211 {
1212 	block_cache *cache = (block_cache *)_cache;
1213 	BenaphoreLocker locker(&cache->lock);
1214 
1215 	if (cache->last_transaction && cache->last_transaction->open) {
1216 		panic("last transaction (%ld) still open!\n",
1217 			cache->last_transaction->id);
1218 	}
1219 
1220 	cache_transaction *transaction = new(nothrow) cache_transaction;
1221 	if (transaction == NULL)
1222 		return B_NO_MEMORY;
1223 
1224 	transaction->id = atomic_add(&cache->next_transaction_id, 1);
1225 	cache->last_transaction = transaction;
1226 
1227 	TRACE(("cache_start_transaction(): id %ld started\n", transaction->id));
1228 	T(Action("start", cache, transaction));
1229 
1230 	hash_insert_grow(cache->transaction_hash, transaction);
1231 
1232 	return transaction->id;
1233 }
1234 
1235 
1236 extern "C" status_t
1237 cache_sync_transaction(void *_cache, int32 id)
1238 {
1239 	block_cache *cache = (block_cache *)_cache;
1240 	BenaphoreLocker locker(&cache->lock);
1241 	status_t status = B_ENTRY_NOT_FOUND;
1242 
1243 	TRACE(("cache_sync_transaction(id %ld)\n", id));
1244 
1245 	hash_iterator iterator;
1246 	hash_open(cache->transaction_hash, &iterator);
1247 
1248 	cache_transaction *transaction;
1249 	while ((transaction = (cache_transaction *)hash_next(
1250 			cache->transaction_hash, &iterator)) != NULL) {
1251 		// close all earlier transactions which haven't been closed yet
1252 
1253 		if (transaction->id <= id && !transaction->open) {
1254 			// write back all of their remaining dirty blocks
1255 			T(Action("sync", cache, transaction));
1256 			while (transaction->num_blocks > 0) {
1257 				// sort blocks to speed up writing them back
1258 				// TODO: ideally, this should be handled by the I/O scheduler
1259 				block_list::Iterator iterator = transaction->blocks.GetIterator();
1260 				uint32 maxCount = transaction->num_blocks;
1261 				cached_block *buffer[16];
1262 				cached_block **blocks = (cached_block **)malloc(maxCount
1263 					* sizeof(void *));
1264 				if (blocks == NULL) {
1265 					maxCount = 16;
1266 					blocks = buffer;
1267 				}
1268 
1269 				uint32 count = 0;
1270 				for (; count < maxCount && iterator.HasNext(); count++) {
1271 					blocks[count] = iterator.Next();
1272 				}
1273 				qsort(blocks, count, sizeof(void *), &compare_blocks);
1274 
1275 				for (uint32 i = 0; i < count; i++) {
1276 					status = write_cached_block(cache, blocks[i], false);
1277 					if (status != B_OK)
1278 						break;
1279 				}
1280 
1281 				if (blocks != buffer)
1282 					free(blocks);
1283 
1284 				if (status != B_OK)
1285 					return status;
1286 			}
1287 
1288 			hash_remove_current(cache->transaction_hash, &iterator);
1289 			delete_transaction(cache, transaction);
1290 		}
1291 	}
1292 
1293 	hash_close(cache->transaction_hash, &iterator, false);
1294 	return B_OK;
1295 }
1296 
1297 
1298 extern "C" status_t
1299 cache_end_transaction(void *_cache, int32 id,
1300 	transaction_notification_hook hook, void *data)
1301 {
1302 	block_cache *cache = (block_cache *)_cache;
1303 	BenaphoreLocker locker(&cache->lock);
1304 
1305 	TRACE(("cache_end_transaction(id = %ld)\n", id));
1306 
1307 	cache_transaction *transaction = lookup_transaction(cache, id);
1308 	if (transaction == NULL) {
1309 		panic("cache_end_transaction(): invalid transaction ID\n");
1310 		return B_BAD_VALUE;
1311 	}
1312 
1313 	T(Action("end", cache, transaction));
1314 
1315 	transaction->notification_hook = hook;
1316 	transaction->notification_data = data;
1317 
1318 	notify_transaction_listeners(transaction, TRANSACTION_ENDED);
1319 
1320 	// iterate through all blocks and free the unchanged original contents
1321 
1322 	cached_block *block = transaction->first_block, *next;
1323 	for (; block != NULL; block = next) {
1324 		next = block->transaction_next;
1325 
1326 		if (block->previous_transaction != NULL) {
1327 			// need to write back pending changes
1328 			write_cached_block(cache, block);
1329 		}
1330 
1331 		if (block->original_data != NULL) {
1332 			cache->Free(block->original_data);
1333 			block->original_data = NULL;
1334 		}
1335 		if (transaction->has_sub_transaction) {
1336 			if (block->parent_data != block->current_data)
1337 				cache->Free(block->parent_data);
1338 			block->parent_data = NULL;
1339 		}
1340 
1341 		// move the block to the previous transaction list
1342 		transaction->blocks.Add(block);
1343 
1344 		block->previous_transaction = transaction;
1345 		block->transaction_next = NULL;
1346 		block->transaction = NULL;
1347 	}
1348 
1349 	transaction->open = false;
1350 
1351 	return B_OK;
1352 }
1353 
1354 
1355 extern "C" status_t
1356 cache_abort_transaction(void *_cache, int32 id)
1357 {
1358 	block_cache *cache = (block_cache *)_cache;
1359 	BenaphoreLocker locker(&cache->lock);
1360 
1361 	TRACE(("cache_abort_transaction(id = %ld)\n", id));
1362 
1363 	cache_transaction *transaction = lookup_transaction(cache, id);
1364 	if (transaction == NULL) {
1365 		panic("cache_abort_transaction(): invalid transaction ID\n");
1366 		return B_BAD_VALUE;
1367 	}
1368 
1369 	T(Abort(cache, transaction));
1370 	notify_transaction_listeners(transaction, TRANSACTION_ABORTED);
1371 
1372 	// iterate through all blocks and restore their original contents
1373 
1374 	cached_block *block = transaction->first_block, *next;
1375 	for (; block != NULL; block = next) {
1376 		next = block->transaction_next;
1377 
1378 		if (block->original_data != NULL) {
1379 			TRACE(("cache_abort_transaction(id = %ld): restored contents of block %Ld\n",
1380 				transaction->id, block->block_number));
1381 			memcpy(block->current_data, block->original_data, cache->block_size);
1382 			cache->Free(block->original_data);
1383 			block->original_data = NULL;
1384 		}
1385 		if (transaction->has_sub_transaction) {
1386 			if (block->parent_data != block->current_data)
1387 				cache->Free(block->parent_data);
1388 			block->parent_data = NULL;
1389 		}
1390 
1391 		block->transaction_next = NULL;
1392 		block->transaction = NULL;
1393 	}
1394 
1395 	hash_remove(cache->transaction_hash, transaction);
1396 	delete_transaction(cache, transaction);
1397 	return B_OK;
1398 }
1399 
1400 
1401 /*!	Acknowledges the current parent transaction, and starts a new transaction
1402 	from its sub transaction.
1403 	The new transaction also gets a new transaction ID.
1404 */
1405 extern "C" int32
1406 cache_detach_sub_transaction(void *_cache, int32 id,
1407 	transaction_notification_hook hook, void *data)
1408 {
1409 	block_cache *cache = (block_cache *)_cache;
1410 	BenaphoreLocker locker(&cache->lock);
1411 
1412 	TRACE(("cache_detach_sub_transaction(id = %ld)\n", id));
1413 
1414 	cache_transaction *transaction = lookup_transaction(cache, id);
1415 	if (transaction == NULL) {
1416 		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
1417 		return B_BAD_VALUE;
1418 	}
1419 	if (!transaction->has_sub_transaction)
1420 		return B_BAD_VALUE;
1421 
1422 	// create a new transaction for the sub transaction
1423 	cache_transaction *newTransaction = new(nothrow) cache_transaction;
1424 	if (transaction == NULL)
1425 		return B_NO_MEMORY;
1426 
1427 	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
1428 	T(Detach(cache, transaction, newTransaction));
1429 
1430 	transaction->notification_hook = hook;
1431 	transaction->notification_data = data;
1432 
1433 	notify_transaction_listeners(transaction, TRANSACTION_ENDED);
1434 
1435 	// iterate through all blocks and free the unchanged original contents
1436 
1437 	cached_block *block = transaction->first_block, *next, *last = NULL;
1438 	for (; block != NULL; block = next) {
1439 		next = block->transaction_next;
1440 
1441 		if (block->previous_transaction != NULL) {
1442 			// need to write back pending changes
1443 			write_cached_block(cache, block);
1444 		}
1445 
1446 		if (block->original_data != NULL && block->parent_data != NULL
1447 			&& block->parent_data != block->current_data) {
1448 			// free the original data if the parent data of the transaction
1449 			// will be made current - but keep them otherwise
1450 			cache->Free(block->original_data);
1451 			block->original_data = NULL;
1452 		}
1453 		if (block->parent_data == NULL
1454 			|| block->parent_data != block->current_data) {
1455 			// we need to move this block over to the new transaction
1456 			block->original_data = block->parent_data;
1457 			if (last == NULL)
1458 				newTransaction->first_block = block;
1459 			else
1460 				last->transaction_next = block;
1461 
1462 			block->transaction = newTransaction;
1463 			last = block;
1464 		} else
1465 			block->transaction = NULL;
1466 
1467 		if (block->parent_data != NULL) {
1468 			// move the block to the previous transaction list
1469 			transaction->blocks.Add(block);
1470 			block->parent_data = NULL;
1471 		}
1472 
1473 		block->previous_transaction = transaction;
1474 		block->transaction_next = NULL;
1475 	}
1476 
1477 	newTransaction->num_blocks = transaction->sub_num_blocks;
1478 
1479 	transaction->open = false;
1480 	transaction->has_sub_transaction = false;
1481 	transaction->num_blocks = transaction->main_num_blocks;
1482 	transaction->sub_num_blocks = 0;
1483 
1484 	hash_insert_grow(cache->transaction_hash, newTransaction);
1485 	cache->last_transaction = newTransaction;
1486 
1487 	return newTransaction->id;
1488 }
1489 
1490 
1491 extern "C" status_t
1492 cache_abort_sub_transaction(void *_cache, int32 id)
1493 {
1494 	block_cache *cache = (block_cache *)_cache;
1495 	BenaphoreLocker locker(&cache->lock);
1496 
1497 	TRACE(("cache_abort_sub_transaction(id = %ld)\n", id));
1498 
1499 	cache_transaction *transaction = lookup_transaction(cache, id);
1500 	if (transaction == NULL) {
1501 		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
1502 		return B_BAD_VALUE;
1503 	}
1504 	if (!transaction->has_sub_transaction)
1505 		return B_BAD_VALUE;
1506 
1507 	T(Abort(cache, transaction));
1508 	notify_transaction_listeners(transaction, TRANSACTION_ABORTED);
1509 
1510 	// revert all changes back to the version of the parent
1511 
1512 	cached_block *block = transaction->first_block, *next;
1513 	for (; block != NULL; block = next) {
1514 		next = block->transaction_next;
1515 
1516 		if (block->parent_data == NULL) {
1517 			if (block->original_data != NULL) {
1518 				// the parent transaction didn't change the block, but the sub
1519 				// transaction did - we need to revert from the original data
1520 				memcpy(block->current_data, block->original_data,
1521 					cache->block_size);
1522 			}
1523 		} else if (block->parent_data != block->current_data) {
1524 			// the block has been changed and must be restored
1525 			TRACE(("cache_abort_sub_transaction(id = %ld): restored contents of block %Ld\n",
1526 				transaction->id, block->block_number));
1527 			memcpy(block->current_data, block->parent_data, cache->block_size);
1528 			cache->Free(block->parent_data);
1529 		}
1530 
1531 		block->parent_data = NULL;
1532 	}
1533 
1534 	// all subsequent changes will go into the main transaction
1535 	transaction->has_sub_transaction = false;
1536 	transaction->sub_num_blocks = 0;
1537 
1538 	return B_OK;
1539 }
1540 
1541 
1542 extern "C" status_t
1543 cache_start_sub_transaction(void *_cache, int32 id)
1544 {
1545 	block_cache *cache = (block_cache *)_cache;
1546 	BenaphoreLocker locker(&cache->lock);
1547 
1548 	TRACE(("cache_start_sub_transaction(id = %ld)\n", id));
1549 
1550 	cache_transaction *transaction = lookup_transaction(cache, id);
1551 	if (transaction == NULL) {
1552 		panic("cache_start_sub_transaction(): invalid transaction ID %ld\n", id);
1553 		return B_BAD_VALUE;
1554 	}
1555 
1556 	notify_transaction_listeners(transaction, TRANSACTION_ENDED);
1557 
1558 	// move all changed blocks up to the parent
1559 
1560 	cached_block *block = transaction->first_block, *next;
1561 	for (; block != NULL; block = next) {
1562 		next = block->transaction_next;
1563 
1564 		if (transaction->has_sub_transaction
1565 			&& block->parent_data != NULL
1566 			&& block->parent_data != block->current_data) {
1567 			// there already is an older sub transaction - we acknowledge
1568 			// its changes and move its blocks up to the parent
1569 			cache->Free(block->parent_data);
1570 		}
1571 
1572 		// we "allocate" the parent data lazily, that means, we don't copy
1573 		// the data (and allocate memory for it) until we need to
1574 		block->parent_data = block->current_data;
1575 	}
1576 
1577 	// all subsequent changes will go into the sub transaction
1578 	transaction->has_sub_transaction = true;
1579 	transaction->main_num_blocks = transaction->num_blocks;
1580 	transaction->sub_num_blocks = 0;
1581 	T(Action("start-sub", cache, transaction));
1582 
1583 	return B_OK;
1584 }
1585 
1586 
1587 /*!	Adds a transaction listener that gets notified when the transaction
1588 	is ended or aborted.
1589 	The listener gets automatically removed in this case.
1590 */
1591 status_t
1592 cache_add_transaction_listener(void *_cache, int32 id,
1593 	transaction_notification_hook hookFunction, void *data)
1594 {
1595 	block_cache *cache = (block_cache *)_cache;
1596 
1597 	cache_hook *hook = new(std::nothrow) cache_hook;
1598 	if (hook == NULL)
1599 		return B_NO_MEMORY;
1600 
1601 	BenaphoreLocker locker(&cache->lock);
1602 
1603 	cache_transaction *transaction = lookup_transaction(cache, id);
1604 	if (transaction == NULL) {
1605 		delete hook;
1606 		return B_BAD_VALUE;
1607 	}
1608 
1609 	hook->hook = hookFunction;
1610 	hook->data = data;
1611 
1612 	transaction->listeners.Add(hook);
1613 	return B_OK;
1614 }
1615 
1616 
1617 status_t
1618 cache_remove_transaction_listener(void *_cache, int32 id,
1619 	transaction_notification_hook hookFunction, void *data)
1620 {
1621 	block_cache *cache = (block_cache *)_cache;
1622 
1623 	BenaphoreLocker locker(&cache->lock);
1624 
1625 	cache_transaction *transaction = lookup_transaction(cache, id);
1626 	if (transaction == NULL)
1627 		return B_BAD_VALUE;
1628 
1629 	HookList::Iterator iterator = transaction->listeners.GetIterator();
1630 	while (iterator.HasNext()) {
1631 		cache_hook *hook = iterator.Next();
1632 		if (hook->data == data && hook->hook == hookFunction) {
1633 			iterator.Remove();
1634 			delete hook;
1635 			return B_OK;
1636 		}
1637 	}
1638 
1639 	return B_ENTRY_NOT_FOUND;
1640 }
1641 
1642 
1643 extern "C" status_t
1644 cache_next_block_in_transaction(void *_cache, int32 id, bool mainOnly,
1645 	long *_cookie, off_t *_blockNumber, void **_data, void **_unchangedData)
1646 {
1647 	cached_block *block = (cached_block *)*_cookie;
1648 	block_cache *cache = (block_cache *)_cache;
1649 
1650 	BenaphoreLocker locker(&cache->lock);
1651 
1652 	cache_transaction *transaction = lookup_transaction(cache, id);
1653 	if (transaction == NULL || !transaction->open)
1654 		return B_BAD_VALUE;
1655 
1656 	if (block == NULL)
1657 		block = transaction->first_block;
1658 	else
1659 		block = block->transaction_next;
1660 
1661 	if (mainOnly && transaction->has_sub_transaction) {
1662 		// find next block that the parent changed
1663 		while (block != NULL && block->parent_data == NULL)
1664 			block = block->transaction_next;
1665 	}
1666 
1667 	if (block == NULL)
1668 		return B_ENTRY_NOT_FOUND;
1669 
1670 	if (_blockNumber)
1671 		*_blockNumber = block->block_number;
1672 	if (_data)
1673 		*_data = mainOnly ? block->parent_data : block->current_data;
1674 	if (_unchangedData)
1675 		*_unchangedData = block->original_data;
1676 
1677 	*_cookie = (addr_t)block;
1678 	return B_OK;
1679 }
1680 
1681 
1682 extern "C" int32
1683 cache_blocks_in_transaction(void *_cache, int32 id)
1684 {
1685 	block_cache *cache = (block_cache *)_cache;
1686 	BenaphoreLocker locker(&cache->lock);
1687 
1688 	cache_transaction *transaction = lookup_transaction(cache, id);
1689 	if (transaction == NULL)
1690 		return B_BAD_VALUE;
1691 
1692 	return transaction->num_blocks;
1693 }
1694 
1695 
1696 extern "C" int32
1697 cache_blocks_in_main_transaction(void *_cache, int32 id)
1698 {
1699 	block_cache *cache = (block_cache *)_cache;
1700 	BenaphoreLocker locker(&cache->lock);
1701 
1702 	cache_transaction *transaction = lookup_transaction(cache, id);
1703 	if (transaction == NULL)
1704 		return B_BAD_VALUE;
1705 
1706 	return transaction->main_num_blocks;
1707 }
1708 
1709 
1710 extern "C" int32
1711 cache_blocks_in_sub_transaction(void *_cache, int32 id)
1712 {
1713 	block_cache *cache = (block_cache *)_cache;
1714 	BenaphoreLocker locker(&cache->lock);
1715 
1716 	cache_transaction *transaction = lookup_transaction(cache, id);
1717 	if (transaction == NULL)
1718 		return B_BAD_VALUE;
1719 
1720 	return transaction->sub_num_blocks;
1721 }
1722 
1723 
1724 //	#pragma mark - public block cache API
1725 
1726 
1727 extern "C" void
1728 block_cache_delete(void *_cache, bool allowWrites)
1729 {
1730 	block_cache *cache = (block_cache *)_cache;
1731 
1732 	if (allowWrites)
1733 		block_cache_sync(cache);
1734 
1735 	BenaphoreLocker locker(&cache->lock);
1736 
1737 	// free all blocks
1738 
1739 	uint32 cookie = 0;
1740 	cached_block *block;
1741 	while ((block = (cached_block *)hash_remove_first(cache->hash,
1742 			&cookie)) != NULL) {
1743 		cache->FreeBlock(block);
1744 	}
1745 
1746 	// free all transactions (they will all be aborted)
1747 
1748 	cookie = 0;
1749 	cache_transaction *transaction;
1750 	while ((transaction = (cache_transaction *)hash_remove_first(
1751 			cache->transaction_hash, &cookie)) != NULL) {
1752 		delete transaction;
1753 	}
1754 
1755 	delete cache;
1756 }
1757 
1758 
1759 extern "C" void *
1760 block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
1761 {
1762 	block_cache *cache = new(nothrow) block_cache(fd, numBlocks, blockSize,
1763 		readOnly);
1764 	if (cache == NULL)
1765 		return NULL;
1766 
1767 	if (cache->InitCheck() != B_OK) {
1768 		delete cache;
1769 		return NULL;
1770 	}
1771 
1772 	return cache;
1773 }
1774 
1775 
1776 extern "C" status_t
1777 block_cache_sync(void *_cache)
1778 {
1779 	block_cache *cache = (block_cache *)_cache;
1780 
1781 	// we will sync all dirty blocks to disk that have a completed
1782 	// transaction or no transaction only
1783 
1784 	BenaphoreLocker locker(&cache->lock);
1785 	hash_iterator iterator;
1786 	hash_open(cache->hash, &iterator);
1787 
1788 	cached_block *block;
1789 	while ((block = (cached_block *)hash_next(cache->hash, &iterator)) != NULL) {
1790 		if (block->previous_transaction != NULL
1791 			|| (block->transaction == NULL && block->is_dirty)) {
1792 			status_t status = write_cached_block(cache, block);
1793 			if (status != B_OK)
1794 				return status;
1795 		}
1796 	}
1797 
1798 	hash_close(cache->hash, &iterator, false);
1799 	return B_OK;
1800 }
1801 
1802 
1803 extern "C" status_t
1804 block_cache_sync_etc(void *_cache, off_t blockNumber, size_t numBlocks)
1805 {
1806 	block_cache *cache = (block_cache *)_cache;
1807 
1808 	// we will sync all dirty blocks to disk that have a completed
1809 	// transaction or no transaction only
1810 
1811 	if (blockNumber < 0 || blockNumber >= cache->max_blocks) {
1812 		panic("block_cache_sync_etc: invalid block number %Ld (max %Ld)",
1813 			blockNumber, cache->max_blocks - 1);
1814 		return B_BAD_VALUE;
1815 	}
1816 
1817 	BenaphoreLocker locker(&cache->lock);
1818 
1819 	for (; numBlocks > 0; numBlocks--, blockNumber++) {
1820 		cached_block *block = (cached_block *)hash_lookup(cache->hash,
1821 			&blockNumber);
1822 		if (block == NULL)
1823 			continue;
1824 
1825 		// TODO: sort blocks!
1826 
1827 		if (block->previous_transaction != NULL
1828 			|| (block->transaction == NULL && block->is_dirty)) {
1829 			status_t status = write_cached_block(cache, block);
1830 			if (status != B_OK)
1831 				return status;
1832 		}
1833 	}
1834 
1835 	return B_OK;
1836 }
1837 
1838 
1839 extern "C" status_t
1840 block_cache_make_writable(void *_cache, off_t blockNumber, int32 transaction)
1841 {
1842 	block_cache *cache = (block_cache *)_cache;
1843 	BenaphoreLocker locker(&cache->lock);
1844 
1845 	if (cache->read_only)
1846 		panic("tried to make block writable on a read-only cache!");
1847 
1848 	// ToDo: this can be done better!
1849 	void *block = get_writable_cached_block(cache, blockNumber,
1850 		blockNumber, 1, transaction, false);
1851 	if (block != NULL) {
1852 		put_cached_block((block_cache *)_cache, blockNumber);
1853 		return B_OK;
1854 	}
1855 
1856 	return B_ERROR;
1857 }
1858 
1859 
1860 extern "C" void *
1861 block_cache_get_writable_etc(void *_cache, off_t blockNumber, off_t base,
1862 	off_t length, int32 transaction)
1863 {
1864 	block_cache *cache = (block_cache *)_cache;
1865 	BenaphoreLocker locker(&cache->lock);
1866 
1867 	TRACE(("block_cache_get_writable_etc(block = %Ld, transaction = %ld)\n",
1868 		blockNumber, transaction));
1869 	if (cache->read_only)
1870 		panic("tried to get writable block on a read-only cache!");
1871 
1872 	return get_writable_cached_block(cache, blockNumber, base, length,
1873 		transaction, false);
1874 }
1875 
1876 
1877 extern "C" void *
1878 block_cache_get_writable(void *_cache, off_t blockNumber, int32 transaction)
1879 {
1880 	return block_cache_get_writable_etc(_cache, blockNumber,
1881 		blockNumber, 1, transaction);
1882 }
1883 
1884 
1885 extern "C" void *
1886 block_cache_get_empty(void *_cache, off_t blockNumber, int32 transaction)
1887 {
1888 	block_cache *cache = (block_cache *)_cache;
1889 	BenaphoreLocker locker(&cache->lock);
1890 
1891 	TRACE(("block_cache_get_empty(block = %Ld, transaction = %ld)\n",
1892 		blockNumber, transaction));
1893 	if (cache->read_only)
1894 		panic("tried to get empty writable block on a read-only cache!");
1895 
1896 	return get_writable_cached_block((block_cache *)_cache, blockNumber,
1897 		blockNumber, 1, transaction, true);
1898 }
1899 
1900 
1901 extern "C" const void *
1902 block_cache_get_etc(void *_cache, off_t blockNumber, off_t base, off_t length)
1903 {
1904 	block_cache *cache = (block_cache *)_cache;
1905 	BenaphoreLocker locker(&cache->lock);
1906 	bool allocated;
1907 
1908 	cached_block *block = get_cached_block(cache, blockNumber, &allocated);
1909 	if (block == NULL)
1910 		return NULL;
1911 
1912 #ifdef DEBUG_CHANGED
1913 	if (block->compare == NULL)
1914 		block->compare = cache->Allocate();
1915 	if (block->compare != NULL)
1916 		memcpy(block->compare, block->current_data, cache->block_size);
1917 #endif
1918 	return block->current_data;
1919 }
1920 
1921 
1922 extern "C" const void *
1923 block_cache_get(void *_cache, off_t blockNumber)
1924 {
1925 	return block_cache_get_etc(_cache, blockNumber, blockNumber, 1);
1926 }
1927 
1928 
1929 /*!
1930 	Changes the internal status of a writable block to \a dirty. This can be
1931 	helpful in case you realize you don't need to change that block anymore
1932 	for whatever reason.
1933 
1934 	Note, you must only use this function on blocks that were acquired
1935 	writable!
1936 */
1937 extern "C" status_t
1938 block_cache_set_dirty(void *_cache, off_t blockNumber, bool dirty,
1939 	int32 transaction)
1940 {
1941 	block_cache *cache = (block_cache *)_cache;
1942 	BenaphoreLocker locker(&cache->lock);
1943 
1944 	cached_block *block = (cached_block *)hash_lookup(cache->hash,
1945 		&blockNumber);
1946 	if (block == NULL)
1947 		return B_BAD_VALUE;
1948 	if (block->is_dirty == dirty) {
1949 		// there is nothing to do for us
1950 		return B_OK;
1951 	}
1952 
1953 	// TODO: not yet implemented
1954 	if (dirty)
1955 		panic("block_cache_set_dirty(): not yet implemented that way!\n");
1956 
1957 	return B_OK;
1958 }
1959 
1960 
1961 extern "C" void
1962 block_cache_put(void *_cache, off_t blockNumber)
1963 {
1964 	block_cache *cache = (block_cache *)_cache;
1965 	BenaphoreLocker locker(&cache->lock);
1966 
1967 	put_cached_block(cache, blockNumber);
1968 }
1969 
1970