xref: /haiku/src/system/kernel/cache/block_cache.cpp (revision 302f62604763c95777d6d04cca456e876f471c4f)
1 /*
2  * Copyright 2004-2006, Axel Dörfler, axeld@pinc-software.de. All rights reserved.
3  * Distributed under the terms of the MIT License.
4  */
5 
6 
7 #include "block_cache_private.h"
8 
9 #include <KernelExport.h>
10 #include <fs_cache.h>
11 
12 #include <block_cache.h>
13 #include <lock.h>
14 #include <vm_low_memory.h>
15 #include <util/kernel_cpp.h>
16 #include <util/DoublyLinkedList.h>
17 #include <util/AutoLock.h>
18 #include <util/khash.h>
19 
20 #include <unistd.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <errno.h>
24 
25 
26 // ToDo: this is a naive but growing implementation to test the API:
27 //	1) block reading/writing is not at all optimized for speed, it will
28 //	   just read and write single blocks.
29 //	2) the locking could be improved; getting a block should not need to
30 //	   wait for blocks to be written
31 //	3) dirty blocks are only written back if asked for
32 // TODO: the retrieval/copy of the original data could be delayed until the
33 //		new data must be written, ie. in low memory situations.
34 
35 //#define TRACE_BLOCK_CACHE
36 #ifdef TRACE_BLOCK_CACHE
37 #	define TRACE(x)	dprintf x
38 #else
39 #	define TRACE(x) ;
40 #endif
41 
42 // This macro is used for fatal situations that are acceptable in a running
43 // system, like out of memory situations - should only panic for debugging.
44 #define FATAL(x) panic x
45 
46 struct cache_transaction {
47 	cache_transaction();
48 
49 	cache_transaction *next;
50 	int32			id;
51 	int32			num_blocks;
52 	int32			sub_num_blocks;
53 	cached_block	*first_block;
54 	block_list		blocks;
55 	transaction_notification_hook notification_hook;
56 	void			*notification_data;
57 	bool			open;
58 	bool			has_sub_transaction;
59 };
60 
61 static status_t write_cached_block(block_cache *cache, cached_block *block,
62 	bool deleteTransaction = true);
63 
64 
65 //	#pragma mark - private transaction
66 
67 
68 cache_transaction::cache_transaction()
69 {
70 	num_blocks = 0;
71 	sub_num_blocks = 0;
72 	first_block = NULL;
73 	notification_hook = NULL;
74 	notification_data = NULL;
75 	open = true;
76 }
77 
78 
79 static int
80 transaction_compare(void *_transaction, const void *_id)
81 {
82 	cache_transaction *transaction = (cache_transaction *)_transaction;
83 	const int32 *id = (const int32 *)_id;
84 
85 	return transaction->id - *id;
86 }
87 
88 
89 static uint32
90 transaction_hash(void *_transaction, const void *_id, uint32 range)
91 {
92 	cache_transaction *transaction = (cache_transaction *)_transaction;
93 	const int32 *id = (const int32 *)_id;
94 
95 	if (transaction != NULL)
96 		return transaction->id % range;
97 
98 	return (uint32)*id % range;
99 }
100 
101 
102 static void
103 delete_transaction(block_cache *cache, cache_transaction *transaction)
104 {
105 	hash_remove(cache->transaction_hash, transaction);
106 	if (cache->last_transaction == transaction)
107 		cache->last_transaction = NULL;
108 
109 	delete transaction;
110 }
111 
112 
113 static cache_transaction *
114 lookup_transaction(block_cache *cache, int32 id)
115 {
116 	return (cache_transaction *)hash_lookup(cache->transaction_hash, &id);
117 }
118 
119 
120 //	#pragma mark - cached_block
121 
122 
123 /* static */
124 int
125 cached_block::Compare(void *_cacheEntry, const void *_block)
126 {
127 	cached_block *cacheEntry = (cached_block *)_cacheEntry;
128 	const off_t *block = (const off_t *)_block;
129 
130 	return cacheEntry->block_number - *block;
131 }
132 
133 
134 
135 /* static */
136 uint32
137 cached_block::Hash(void *_cacheEntry, const void *_block, uint32 range)
138 {
139 	cached_block *cacheEntry = (cached_block *)_cacheEntry;
140 	const off_t *block = (const off_t *)_block;
141 
142 	if (cacheEntry != NULL)
143 		return cacheEntry->block_number % range;
144 
145 	return (uint64)*block % range;
146 }
147 
148 
149 //	#pragma mark - block_cache
150 
151 
152 block_cache::block_cache(int _fd, off_t numBlocks, size_t blockSize, bool readOnly)
153 	:
154 	hash(NULL),
155 	fd(_fd),
156 	max_blocks(numBlocks),
157 	block_size(blockSize),
158 	next_transaction_id(1),
159 	last_transaction(NULL),
160 	transaction_hash(NULL),
161 	ranges_hash(NULL),
162 	read_only(readOnly)
163 {
164 	hash = hash_init(32, 0, &cached_block::Compare, &cached_block::Hash);
165 	if (hash == NULL)
166 		return;
167 
168 	transaction_hash = hash_init(16, 0, &transaction_compare, &::transaction_hash);
169 	if (transaction_hash == NULL)
170 		return;
171 
172 	ranges_hash = hash_init(16, 0, &block_range::Compare, &block_range::Hash);
173 	if (ranges_hash == NULL)
174 		return;
175 
176 	if (benaphore_init(&lock, "block cache") < B_OK)
177 		return;
178 
179 	chunk_size = max_c(blockSize, B_PAGE_SIZE);
180 	chunks_per_range = kBlockRangeSize / chunk_size;
181 	range_mask = (1UL << chunks_per_range) - 1;
182 	chunk_mask = (1UL << (chunk_size / blockSize)) - 1;
183 
184 	register_low_memory_handler(&block_cache::LowMemoryHandler, this, 0);
185 }
186 
187 
188 block_cache::~block_cache()
189 {
190 	unregister_low_memory_handler(&block_cache::LowMemoryHandler, this);
191 
192 	benaphore_destroy(&lock);
193 
194 	hash_uninit(ranges_hash);
195 	hash_uninit(transaction_hash);
196 	hash_uninit(hash);
197 }
198 
199 
200 status_t
201 block_cache::InitCheck()
202 {
203 	if (lock.sem < B_OK)
204 		return lock.sem;
205 
206 	if (hash == NULL || transaction_hash == NULL || ranges_hash == NULL)
207 		return B_NO_MEMORY;
208 
209 	return B_OK;
210 }
211 
212 
213 block_range *
214 block_cache::GetFreeRange()
215 {
216 	if (!free_ranges.IsEmpty())
217 		return free_ranges.First();
218 
219 	// we need to allocate a new range
220 	block_range *range;
221 	if (block_range::New(this, &range) != B_OK) {
222 		RemoveUnusedBlocks(2, 50);
223 
224 		if (!free_ranges.IsEmpty())
225 			return free_ranges.First();
226 
227 		RemoveUnusedBlocks(LONG_MAX, 50);
228 
229 		if (!free_ranges.IsEmpty())
230 			return free_ranges.First();
231 
232 		// TODO: We also need to free ranges from other caches to get a free one
233 		// (if not, an active volume might have stolen all free ranges already)
234 		return NULL;
235 	}
236 
237 	return range;
238 }
239 
240 
241 block_range *
242 block_cache::GetRange(void *address)
243 {
244 	return (block_range *)hash_lookup(ranges_hash, address);
245 }
246 
247 
248 void
249 block_cache::Free(void *address)
250 {
251 	if (address == NULL)
252 		return;
253 
254 	block_range *range = GetRange(address);
255 	if (range == NULL)
256 		panic("no range for address %p\n", address);
257 
258 	ASSERT(range != NULL);
259 	range->Free(this, address);
260 
261 	if (range->Unused(this))
262 		block_range::Delete(this, range);
263 }
264 
265 
266 void *
267 block_cache::Allocate()
268 {
269 	block_range *range = GetFreeRange();
270 	if (range == NULL)
271 		return NULL;
272 
273 	return range->Allocate(this);
274 }
275 
276 
277 void
278 block_cache::FreeBlock(cached_block *block)
279 {
280 	block_range *range = GetRange(block->current_data);
281 	ASSERT(range != NULL);
282 	range->Free(this, block);
283 
284 	if (block->original_data != NULL || block->parent_data != NULL)
285 		panic("block_cache::FreeBlock(): %p, %p\n", block->original_data, block->parent_data);
286 
287 #ifdef DEBUG_CHANGED
288 	Free(block->compare);
289 #endif
290 
291 	if (range->Unused(this))
292 		block_range::Delete(this, range);
293 
294 	delete block;
295 }
296 
297 
298 cached_block *
299 block_cache::NewBlock(off_t blockNumber)
300 {
301 	cached_block *block = new(nothrow) cached_block;
302 	if (block == NULL) {
303 		FATAL(("could not allocate block!\n"));
304 		return NULL;
305 	}
306 
307 	block_range *range = GetFreeRange();
308 	if (range == NULL) {
309 		FATAL(("could not get range!\n"));
310 		delete block;
311 		return NULL;
312 	}
313 
314 	range->Allocate(this, block);
315 
316 	block->block_number = blockNumber;
317 	block->ref_count = 0;
318 	block->accessed = 0;
319 	block->transaction_next = NULL;
320 	block->transaction = block->previous_transaction = NULL;
321 	block->original_data = NULL;
322 	block->parent_data = NULL;
323 	block->is_dirty = false;
324 	block->unused = false;
325 #ifdef DEBUG_CHANGED
326 	block->compare = NULL;
327 #endif
328 
329 	hash_insert(hash, block);
330 
331 	return block;
332 }
333 
334 
335 void
336 block_cache::RemoveUnusedBlocks(int32 maxAccessed, int32 count)
337 {
338 	TRACE(("block_cache: remove up to %ld unused blocks\n", count));
339 
340 	cached_block *next = NULL;
341 	for (cached_block *block = unused_blocks.First(); block != NULL; block = next) {
342 		next = block->next;
343 
344 		if (maxAccessed < block->accessed)
345 			continue;
346 
347 		TRACE(("  remove block %Ld, accessed %ld times\n",
348 			block->block_number, block->accessed));
349 
350 		// this can only happen if no transactions are used
351 		if (block->is_dirty)
352 			write_cached_block(this, block, false);
353 
354 		// remove block from lists
355 		unused_blocks.Remove(block);
356 		hash_remove(hash, block);
357 
358 		FreeBlock(block);
359 
360 		if (--count <= 0)
361 			break;
362 	}
363 }
364 
365 
366 void
367 block_cache::LowMemoryHandler(void *data, int32 level)
368 {
369 	block_cache *cache = (block_cache *)data;
370 	BenaphoreLocker locker(&cache->lock);
371 
372 	if (!locker.IsLocked()) {
373 		// If our block_cache were deleted, it could be that we had
374 		// been called before that deletion went through, therefore,
375 		// acquiring its lock might fail.
376 		return;
377 	}
378 
379 	TRACE(("block_cache: low memory handler called with level %ld\n", level));
380 
381 	// free some blocks according to the low memory state
382 	// (if there is enough memory left, we don't free any)
383 
384 	int32 free = 1;
385 	int32 accessed = 1;
386 	switch (vm_low_memory_state()) {
387 		case B_NO_LOW_MEMORY:
388 			return;
389 		case B_LOW_MEMORY_NOTE:
390 			free = 50;
391 			accessed = 2;
392 			break;
393 		case B_LOW_MEMORY_WARNING:
394 			free = 200;
395 			accessed = 10;
396 			break;
397 		case B_LOW_MEMORY_CRITICAL:
398 			free = LONG_MAX;
399 			accessed = LONG_MAX;
400 			break;
401 	}
402 
403 	cache->RemoveUnusedBlocks(accessed, free);
404 }
405 
406 
407 //	#pragma mark -
408 
409 
410 static void
411 put_cached_block(block_cache *cache, cached_block *block)
412 {
413 #ifdef DEBUG_CHANGED
414 	if (!block->is_dirty && block->compare != NULL && memcmp(block->current_data, block->compare, cache->block_size)) {
415 		dprintf("new block:\n");
416 		dump_block((const char *)block->current_data, 256, "  ");
417 		dprintf("unchanged block:\n");
418 		dump_block((const char *)block->compare, 256, "  ");
419 		write_cached_block(cache, block);
420 		panic("block_cache: supposed to be clean block was changed!\n");
421 
422 		cache->Free(block->compare);
423 		block->compare = NULL;
424 	}
425 #endif
426 
427 	if (--block->ref_count == 0
428 		&& block->transaction == NULL
429 		&& block->previous_transaction == NULL) {
430 		// put this block in the list of unused blocks
431 		block->unused = true;
432 		cache->unused_blocks.Add(block);
433 //		block->current_data = cache->allocator->Release(block->current_data);
434 	}
435 
436 	// free some blocks according to the low memory state
437 	// (if there is enough memory left, we don't free any)
438 
439 	int32 free = 1;
440 	switch (vm_low_memory_state()) {
441 		case B_NO_LOW_MEMORY:
442 			return;
443 		case B_LOW_MEMORY_NOTE:
444 			free = 1;
445 			break;
446 		case B_LOW_MEMORY_WARNING:
447 			free = 5;
448 			break;
449 		case B_LOW_MEMORY_CRITICAL:
450 			free = 20;
451 			break;
452 	}
453 
454 	cache->RemoveUnusedBlocks(LONG_MAX, free);
455 }
456 
457 
458 static void
459 put_cached_block(block_cache *cache, off_t blockNumber)
460 {
461 	if (blockNumber < 0 || blockNumber >= cache->max_blocks)
462 		panic("put_cached_block: invalid block number %lld (max %lld)", blockNumber, cache->max_blocks - 1);
463 
464 	cached_block *block = (cached_block *)hash_lookup(cache->hash, &blockNumber);
465 	if (block != NULL)
466 		put_cached_block(cache, block);
467 }
468 
469 
470 static cached_block *
471 get_cached_block(block_cache *cache, off_t blockNumber, bool *allocated, bool readBlock = true)
472 {
473 	if (blockNumber < 0 || blockNumber >= cache->max_blocks)
474 		panic("get_cached_block: invalid block number %lld (max %lld)", blockNumber, cache->max_blocks - 1);
475 
476 	cached_block *block = (cached_block *)hash_lookup(cache->hash, &blockNumber);
477 	*allocated = false;
478 
479 	if (block == NULL) {
480 		// read block into cache
481 		block = cache->NewBlock(blockNumber);
482 		if (block == NULL)
483 			return NULL;
484 
485 		*allocated = true;
486 	} else {
487 /*
488 		if (block->ref_count == 0 && block->current_data != NULL) {
489 			// see if the old block can be resurrected
490 			block->current_data = cache->allocator->Acquire(block->current_data);
491 		}
492 
493 		if (block->current_data == NULL) {
494 			// there is no block yet, but we need one
495 			block->current_data = cache->allocator->Get();
496 			if (block->current_data == NULL)
497 				return NULL;
498 
499 			*allocated = true;
500 		}
501 */
502 	}
503 
504 	if (*allocated && readBlock) {
505 		int32 blockSize = cache->block_size;
506 
507 		if (read_pos(cache->fd, blockNumber * blockSize, block->current_data, blockSize) < blockSize) {
508 			cache->FreeBlock(block);
509 			FATAL(("could not read block %Ld\n", blockNumber));
510 			return NULL;
511 		}
512 	}
513 
514 	if (block->unused) {
515 		//TRACE(("remove block %Ld from unused\n", blockNumber));
516 		block->unused = false;
517 		cache->unused_blocks.Remove(block);
518 	}
519 
520 	block->ref_count++;
521 	block->accessed++;
522 
523 	return block;
524 }
525 
526 
527 /**	Returns the writable block data for the requested blockNumber.
528  *	If \a cleared is true, the block is not read from disk; an empty block
529  *	is returned.
530  *	This is the only method to insert a block into a transaction. It makes
531  *	sure that the previous block contents are preserved in that case.
532  */
533 
534 static void *
535 get_writable_cached_block(block_cache *cache, off_t blockNumber, off_t base, off_t length,
536 	int32 transactionID, bool cleared)
537 {
538 	TRACE(("get_writable_cached_block(blockNumber = %Ld, transaction = %ld)\n", blockNumber, transactionID));
539 
540 	if (blockNumber < 0 || blockNumber >= cache->max_blocks)
541 		panic("get_writable_cached_block: invalid block number %lld (max %lld)", blockNumber, cache->max_blocks - 1);
542 
543 	bool allocated;
544 	cached_block *block = get_cached_block(cache, blockNumber, &allocated, !cleared);
545 	if (block == NULL)
546 		return NULL;
547 
548 	// if there is no transaction support, we just return the current block
549 	if (transactionID == -1) {
550 		if (cleared)
551 			memset(block->current_data, 0, cache->block_size);
552 
553 		block->is_dirty = true;
554 			// mark the block as dirty
555 
556 		return block->current_data;
557 	}
558 
559 	if (block->transaction != NULL && block->transaction->id != transactionID) {
560 		// ToDo: we have to wait here until the other transaction is done.
561 		//	Maybe we should even panic, since we can't prevent any deadlocks.
562 		panic("get_writable_cached_block(): asked to get busy writable block (transaction %ld)\n", block->transaction->id);
563 		put_cached_block(cache, block);
564 		return NULL;
565 	}
566 	if (block->transaction == NULL && transactionID != -1) {
567 		// get new transaction
568 		cache_transaction *transaction = lookup_transaction(cache, transactionID);
569 		if (transaction == NULL) {
570 			panic("get_writable_cached_block(): invalid transaction %ld!\n", transactionID);
571 			put_cached_block(cache, block);
572 			return NULL;
573 		}
574 		if (!transaction->open) {
575 			panic("get_writable_cached_block(): transaction already done!\n");
576 			put_cached_block(cache, block);
577 			return NULL;
578 		}
579 
580 		block->transaction = transaction;
581 
582 		// attach the block to the transaction block list
583 		block->transaction_next = transaction->first_block;
584 		transaction->first_block = block;
585 		transaction->num_blocks++;
586 	}
587 
588 	if (!(allocated && cleared) && block->original_data == NULL) {
589 		// we already have data, so we need to preserve it
590 		block->original_data = cache->Allocate();
591 		if (block->original_data == NULL) {
592 			FATAL(("could not allocate original_data\n"));
593 			put_cached_block(cache, block);
594 			return NULL;
595 		}
596 
597 		memcpy(block->original_data, block->current_data, cache->block_size);
598 	}
599 	if (block->parent_data == block->current_data) {
600 		// remember any previous contents for the parent transaction
601 		block->parent_data = cache->Allocate();
602 		if (block->parent_data == NULL) {
603 			// ToDo: maybe we should just continue the current transaction in this case...
604 			FATAL(("could not allocate parent\n"));
605 			put_cached_block(cache, block);
606 			return NULL;
607 		}
608 
609 		memcpy(block->parent_data, block->current_data, cache->block_size);
610 		block->transaction->sub_num_blocks++;
611 	}
612 
613 	if (cleared)
614 		memset(block->current_data, 0, cache->block_size);
615 
616 	block->is_dirty = true;
617 
618 	return block->current_data;
619 }
620 
621 
622 static status_t
623 write_cached_block(block_cache *cache, cached_block *block, bool deleteTransaction)
624 {
625 	cache_transaction *previous = block->previous_transaction;
626 	int32 blockSize = cache->block_size;
627 
628 	void *data = previous && block->original_data ? block->original_data : block->current_data;
629 		// we first need to write back changes from previous transactions
630 
631 	TRACE(("write_cached_block(block %Ld)\n", block->block_number));
632 
633 	ssize_t written = write_pos(cache->fd, block->block_number * blockSize, data, blockSize);
634 
635 	if (written < blockSize) {
636 		FATAL(("could not write back block %Ld (%s)\n", block->block_number, strerror(errno)));
637 		return B_IO_ERROR;
638 	}
639 
640 	if (data == block->current_data)
641 		block->is_dirty = false;
642 
643 	if (previous != NULL) {
644 		previous->blocks.Remove(block);
645 		block->previous_transaction = NULL;
646 
647 		// Has the previous transation been finished with that write?
648 		if (--previous->num_blocks == 0) {
649 			TRACE(("cache transaction %ld finished!\n", previous->id));
650 
651 			if (previous->notification_hook != NULL)
652 				previous->notification_hook(previous->id, previous->notification_data);
653 
654 			if (deleteTransaction)
655 				delete_transaction(cache, previous);
656 		}
657 	}
658 
659 	return B_OK;
660 }
661 
662 
663 extern "C" status_t
664 block_cache_init(void)
665 {
666 	return init_block_allocator();
667 }
668 
669 
670 //	#pragma mark - public transaction API
671 
672 
673 extern "C" int32
674 cache_start_transaction(void *_cache)
675 {
676 	block_cache *cache = (block_cache *)_cache;
677 	BenaphoreLocker locker(&cache->lock);
678 
679 	if (cache->last_transaction && cache->last_transaction->open)
680 		panic("last transaction (%ld) still open!\n", cache->last_transaction->id);
681 
682 	cache_transaction *transaction = new(nothrow) cache_transaction;
683 	if (transaction == NULL)
684 		return B_NO_MEMORY;
685 
686 	transaction->id = atomic_add(&cache->next_transaction_id, 1);
687 	cache->last_transaction = transaction;
688 
689 	TRACE(("cache_start_transaction(): id %ld started\n", transaction->id));
690 
691 	hash_insert(cache->transaction_hash, transaction);
692 
693 	return transaction->id;
694 }
695 
696 
697 extern "C" status_t
698 cache_sync_transaction(void *_cache, int32 id)
699 {
700 	block_cache *cache = (block_cache *)_cache;
701 	BenaphoreLocker locker(&cache->lock);
702 	status_t status = B_ENTRY_NOT_FOUND;
703 
704 	hash_iterator iterator;
705 	hash_open(cache->transaction_hash, &iterator);
706 
707 	cache_transaction *transaction;
708 	while ((transaction = (cache_transaction *)hash_next(cache->transaction_hash, &iterator)) != NULL) {
709 		// ToDo: fix hash interface to make this easier
710 
711 		if (transaction->id <= id && !transaction->open) {
712 			while (transaction->num_blocks > 0) {
713 				status = write_cached_block(cache, transaction->blocks.Head(), false);
714 				if (status != B_OK)
715 					return status;
716 			}
717 			delete_transaction(cache, transaction);
718 			hash_rewind(cache->transaction_hash, &iterator);
719 		}
720 	}
721 
722 	hash_close(cache->transaction_hash, &iterator, false);
723 	return B_OK;
724 }
725 
726 
727 extern "C" status_t
728 cache_end_transaction(void *_cache, int32 id, transaction_notification_hook hook, void *data)
729 {
730 	block_cache *cache = (block_cache *)_cache;
731 	BenaphoreLocker locker(&cache->lock);
732 
733 	TRACE(("cache_end_transaction(id = %ld)\n", id));
734 
735 	cache_transaction *transaction = lookup_transaction(cache, id);
736 	if (transaction == NULL) {
737 		panic("cache_end_transaction(): invalid transaction ID\n");
738 		return B_BAD_VALUE;
739 	}
740 
741 	transaction->notification_hook = hook;
742 	transaction->notification_data = data;
743 
744 	// iterate through all blocks and free the unchanged original contents
745 
746 	cached_block *block = transaction->first_block, *next;
747 	for (; block != NULL; block = next) {
748 		next = block->transaction_next;
749 
750 		if (block->previous_transaction != NULL) {
751 			// need to write back pending changes
752 			write_cached_block(cache, block);
753 		}
754 
755 		if (block->original_data != NULL) {
756 			cache->Free(block->original_data);
757 			block->original_data = NULL;
758 		}
759 		if (transaction->has_sub_transaction) {
760 			if (block->parent_data != block->current_data)
761 				cache->Free(block->parent_data);
762 			block->parent_data = NULL;
763 		}
764 
765 		// move the block to the previous transaction list
766 		transaction->blocks.Add(block);
767 
768 		block->previous_transaction = transaction;
769 		block->transaction_next = NULL;
770 		block->transaction = NULL;
771 	}
772 
773 	transaction->open = false;
774 
775 	return B_OK;
776 }
777 
778 
779 extern "C" status_t
780 cache_abort_transaction(void *_cache, int32 id)
781 {
782 	block_cache *cache = (block_cache *)_cache;
783 	BenaphoreLocker locker(&cache->lock);
784 
785 	TRACE(("cache_abort_transaction(id = %ld)\n", id));
786 
787 	cache_transaction *transaction = lookup_transaction(cache, id);
788 	if (transaction == NULL) {
789 		panic("cache_abort_transaction(): invalid transaction ID\n");
790 		return B_BAD_VALUE;
791 	}
792 
793 	// iterate through all blocks and restore their original contents
794 
795 	cached_block *block = transaction->first_block, *next;
796 	for (; block != NULL; block = next) {
797 		next = block->transaction_next;
798 
799 		if (block->original_data != NULL) {
800 			TRACE(("cache_abort_transaction(id = %ld): restored contents of block %Ld\n",
801 				transaction->id, block->block_number));
802 			memcpy(block->current_data, block->original_data, cache->block_size);
803 			cache->Free(block->original_data);
804 			block->original_data = NULL;
805 		}
806 		if (transaction->has_sub_transaction) {
807 			if (block->parent_data != block->current_data)
808 				cache->Free(block->parent_data);
809 			block->parent_data = NULL;
810 		}
811 
812 		block->transaction_next = NULL;
813 		block->transaction = NULL;
814 	}
815 
816 	delete_transaction(cache, transaction);
817 	return B_OK;
818 }
819 
820 
821 /**	Acknowledges the current parent transaction, and starts a new transaction
822  *	from its sub transaction.
823  *	The new transaction also gets a new transaction ID.
824  */
825 
826 extern "C" int32
827 cache_detach_sub_transaction(void *_cache, int32 id,
828 	transaction_notification_hook hook, void *data)
829 {
830 	block_cache *cache = (block_cache *)_cache;
831 	BenaphoreLocker locker(&cache->lock);
832 
833 	TRACE(("cache_detach_sub_transaction(id = %ld)\n", id));
834 
835 	cache_transaction *transaction = lookup_transaction(cache, id);
836 	if (transaction == NULL) {
837 		panic("cache_detach_sub_transaction(): invalid transaction ID\n");
838 		return B_BAD_VALUE;
839 	}
840 	if (!transaction->has_sub_transaction)
841 		return B_BAD_VALUE;
842 
843 	// create a new transaction for the sub transaction
844 	cache_transaction *newTransaction = new(nothrow) cache_transaction;
845 	if (transaction == NULL)
846 		return B_NO_MEMORY;
847 
848 	newTransaction->id = atomic_add(&cache->next_transaction_id, 1);
849 
850 	transaction->notification_hook = hook;
851 	transaction->notification_data = data;
852 
853 	// iterate through all blocks and free the unchanged original contents
854 
855 	cached_block *block = transaction->first_block, *next, *last = NULL;
856 	for (; block != NULL; block = next) {
857 		next = block->transaction_next;
858 
859 		if (block->previous_transaction != NULL) {
860 			// need to write back pending changes
861 			write_cached_block(cache, block);
862 		}
863 
864 		if (block->original_data != NULL && block->parent_data != NULL
865 			&& block->parent_data != block->current_data) {
866 			// free the original data if the parent data of the transaction
867 			// will be made current - but keep them otherwise
868 			cache->Free(block->original_data);
869 			block->original_data = NULL;
870 		}
871 		if (block->parent_data != NULL && block->parent_data != block->current_data) {
872 			// we need to move this block over to the new transaction
873 			block->original_data = block->parent_data;
874 			if (last == NULL)
875 				newTransaction->first_block = block;
876 			else
877 				last->transaction_next = block;
878 
879 			last = block;
880 		}
881 		block->parent_data = NULL;
882 
883 		// move the block to the previous transaction list
884 		transaction->blocks.Add(block);
885 
886 		block->previous_transaction = transaction;
887 		block->transaction_next = NULL;
888 		block->transaction = newTransaction;
889 	}
890 
891 	transaction->open = false;
892 
893 	hash_insert(cache->transaction_hash, newTransaction);
894 	cache->last_transaction = newTransaction;
895 
896 	return B_OK;
897 }
898 
899 
900 extern "C" status_t
901 cache_abort_sub_transaction(void *_cache, int32 id)
902 {
903 	block_cache *cache = (block_cache *)_cache;
904 	BenaphoreLocker locker(&cache->lock);
905 
906 	TRACE(("cache_abort_sub_transaction(id = %ld)\n", id));
907 
908 	cache_transaction *transaction = lookup_transaction(cache, id);
909 	if (transaction == NULL) {
910 		panic("cache_abort_sub_transaction(): invalid transaction ID\n");
911 		return B_BAD_VALUE;
912 	}
913 	if (!transaction->has_sub_transaction)
914 		return B_BAD_VALUE;
915 
916 	// revert all changes back to the version of the parent
917 
918 	cached_block *block = transaction->first_block, *next;
919 	for (; block != NULL; block = next) {
920 		next = block->transaction_next;
921 
922 		if (block->parent_data == NULL) {
923 			if (block->original_data != NULL) {
924 				// the parent transaction didn't change the block, but the sub
925 				// transaction did - we need to revert from the original data
926 				memcpy(block->current_data, block->original_data, cache->block_size);
927 			}
928 		} else if (block->parent_data != block->current_data) {
929 			// the block has been changed and must be restored
930 			TRACE(("cache_abort_sub_transaction(id = %ld): restored contents of block %Ld\n",
931 				transaction->id, block->block_number));
932 			memcpy(block->current_data, block->parent_data, cache->block_size);
933 			cache->Free(block->parent_data);
934 		}
935 
936 		block->parent_data = NULL;
937 	}
938 
939 	// all subsequent changes will go into the main transaction
940 	transaction->has_sub_transaction = false;
941 	return B_OK;
942 }
943 
944 
945 extern "C" status_t
946 cache_start_sub_transaction(void *_cache, int32 id)
947 {
948 	block_cache *cache = (block_cache *)_cache;
949 	BenaphoreLocker locker(&cache->lock);
950 
951 	TRACE(("cache_start_sub_transaction(id = %ld)\n", id));
952 
953 	cache_transaction *transaction = lookup_transaction(cache, id);
954 	if (transaction == NULL) {
955 		panic("cache_start_sub_transaction(): invalid transaction ID %ld\n", id);
956 		return B_BAD_VALUE;
957 	}
958 
959 	// move all changed blocks up to the parent
960 
961 	cached_block *block = transaction->first_block, *next;
962 	for (; block != NULL; block = next) {
963 		next = block->transaction_next;
964 
965 		if (transaction->has_sub_transaction
966 			&& block->parent_data != NULL
967 			&& block->parent_data != block->current_data) {
968 			// there already is an older sub transaction - we acknowledge
969 			// its changes and move its blocks up to the parent
970 			cache->Free(block->parent_data);
971 		}
972 
973 		// we "allocate" the parent data lazily, that means, we don't copy
974 		// the data (and allocate memory for it) until we need to
975 		block->parent_data = block->current_data;
976 	}
977 
978 	// all subsequent changes will go into the sub transaction
979 	transaction->has_sub_transaction = true;
980 	transaction->sub_num_blocks = 0;
981 
982 	return B_OK;
983 }
984 
985 
986 extern "C" status_t
987 cache_next_block_in_transaction(void *_cache, int32 id, uint32 *_cookie, off_t *_blockNumber,
988 	void **_data, void **_unchangedData)
989 {
990 	cached_block *block = (cached_block *)*_cookie;
991 	block_cache *cache = (block_cache *)_cache;
992 
993 	BenaphoreLocker locker(&cache->lock);
994 
995 	cache_transaction *transaction = lookup_transaction(cache, id);
996 	if (transaction == NULL)
997 		return B_BAD_VALUE;
998 
999 	if (block == NULL)
1000 		block = transaction->first_block;
1001 	else
1002 		block = block->transaction_next;
1003 
1004 	if (block == NULL)
1005 		return B_ENTRY_NOT_FOUND;
1006 
1007 	if (_blockNumber)
1008 		*_blockNumber = block->block_number;
1009 	if (_data)
1010 		*_data = block->current_data;
1011 	if (_unchangedData)
1012 		*_unchangedData = block->original_data;
1013 
1014 	*_cookie = (uint32)block;
1015 	return B_OK;
1016 }
1017 
1018 
1019 extern "C" int32
1020 cache_blocks_in_transaction(void *_cache, int32 id)
1021 {
1022 	block_cache *cache = (block_cache *)_cache;
1023 	BenaphoreLocker locker(&cache->lock);
1024 
1025 	cache_transaction *transaction = lookup_transaction(cache, id);
1026 	if (transaction == NULL)
1027 		return B_BAD_VALUE;
1028 
1029 	return transaction->num_blocks;
1030 }
1031 
1032 
1033 extern "C" int32
1034 cache_blocks_in_sub_transaction(void *_cache, int32 id)
1035 {
1036 	block_cache *cache = (block_cache *)_cache;
1037 	BenaphoreLocker locker(&cache->lock);
1038 
1039 	cache_transaction *transaction = lookup_transaction(cache, id);
1040 	if (transaction == NULL)
1041 		return B_BAD_VALUE;
1042 
1043 	return transaction->sub_num_blocks;
1044 }
1045 
1046 
1047 //	#pragma mark - public block cache API
1048 //	public interface
1049 
1050 
1051 extern "C" void
1052 block_cache_delete(void *_cache, bool allowWrites)
1053 {
1054 	block_cache *cache = (block_cache *)_cache;
1055 
1056 	if (allowWrites)
1057 		block_cache_sync(cache);
1058 
1059 	BenaphoreLocker locker(&cache->lock);
1060 
1061 	// free all blocks
1062 
1063 	uint32 cookie = 0;
1064 	cached_block *block;
1065 	while ((block = (cached_block *)hash_remove_first(cache->hash, &cookie)) != NULL) {
1066 		cache->FreeBlock(block);
1067 	}
1068 
1069 	// free all transactions (they will all be aborted)
1070 
1071 	cookie = 0;
1072 	cache_transaction *transaction;
1073 	while ((transaction = (cache_transaction *)hash_remove_first(cache->transaction_hash, &cookie)) != NULL) {
1074 		delete transaction;
1075 	}
1076 
1077 	delete cache;
1078 }
1079 
1080 
1081 extern "C" void *
1082 block_cache_create(int fd, off_t numBlocks, size_t blockSize, bool readOnly)
1083 {
1084 	block_cache *cache = new(nothrow) block_cache(fd, numBlocks, blockSize, readOnly);
1085 	if (cache == NULL)
1086 		return NULL;
1087 
1088 	if (cache->InitCheck() != B_OK) {
1089 		delete cache;
1090 		return NULL;
1091 	}
1092 
1093 	return cache;
1094 }
1095 
1096 
1097 extern "C" status_t
1098 block_cache_sync(void *_cache)
1099 {
1100 	block_cache *cache = (block_cache *)_cache;
1101 
1102 	// we will sync all dirty blocks to disk that have a completed
1103 	// transaction or no transaction only
1104 
1105 	BenaphoreLocker locker(&cache->lock);
1106 	hash_iterator iterator;
1107 	hash_open(cache->hash, &iterator);
1108 
1109 	cached_block *block;
1110 	while ((block = (cached_block *)hash_next(cache->hash, &iterator)) != NULL) {
1111 		if (block->previous_transaction != NULL
1112 			|| (block->transaction == NULL && block->is_dirty)) {
1113 			status_t status = write_cached_block(cache, block);
1114 			if (status != B_OK)
1115 				return status;
1116 		}
1117 	}
1118 
1119 	hash_close(cache->hash, &iterator, false);
1120 	return B_OK;
1121 }
1122 
1123 
1124 extern "C" status_t
1125 block_cache_make_writable(void *_cache, off_t blockNumber, int32 transaction)
1126 {
1127 	block_cache *cache = (block_cache *)_cache;
1128 	BenaphoreLocker locker(&cache->lock);
1129 
1130 	if (cache->read_only)
1131 		panic("tried to make block writable on a read-only cache!");
1132 
1133 	// ToDo: this can be done better!
1134 	void *block = get_writable_cached_block(cache, blockNumber,
1135 		blockNumber, 1, transaction, false);
1136 	if (block != NULL) {
1137 		put_cached_block((block_cache *)_cache, blockNumber);
1138 		return B_OK;
1139 	}
1140 
1141 	return B_ERROR;
1142 }
1143 
1144 
1145 extern "C" void *
1146 block_cache_get_writable_etc(void *_cache, off_t blockNumber, off_t base,
1147 	off_t length, int32 transaction)
1148 {
1149 	block_cache *cache = (block_cache *)_cache;
1150 	BenaphoreLocker locker(&cache->lock);
1151 
1152 	TRACE(("block_cache_get_writable_etc(block = %Ld, transaction = %ld)\n",
1153 		blockNumber, transaction));
1154 	if (cache->read_only)
1155 		panic("tried to get writable block on a read-only cache!");
1156 
1157 	return get_writable_cached_block(cache, blockNumber, base, length,
1158 		transaction, false);
1159 }
1160 
1161 
1162 extern "C" void *
1163 block_cache_get_writable(void *_cache, off_t blockNumber, int32 transaction)
1164 {
1165 	return block_cache_get_writable_etc(_cache, blockNumber,
1166 		blockNumber, 1, transaction);
1167 }
1168 
1169 
1170 extern "C" void *
1171 block_cache_get_empty(void *_cache, off_t blockNumber, int32 transaction)
1172 {
1173 	block_cache *cache = (block_cache *)_cache;
1174 	BenaphoreLocker locker(&cache->lock);
1175 
1176 	TRACE(("block_cache_get_empty(block = %Ld, transaction = %ld)\n",
1177 		blockNumber, transaction));
1178 	if (cache->read_only)
1179 		panic("tried to get empty writable block on a read-only cache!");
1180 
1181 	return get_writable_cached_block((block_cache *)_cache, blockNumber,
1182 				blockNumber, 1, transaction, true);
1183 }
1184 
1185 
1186 extern "C" const void *
1187 block_cache_get_etc(void *_cache, off_t blockNumber, off_t base, off_t length)
1188 {
1189 	block_cache *cache = (block_cache *)_cache;
1190 	BenaphoreLocker locker(&cache->lock);
1191 	bool allocated;
1192 
1193 	cached_block *block = get_cached_block(cache, blockNumber, &allocated);
1194 	if (block == NULL)
1195 		return NULL;
1196 
1197 #ifdef DEBUG_CHANGED
1198 	if (block->compare == NULL)
1199 		block->compare = cache->Allocate();
1200 	if (block->compare != NULL)
1201 		memcpy(block->compare, block->current_data, cache->block_size);
1202 #endif
1203 	return block->current_data;
1204 }
1205 
1206 
1207 extern "C" const void *
1208 block_cache_get(void *_cache, off_t blockNumber)
1209 {
1210 	return block_cache_get_etc(_cache, blockNumber, blockNumber, 1);
1211 }
1212 
1213 
1214 extern "C" status_t
1215 block_cache_set_dirty(void *_cache, off_t blockNumber, bool isDirty, int32 transaction)
1216 {
1217 	// not yet implemented
1218 	// Note, you must only use this function on blocks that were acquired writable!
1219 	if (isDirty)
1220 		panic("block_cache_set_dirty(): not yet implemented that way!\n");
1221 
1222 	return B_OK;
1223 }
1224 
1225 
1226 extern "C" void
1227 block_cache_put(void *_cache, off_t blockNumber)
1228 {
1229 	block_cache *cache = (block_cache *)_cache;
1230 	BenaphoreLocker locker(&cache->lock);
1231 
1232 	put_cached_block(cache, blockNumber);
1233 }
1234 
1235