xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 2b76973fa2401f7a5edf68e6470f3d3210cbcff3)
1 /*
2  * Copyright 2001-2012, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 
10 //! B+Tree implementation
11 
12 
13 #include "Debug.h"
14 #include "BPlusTree.h"
15 #include "Utility.h"
16 
17 #if !_BOOT_MODE
18 #include "Inode.h"
19 #else
20 #include "Stream.h"
21 
22 // BFS::Stream from the bootloader has the same API as Inode.
23 #define Inode BFS::Stream
24 
25 #define strerror(x)		"error message unavailable"
26 
27 namespace BFS {
28 #endif
29 
30 
31 /*!	Simple array used for the duplicate handling in the B+Tree. This is an
32 	on disk structure.
33 */
34 struct duplicate_array {
35 	off_t	count;
36 	off_t	values[0];
37 
38 	inline bool IsEmpty() const
39 	{
40 		return count == 0;
41 	}
42 
43 	inline int32 Count() const
44 	{
45 		return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
46 	}
47 
48 	inline off_t ValueAt(uint32 index) const
49 	{
50 		return BFS_ENDIAN_TO_HOST_INT64(values[index]);
51 	}
52 
53 	inline void SetValueAt(uint32 index, off_t value)
54 	{
55 		values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
56 	}
57 
58 	inline int32 Find(off_t value) const
59 	{
60 		int32 i;
61 		return _FindInternal(value, i) ? i : -1;
62 	}
63 
64 	void Insert(off_t value);
65 	bool Remove(off_t value);
66 
67 private:
68 	bool _FindInternal(off_t value, int32& index) const;
69 } _PACKED;
70 
71 
72 #ifdef DEBUG
73 class NodeChecker {
74 public:
75 	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
76 		:
77 		fNode(node),
78 		fSize(nodeSize),
79 		fText(text)
80 	{
81 		Check("integrity check failed on construction.");
82 	}
83 
84 	~NodeChecker()
85 	{
86 		Check("integrity check failed on destruction.");
87 	}
88 
89 	void
90 	Check(const char* message)
91 	{
92 		if (fNode->CheckIntegrity(fSize) != B_OK) {
93 			dprintf("%s: %s\n", fText, message);
94 			DEBUGGER(("NodeChecker integrity check failed!"));
95 		}
96 	}
97 
98 private:
99 	const bplustree_node*	fNode;
100 	int32					fSize;
101 	const char*				fText;
102 };
103 #endif // DEBUG
104 
105 
106 #if !_BOOT_MODE
107 class BitmapArray {
108 public:
109 								BitmapArray(size_t numBits);
110 								~BitmapArray();
111 
112 			status_t			InitCheck() const;
113 
114 			bool				IsSet(size_t index) const;
115 			void				Set(size_t index, bool set);
116 
117 			size_t				CountSet() const { return fCountSet; }
118 
119 private:
120 			uint8*				fBitmap;
121 			size_t				fSize;
122 			size_t				fCountSet;
123 };
124 
125 
126 struct TreeCheck {
127 	TreeCheck(BPlusTree* tree)
128 		:
129 		fLevelCount(0),
130 		fFreeCount(0),
131 		fNodeSize(tree->NodeSize()),
132 		fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
133 		fFoundErrors(0),
134 		fVisited(tree->Stream()->Size() / tree->NodeSize()),
135 		fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
136 	{
137 		fPreviousOffsets = (off_t*)malloc(
138 			sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
139 		if (fPreviousOffsets != NULL) {
140 			for (size_t i = 0; i < fMaxLevels; i++)
141 				fPreviousOffsets[i] = BPLUSTREE_NULL;
142 		}
143 	}
144 
145 	~TreeCheck()
146 	{
147 		free(fPreviousOffsets);
148 	}
149 
150 	status_t InitCheck() const
151 	{
152 		if (fPreviousOffsets == NULL)
153 			return B_NO_MEMORY;
154 
155 		status_t status = fVisited.InitCheck();
156 		if (status != B_OK)
157 			return status;
158 
159 		return fVisitedFragment.InitCheck();
160 	}
161 
162 	bool Visited(off_t offset) const
163 	{
164 		return fVisited.IsSet(offset / fNodeSize);
165 	}
166 
167 	void SetVisited(off_t offset)
168 	{
169 		fVisited.Set(offset / fNodeSize, true);
170 	}
171 
172 	size_t VisitedCount() const
173 	{
174 		return fVisited.CountSet();
175 	}
176 
177 	bool VisitedFragment(off_t offset) const
178 	{
179 		return fVisitedFragment.IsSet(offset / fNodeSize);
180 	}
181 
182 	void SetVisitedFragment(off_t offset)
183 	{
184 		fVisitedFragment.Set(offset / fNodeSize, true);
185 	}
186 
187 	uint32 MaxLevels() const
188 	{
189 		return fLevelCount;
190 	}
191 
192 	void SetLevel(uint32 level)
193 	{
194 		if (fLevelCount < level)
195 			fLevelCount = level;
196 	}
197 
198 	off_t PreviousOffset(uint32 level)
199 	{
200 		return fPreviousOffsets[level];
201 	}
202 
203 	void SetPreviousOffset(uint32 level, off_t offset)
204 	{
205 		fPreviousOffsets[level] = offset;
206 	}
207 
208 	void FoundError()
209 	{
210 		fFoundErrors++;
211 	}
212 
213 	bool ErrorsFound()
214 	{
215 		return fFoundErrors != 0;
216 	}
217 
218 private:
219 			uint32				fLevelCount;
220 			uint32				fFreeCount;
221 			uint32				fNodeSize;
222 			uint32				fMaxLevels;
223 			uint32				fFoundErrors;
224 			BitmapArray			fVisited;
225 			BitmapArray			fVisitedFragment;
226 			off_t*				fPreviousOffsets;
227 };
228 
229 
230 // #pragma mark -
231 
232 
233 // Node Caching for the BPlusTree class
234 //
235 // With write support, there is the need for a function that allocates new
236 // nodes by either returning empty nodes, or by growing the file's data stream
237 //
238 // !! The CachedNode class assumes that you have properly locked the stream
239 // !! before asking for nodes.
240 //
241 // Note: This code will fail if the block size is smaller than the node size!
242 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
243 // is hard-coded to 1024 bytes, that's not an issue now.
244 
245 void
246 CachedNode::UnsetUnchanged(Transaction& transaction)
247 {
248 	if (fTree == NULL || fTree->fStream == NULL)
249 		return;
250 
251 	if (fNode != NULL) {
252 		void* cache = fTree->fStream->GetVolume()->BlockCache();
253 
254 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
255 		block_cache_put(cache, fBlockNumber);
256 		fNode = NULL;
257 	}
258 }
259 #endif // !_BOOT_MODE
260 
261 
262 void
263 CachedNode::Unset()
264 {
265 	if (fTree == NULL || fTree->fStream == NULL)
266 		return;
267 
268 	if (fNode != NULL) {
269 #if !_BOOT_MODE
270 		if (fWritable && fOffset == 0) {
271 			// The B+tree header has been updated - we need to update the
272 			// BPlusTrees copy of it, as well.
273 			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
274 		}
275 
276 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
277 			fBlockNumber);
278 #endif // !_BOOT_MODE
279 
280 		fNode = NULL;
281 	}
282 }
283 
284 
285 const bplustree_node*
286 CachedNode::SetTo(off_t offset, bool check)
287 {
288 	const bplustree_node* node;
289 	if (SetTo(offset, &node, check) == B_OK)
290 		return node;
291 
292 	return NULL;
293 }
294 
295 
296 status_t
297 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
298 {
299 	if (fTree == NULL || fTree->fStream == NULL)
300 		RETURN_ERROR(B_BAD_VALUE);
301 
302 	Unset();
303 
304 	// You can only ask for nodes at valid positions - you can't
305 	// even access the b+tree header with this method (use SetToHeader()
306 	// instead)
307 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
308 		|| offset <= 0
309 		|| (offset % fTree->fNodeSize) != 0) {
310 		RETURN_ERROR(B_BAD_VALUE);
311 	}
312 
313 	if (InternalSetTo(NULL, offset) != NULL && check) {
314 		// sanity checks (links, all_key_count)
315 		if (!fTree->fHeader.CheckNode(fNode)) {
316 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
317 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
318 				fBlockNumber, fTree->fStream->ID()));
319 			return B_BAD_DATA;
320 		}
321 	}
322 
323 	*_node = fNode;
324 	return B_OK;
325 }
326 
327 
328 #if !_BOOT_MODE
329 bplustree_node*
330 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
331 {
332 	if (fTree == NULL || fTree->fStream == NULL) {
333 		REPORT_ERROR(B_BAD_VALUE);
334 		return NULL;
335 	}
336 
337 	Unset();
338 
339 	// You can only ask for nodes at valid positions - you can't
340 	// even access the b+tree header with this method (use SetToHeader()
341 	// instead)
342 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
343 		|| offset <= 0
344 		|| (offset % fTree->fNodeSize) != 0)
345 		return NULL;
346 
347 	if (InternalSetTo(&transaction, offset) != NULL && check) {
348 		// sanity checks (links, all_key_count)
349 		if (!fTree->fHeader.CheckNode(fNode)) {
350 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
351 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
352 				fBlockNumber, fTree->fStream->ID()));
353 			return NULL;
354 		}
355 	}
356 	return fNode;
357 }
358 
359 
360 bplustree_node*
361 CachedNode::MakeWritable(Transaction& transaction)
362 {
363 	if (fNode == NULL)
364 		return NULL;
365 
366 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
367 			fBlockNumber, transaction.ID()) == B_OK) {
368 		return fNode;
369 	}
370 
371 	return NULL;
372 }
373 #endif // !_BOOT_MODE
374 
375 
376 const bplustree_header*
377 CachedNode::SetToHeader()
378 {
379 	if (fTree == NULL || fTree->fStream == NULL) {
380 		REPORT_ERROR(B_BAD_VALUE);
381 		return NULL;
382 	}
383 
384 	Unset();
385 
386 	InternalSetTo(NULL, 0LL);
387 	return (bplustree_header*)fNode;
388 }
389 
390 
391 #if !_BOOT_MODE
392 bplustree_header*
393 CachedNode::SetToWritableHeader(Transaction& transaction)
394 {
395 	if (fTree == NULL || fTree->fStream == NULL) {
396 		REPORT_ERROR(B_BAD_VALUE);
397 		return NULL;
398 	}
399 
400 	Unset();
401 
402 	InternalSetTo(&transaction, 0LL);
403 
404 	if (fNode != NULL && !fTree->fInTransaction) {
405 		transaction.AddListener(fTree);
406 		fTree->fInTransaction = true;
407 
408 		if (!transaction.GetVolume()->IsInitializing()) {
409 			acquire_vnode(transaction.GetVolume()->FSVolume(),
410 				fTree->fStream->ID());
411 		}
412 	}
413 
414 	return (bplustree_header*)fNode;
415 }
416 #endif // !_BOOT_MODE
417 
418 
419 bplustree_node*
420 CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
421 {
422 	fNode = NULL;
423 	fOffset = offset;
424 
425 	off_t fileOffset;
426 	block_run run;
427 	if (offset < fTree->fStream->Size()
428 		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
429 
430 #if !_BOOT_MODE
431 		Volume* volume = fTree->fStream->GetVolume();
432 #else
433 		Volume* volume = &fTree->fStream->GetVolume();
434 #endif
435 
436 		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
437 		fBlockNumber = volume->ToBlock(run) + blockOffset;
438 		uint8* block = NULL;
439 
440 #if !_BOOT_MODE
441 		if (transaction != NULL) {
442 			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
443 				fBlockNumber, transaction->ID());
444 			fWritable = true;
445 		} else {
446 			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
447 			fWritable = false;
448 		}
449 #else // !_BOOT_MODE
450 		if (fBlock == NULL) {
451 			fBlock = (uint8*)malloc(volume->BlockSize());
452 			if (fBlock == NULL)
453 				return NULL;
454 		}
455 
456 		if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(),
457 				fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
458 			block = fBlock;
459 		}
460 
461 		fWritable = false;
462 #endif // _BOOT_MODE
463 
464 		if (block != NULL) {
465 			// The node is somewhere in that block...
466 			// (confusing offset calculation)
467 			fNode = (bplustree_node*)(block + offset
468 				- (fileOffset + (blockOffset << volume->BlockShift())));
469 		} else
470 			REPORT_ERROR(B_IO_ERROR);
471 	}
472 	return fNode;
473 }
474 
475 
476 #if !_BOOT_MODE
477 status_t
478 CachedNode::Free(Transaction& transaction, off_t offset)
479 {
480 	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
481 		RETURN_ERROR(B_BAD_VALUE);
482 
483 	// TODO: scan the free nodes list and remove all nodes at the end
484 	// of the tree - perhaps that shouldn't be done everytime that
485 	// function is called, perhaps it should be done when the directory
486 	// inode is closed or based on some calculation or whatever...
487 
488 	CachedNode cached(fTree);
489 	bplustree_header* header = cached.SetToWritableHeader(transaction);
490 	if (header == NULL)
491 		return B_IO_ERROR;
492 
493 #if 0
494 	// TODO: temporarily disabled because CheckNode() doesn't like this...
495 	// 		Also, it's such an edge case that it's almost useless, anyway.
496 	// if the node is the last one in the tree, we shrink
497 	// the tree and file size by one node
498 	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
499 	if (offset == lastOffset) {
500 		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
501 		if (status != B_OK)
502 			return status;
503 
504 		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
505 		return B_OK;
506 	}
507 #endif
508 
509 	// add the node to the free nodes list
510 	fNode->left_link = header->free_node_pointer;
511 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
512 
513 	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
514 	return B_OK;
515 }
516 
517 
518 status_t
519 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
520 	off_t* _offset)
521 {
522 	if (fTree == NULL || fTree->fStream == NULL)
523 		RETURN_ERROR(B_BAD_VALUE);
524 
525 	Unset();
526 
527 	bplustree_header* header;
528 	status_t status;
529 
530 	// if there are any free nodes, recycle them
531 	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
532 		CachedNode cached(fTree);
533 		header = cached.SetToWritableHeader(transaction);
534 		if (header == NULL)
535 			return B_IO_ERROR;
536 
537 		*_offset = header->FreeNode();
538 		*_node = fNode;
539 
540 		// set new free node pointer
541 		header->free_node_pointer = fNode->left_link;
542 		fNode->Initialize();
543 		return B_OK;
544 	}
545 
546 	// allocate space for a new node
547 	Inode* stream = fTree->fStream;
548 	if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
549 		return status;
550 
551 	CachedNode cached(fTree);
552 	header = cached.SetToWritableHeader(transaction);
553 	if (header == NULL)
554 		return B_IO_ERROR;
555 
556 	// the maximum_size has to be changed before the call to SetTo() - or
557 	// else it will fail because the requested node is out of bounds
558 	off_t offset = header->MaximumSize();
559 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
560 		+ fTree->fNodeSize);
561 
562 	cached.Unset();
563 		// SetToWritable() below needs the new values in the tree's header
564 
565 	if (SetToWritable(transaction, offset, false) == NULL)
566 		RETURN_ERROR(B_ERROR);
567 
568 	fNode->Initialize();
569 
570 	*_offset = offset;
571 	*_node = fNode;
572 	return B_OK;
573 }
574 
575 
576 //	#pragma mark -
577 
578 
579 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
580 	:
581 	fStream(NULL),
582 	fInTransaction(false)
583 {
584 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
585 	SetTo(transaction, stream);
586 }
587 #endif // !_BOOT_MODE
588 
589 
590 BPlusTree::BPlusTree(Inode* stream)
591 	:
592 	fStream(NULL),
593 	fInTransaction(false)
594 {
595 #if !_BOOT_MODE
596 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
597 #endif
598 
599 	SetTo(stream);
600 }
601 
602 
603 BPlusTree::BPlusTree()
604 	:
605 	fStream(NULL),
606 	fNodeSize(BPLUSTREE_NODE_SIZE),
607 	fAllowDuplicates(true),
608 	fInTransaction(false),
609 	fStatus(B_NO_INIT)
610 {
611 #if !_BOOT_MODE
612 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
613 #endif
614 }
615 
616 
617 BPlusTree::~BPlusTree()
618 {
619 #if !_BOOT_MODE
620 	// if there are any TreeIterators left, we need to stop them
621 	// (can happen when the tree's inode gets deleted while
622 	// traversing the tree - a TreeIterator doesn't lock the inode)
623 	mutex_lock(&fIteratorLock);
624 
625 	SinglyLinkedList<TreeIterator>::Iterator iterator
626 		= fIterators.GetIterator();
627 	while (iterator.HasNext())
628 		iterator.Next()->Stop();
629 
630 	mutex_destroy(&fIteratorLock);
631 
632 	ASSERT(!fInTransaction);
633 #endif // !_BOOT_MODE
634 }
635 
636 
637 #if !_BOOT_MODE
638 /*! Create a new B+Tree on the specified stream */
639 status_t
640 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
641 {
642 	// initializes in-memory B+Tree
643 
644 	fStream = stream;
645 
646 	CachedNode cached(this);
647 	bplustree_header* header = cached.SetToWritableHeader(transaction);
648 	if (header == NULL) {
649 		// allocate space for new header + node!
650 		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
651 		if (fStatus != B_OK)
652 			RETURN_ERROR(fStatus);
653 
654 		header = cached.SetToWritableHeader(transaction);
655 		if (header == NULL)
656 			RETURN_ERROR(fStatus = B_ERROR);
657 	}
658 
659 	fAllowDuplicates = stream->IsIndex()
660 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
661 
662 	fNodeSize = nodeSize;
663 
664 	// initialize b+tree header
665  	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
666  	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
667  	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
668  	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
669  	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
670  	header->free_node_pointer
671  		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
672  	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
673 
674 	cached.Unset();
675 
676 	// initialize b+tree root node
677 	cached.SetToWritable(transaction, fHeader.RootNode(), false);
678 	if (cached.Node() == NULL)
679 		RETURN_ERROR(B_IO_ERROR);
680 
681 	cached.Node()->Initialize();
682 
683 	return fStatus = B_OK;
684 }
685 #endif // !_BOOT_MODE
686 
687 
688 status_t
689 BPlusTree::SetTo(Inode* stream)
690 {
691 	if (stream == NULL)
692 		RETURN_ERROR(fStatus = B_BAD_VALUE);
693 
694 	fStream = stream;
695 
696 	// get on-disk B+Tree header
697 
698 	CachedNode cached(this);
699 	const bplustree_header* header = cached.SetToHeader();
700 	if (header != NULL)
701 		memcpy(&fHeader, header, sizeof(bplustree_header));
702 	else
703 		RETURN_ERROR(fStatus = B_IO_ERROR);
704 
705 	// is header valid?
706 
707 	if (fHeader.MaximumSize() != stream->Size()) {
708 		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
709 			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
710 		// we can't change the header since we don't have a transaction
711 		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
712 	}
713 	if (!fHeader.IsValid()) {
714 #ifdef DEBUG
715 		dump_bplustree_header(&fHeader);
716 		dump_block((const char*)&fHeader, 128);
717 #endif
718 		RETURN_ERROR(fStatus = B_BAD_DATA);
719 	}
720 
721 	fNodeSize = fHeader.NodeSize();
722 
723 	// validity check
724 	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
725 		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
726 		S_DOUBLE_INDEX};
727 	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
728 		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
729 		| S_FLOAT_INDEX | S_DOUBLE_INDEX);
730 
731 	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
732 		|| ((stream->Mode() & S_INDEX_DIR) != 0
733 			&& kToMode[fHeader.DataType()] != mode)
734 		|| !stream->IsContainer()) {
735 		D(	dump_bplustree_header(&fHeader);
736 			dump_inode(&stream->Node());
737 		);
738 		RETURN_ERROR(fStatus = B_BAD_TYPE);
739 	}
740 
741 	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
742 	// in the original BFS code - we will honour it nevertheless
743 	fAllowDuplicates = stream->IsIndex()
744 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
745 
746 	cached.SetTo(fHeader.RootNode());
747 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
748 }
749 
750 
751 status_t
752 BPlusTree::InitCheck()
753 {
754 	return fStatus;
755 }
756 
757 
758 #if !_BOOT_MODE
759 status_t
760 BPlusTree::Validate(bool repair, bool& _errorsFound)
761 {
762 	TreeCheck check(this);
763 	if (check.InitCheck() != B_OK)
764 		return B_NO_MEMORY;
765 
766 	check.SetVisited(0);
767 
768 	// Walk the free nodes
769 
770 	CachedNode cached(this);
771 	off_t freeOffset = fHeader.FreeNode();
772 	while (freeOffset > 0) {
773 		const bplustree_node* node;
774 		status_t status = cached.SetTo(freeOffset, &node, false);
775 		if (status != B_OK) {
776 			if (status == B_IO_ERROR)
777 				return B_IO_ERROR;
778 
779 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
780 				"not be read: %s\n", fStream->ID(), freeOffset,
781 				strerror(status));
782 			check.FoundError();
783 			break;
784 		}
785 
786 		if (check.Visited(freeOffset)) {
787 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
788 				" circular!\n", fStream->ID(), freeOffset);
789 			// TODO: if 'repair' is true, we could collect all unvisited nodes
790 			// at the end, and put the into the free list
791 			check.FoundError();
792 			break;
793 		}
794 
795 		check.SetVisited(freeOffset);
796 
797 		if (node->OverflowLink() != BPLUSTREE_FREE) {
798 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
799 				" misses free mark!\n", fStream->ID(), freeOffset);
800 		}
801 		freeOffset = node->LeftLink();
802 	}
803 
804 	// Iterate over the complete tree recursively
805 
806 	const bplustree_node* root;
807 	status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
808 	if (status != B_OK)
809 		return status;
810 
811 	status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
812 
813 	if (check.ErrorsFound())
814 		_errorsFound = true;
815 
816 	if (status != B_OK)
817 		return status;
818 
819 	if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
820 		dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
821 			"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
822 			fHeader.MaxNumberOfLevels());
823 	}
824 
825 	if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
826 		dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
827 			" nodes.\n", fStream->ID(), check.VisitedCount(),
828 			fHeader.MaximumSize() / fNodeSize);
829 	}
830 
831 	return B_OK;
832 }
833 
834 
835 status_t
836 BPlusTree::MakeEmpty()
837 {
838 	// Put all nodes into the free list in order
839 	Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
840 
841 	// Reset the header, and root node
842 	CachedNode cached(this);
843 	bplustree_header* header = cached.SetToWritableHeader(transaction);
844 	if (header == NULL)
845 		return B_IO_ERROR;
846 
847 	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
848 	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
849 	if (fStream->Size() > (off_t)NodeSize() * 2)
850 		header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
851 	else {
852 		header->free_node_pointer
853 			= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
854 	}
855 
856 	bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
857 	if (node == NULL)
858 		return B_IO_ERROR;
859 
860 	node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
861 	node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
862 	node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
863 	node->all_key_count = 0;
864 	node->all_key_length = 0;
865 
866 	for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
867 			offset += NodeSize()) {
868 		bplustree_node* node = cached.SetToWritable(transaction, offset, false);
869 		if (node == NULL) {
870 			dprintf("--> could not open %" B_PRIdOFF "\n", offset);
871 			return B_IO_ERROR;
872 		}
873 		if (offset < fStream->Size() - (off_t)NodeSize())
874 			node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
875 		else
876 			node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
877 
878 		node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
879 
880 		// It's not important to write it out in a single transaction; just
881 		// make sure it doesn't get too large
882 		if (offset % (1024 * 1024) == 0) {
883 			transaction.Done();
884 			transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
885 		}
886 	}
887 
888 	return transaction.Done();
889 }
890 
891 
892 int32
893 BPlusTree::TypeCodeToKeyType(type_code code)
894 {
895 	switch (code) {
896 		case B_STRING_TYPE:
897 			return BPLUSTREE_STRING_TYPE;
898 		case B_SSIZE_T_TYPE:
899 		case B_INT32_TYPE:
900 			return BPLUSTREE_INT32_TYPE;
901 		case B_SIZE_T_TYPE:
902 		case B_UINT32_TYPE:
903 			return BPLUSTREE_UINT32_TYPE;
904 		case B_OFF_T_TYPE:
905 		case B_INT64_TYPE:
906 			return BPLUSTREE_INT64_TYPE;
907 		case B_UINT64_TYPE:
908 			return BPLUSTREE_UINT64_TYPE;
909 		case B_FLOAT_TYPE:
910 			return BPLUSTREE_FLOAT_TYPE;
911 		case B_DOUBLE_TYPE:
912 			return BPLUSTREE_DOUBLE_TYPE;
913 	}
914 	return -1;
915 }
916 
917 
918 int32
919 BPlusTree::ModeToKeyType(mode_t mode)
920 {
921 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
922 			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
923 		case S_INT_INDEX:
924 			return BPLUSTREE_INT32_TYPE;
925 		case S_UINT_INDEX:
926 			return BPLUSTREE_UINT32_TYPE;
927 		case S_LONG_LONG_INDEX:
928 			return BPLUSTREE_INT64_TYPE;
929 		case S_ULONG_LONG_INDEX:
930 			return BPLUSTREE_UINT64_TYPE;
931 		case S_FLOAT_INDEX:
932 			return BPLUSTREE_FLOAT_TYPE;
933 		case S_DOUBLE_INDEX:
934 			return BPLUSTREE_DOUBLE_TYPE;
935 		case S_STR_INDEX:
936 		default:
937 			// default is for standard directories
938 			return BPLUSTREE_STRING_TYPE;
939 	}
940 }
941 #endif // !_BOOT_MODE
942 
943 
944 //	#pragma mark - TransactionListener implementation
945 
946 
947 #if !_BOOT_MODE
948 void
949 BPlusTree::TransactionDone(bool success)
950 {
951 	if (!success) {
952 		// update header from disk
953 		CachedNode cached(this);
954 		const bplustree_header* header = cached.SetToHeader();
955 		if (header != NULL)
956 			memcpy(&fHeader, header, sizeof(bplustree_header));
957 	}
958 }
959 
960 
961 void
962 BPlusTree::RemovedFromTransaction()
963 {
964 	fInTransaction = false;
965 
966 	if (!fStream->GetVolume()->IsInitializing())
967 		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
968 }
969 
970 
971 //	#pragma mark -
972 
973 
974 void
975 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
976 	uint16 splitAt, int8 change)
977 {
978 	// Although every iterator which is affected by this update currently
979 	// waits on a semaphore, other iterators could be added/removed at
980 	// any time, so we need to protect this loop
981 	MutexLocker _(fIteratorLock);
982 
983 	SinglyLinkedList<TreeIterator>::Iterator iterator
984 		= fIterators.GetIterator();
985 	while (iterator.HasNext())
986 		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
987 }
988 
989 
990 void
991 BPlusTree::_AddIterator(TreeIterator* iterator)
992 {
993 	MutexLocker _(fIteratorLock);
994 	fIterators.Add(iterator);
995 }
996 
997 
998 void
999 BPlusTree::_RemoveIterator(TreeIterator* iterator)
1000 {
1001 	MutexLocker _(fIteratorLock);
1002 	fIterators.Remove(iterator);
1003 }
1004 #endif // !_BOOT_MODE
1005 
1006 
1007 int32
1008 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
1009 	int keyLength2)
1010 {
1011 	type_code type = 0;
1012 	switch (fHeader.DataType()) {
1013 	    case BPLUSTREE_STRING_TYPE:
1014 	    	type = B_STRING_TYPE;
1015 	    	break;
1016 		case BPLUSTREE_INT32_TYPE:
1017 	    	type = B_INT32_TYPE;
1018 	    	break;
1019 		case BPLUSTREE_UINT32_TYPE:
1020 	    	type = B_UINT32_TYPE;
1021 	    	break;
1022 		case BPLUSTREE_INT64_TYPE:
1023 	    	type = B_INT64_TYPE;
1024 	    	break;
1025 		case BPLUSTREE_UINT64_TYPE:
1026 	    	type = B_UINT64_TYPE;
1027 	    	break;
1028 		case BPLUSTREE_FLOAT_TYPE:
1029 	    	type = B_FLOAT_TYPE;
1030 	    	break;
1031 		case BPLUSTREE_DOUBLE_TYPE:
1032 	    	type = B_DOUBLE_TYPE;
1033 	    	break;
1034 	}
1035    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
1036 }
1037 
1038 
1039 status_t
1040 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
1041 	uint16 keyLength, uint16* _index, off_t* _next)
1042 {
1043 #ifdef DEBUG
1044 	NodeChecker checker(node, fNodeSize, "find");
1045 #endif
1046 
1047 	if (node->all_key_count == 0) {
1048 		if (_index)
1049 			*_index = 0;
1050 		if (_next)
1051 			*_next = node->OverflowLink();
1052 		return B_ENTRY_NOT_FOUND;
1053 	}
1054 
1055 	off_t* values = node->Values();
1056 	int16 saveIndex = -1;
1057 
1058 	// binary search in the key array
1059 	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
1060 		uint16 i = (first + last) >> 1;
1061 
1062 		uint16 searchLength;
1063 		uint8* searchKey = node->KeyAt(i, &searchLength);
1064 		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
1065 				> (uint8*)node + fNodeSize
1066 			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
1067 #if !_BOOT_MODE
1068 			fStream->GetVolume()->Panic();
1069 #endif
1070 			RETURN_ERROR(B_BAD_DATA);
1071 		}
1072 
1073 		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
1074 		if (cmp < 0) {
1075 			last = i - 1;
1076 			saveIndex = i;
1077 		} else if (cmp > 0) {
1078 			saveIndex = first = i + 1;
1079 		} else {
1080 			if (_index)
1081 				*_index = i;
1082 			if (_next)
1083 				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1084 			return B_OK;
1085 		}
1086 	}
1087 
1088 	if (_index)
1089 		*_index = saveIndex;
1090 	if (_next) {
1091 		if (saveIndex == node->NumKeys())
1092 			*_next = node->OverflowLink();
1093 		else
1094 			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
1095 	}
1096 	return B_ENTRY_NOT_FOUND;
1097 }
1098 
1099 
1100 #if !_BOOT_MODE
1101 /*!	Prepares the stack to contain all nodes that were passed while
1102 	following the key, from the root node to the leaf node that could
1103 	or should contain that key.
1104 */
1105 status_t
1106 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
1107 	uint16 keyLength)
1108 {
1109 	// set the root node to begin with
1110 	node_and_key nodeAndKey;
1111 	nodeAndKey.nodeOffset = fHeader.RootNode();
1112 
1113 	CachedNode cached(this);
1114 	const bplustree_node* node;
1115 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1116 		// if we are already on leaf level, we're done
1117 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1118 			// node that the keyIndex is not properly set here (but it's not
1119 			// needed in the calling functions anyway)!
1120 			nodeAndKey.keyIndex = 0;
1121 			stack.Push(nodeAndKey);
1122 			return B_OK;
1123 		}
1124 
1125 		off_t nextOffset;
1126 		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
1127 			&nextOffset);
1128 
1129 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
1130 			RETURN_ERROR(B_ERROR);
1131 
1132 		if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
1133 			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
1134 			break;
1135 		}
1136 
1137 		// put the node offset & the correct keyIndex on the stack
1138 		stack.Push(nodeAndKey);
1139 
1140 		nodeAndKey.nodeOffset = nextOffset;
1141 	}
1142 
1143 	FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
1144 		B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
1145 	return B_ERROR;
1146 }
1147 
1148 
1149 /*!	This will find a free duplicate fragment in the given bplustree_node.
1150 	The CachedNode will be set to the writable fragment on success.
1151 */
1152 status_t
1153 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
1154 	const bplustree_node* node, CachedNode& cached,
1155 	off_t* _offset, bplustree_node** _fragment, uint32* _index)
1156 {
1157 	off_t* values = node->Values();
1158 	for (int32 i = 0; i < node->NumKeys(); i++) {
1159 		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1160 
1161 		// does the value link to a duplicate fragment?
1162 		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
1163 			continue;
1164 
1165 		const bplustree_node* fragment = cached.SetTo(
1166 			bplustree_node::FragmentOffset(value), false);
1167 		if (fragment == NULL) {
1168 			FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
1169 				B_PRIdOFF "\n", value, fStream->ID()));
1170 			continue;
1171 		}
1172 
1173 		// see if there is some space left for us
1174 		uint32 num = bplustree_node::MaxFragments(fNodeSize);
1175 		for (uint32 j = 0; j < num; j++) {
1176 			duplicate_array* array = fragment->FragmentAt(j);
1177 
1178 			if (array->IsEmpty()) {
1179 				// found an unused fragment
1180 				*_fragment = cached.MakeWritable(transaction);
1181 				if (*_fragment == NULL)
1182 					return B_IO_ERROR;
1183 
1184 				*_offset = bplustree_node::FragmentOffset(value);
1185 				*_index = j;
1186 				return B_OK;
1187 			}
1188 		}
1189 	}
1190 	return B_ENTRY_NOT_FOUND;
1191 }
1192 
1193 
1194 status_t
1195 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
1196 	const bplustree_node* node, uint16 index, off_t value)
1197 {
1198 	CachedNode cachedDuplicate(this);
1199 	off_t* values = node->Values();
1200 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1201 	status_t status;
1202 	off_t offset;
1203 
1204 	if (bplustree_node::IsDuplicate(oldValue)) {
1205 		// If it's a duplicate fragment, try to insert it into that, or if it
1206 		// doesn't fit anymore, create a new duplicate node
1207 
1208 		if (bplustree_node::LinkType(oldValue)
1209 				== BPLUSTREE_DUPLICATE_FRAGMENT) {
1210 			bplustree_node* duplicate = cachedDuplicate.SetToWritable(
1211 				transaction, bplustree_node::FragmentOffset(oldValue), false);
1212 			if (duplicate == NULL)
1213 				return B_IO_ERROR;
1214 
1215 			duplicate_array* array = duplicate->FragmentAt(
1216 				bplustree_node::FragmentIndex(oldValue));
1217 			int32 arrayCount = array->Count();
1218 			if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
1219 				FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
1220 					"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1221 					(int)bplustree_node::FragmentIndex(oldValue),
1222 					bplustree_node::FragmentOffset(oldValue), arrayCount,
1223 					fStream->ID()));
1224 				return B_BAD_DATA;
1225 			}
1226 
1227 			if (arrayCount < NUM_FRAGMENT_VALUES) {
1228 				array->Insert(value);
1229 			} else {
1230 				// Test if the fragment will be empty if we remove this key's
1231 				// values
1232 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
1233 					// The node will be empty without our values, so let us
1234 					// reuse it as a duplicate node
1235 					offset = bplustree_node::FragmentOffset(oldValue);
1236 
1237 					memmove(duplicate->DuplicateArray(), array,
1238 						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1239 					duplicate->left_link = duplicate->right_link
1240 						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1241 
1242 					array = duplicate->DuplicateArray();
1243 					array->Insert(value);
1244 				} else {
1245 					// Create a new duplicate node
1246 
1247 					cachedDuplicate.UnsetUnchanged(transaction);
1248 						// The old duplicate has not been touched, so we can
1249 						// reuse it
1250 
1251 					bplustree_node* newDuplicate;
1252 					status = cachedDuplicate.Allocate(transaction,
1253 						&newDuplicate, &offset);
1254 					if (status != B_OK)
1255 						RETURN_ERROR(status);
1256 
1257 					// Copy the array from the fragment node to the duplicate
1258 					// node and free the old entry (by zero'ing all values)
1259 					newDuplicate->overflow_link = array->count;
1260 					memcpy(&newDuplicate->all_key_count, &array->values[0],
1261 						array->Count() * sizeof(off_t));
1262 					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1263 
1264 					array = newDuplicate->DuplicateArray();
1265 					array->Insert(value);
1266 				}
1267 
1268 				// Update the main pointer to link to a duplicate node
1269 				if (cached.MakeWritable(transaction) == NULL)
1270 					return B_IO_ERROR;
1271 
1272 				values[index]
1273 					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1274 						BPLUSTREE_DUPLICATE_NODE, offset));
1275 			}
1276 
1277 			return B_OK;
1278 		}
1279 
1280 		// Put the value into a dedicated duplicate node
1281 
1282 		// search for free space in the duplicate nodes of that key
1283 		duplicate_array* array;
1284 		int32 arrayCount;
1285 		const bplustree_node* duplicate;
1286 		off_t duplicateOffset;
1287 		do {
1288 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1289 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1290 			if (duplicate == NULL)
1291 				return B_IO_ERROR;
1292 
1293 			array = duplicate->DuplicateArray();
1294 			arrayCount =array->Count();
1295 			if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1296 				FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
1297 					B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1298 					duplicateOffset, arrayCount, fStream->ID()));
1299 				return B_BAD_DATA;
1300 			}
1301 		} while (arrayCount >= NUM_DUPLICATE_VALUES
1302 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
1303 
1304 		bplustree_node* writableDuplicate
1305 			= cachedDuplicate.MakeWritable(transaction);
1306 		if (writableDuplicate == NULL)
1307 			return B_IO_ERROR;
1308 
1309 		if (arrayCount < NUM_DUPLICATE_VALUES) {
1310 			array = writableDuplicate->DuplicateArray();
1311 			array->Insert(value);
1312 		} else {
1313 			// no space left - add a new duplicate node
1314 
1315 			bplustree_node* newDuplicate;
1316 			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
1317 				&offset);
1318 			if (status != B_OK)
1319 				RETURN_ERROR(status);
1320 
1321 			// link the two nodes together
1322 			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
1323 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
1324 
1325 			array = newDuplicate->DuplicateArray();
1326 			array->count = 0;
1327 			array->Insert(value);
1328 		}
1329 		return B_OK;
1330 	}
1331 
1332 	// Search for a free duplicate fragment or create a new one
1333 	// to insert the duplicate value into
1334 
1335 	uint32 fragmentIndex = 0;
1336 	bplustree_node* fragment;
1337 	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
1338 			&offset, &fragment, &fragmentIndex) != B_OK) {
1339 		// allocate a new duplicate fragment node
1340 		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
1341 		if (status != B_OK)
1342 			RETURN_ERROR(status);
1343 
1344 		memset(fragment, 0, fNodeSize);
1345 	}
1346 
1347 	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
1348 	array->Insert(oldValue);
1349 	array->Insert(value);
1350 
1351 	if (cached.MakeWritable(transaction) == NULL)
1352 		return B_IO_ERROR;
1353 
1354 	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1355 		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
1356 
1357 	return B_OK;
1358 }
1359 
1360 
1361 void
1362 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
1363 	uint16 keyLength, off_t value)
1364 {
1365 	// should never happen, but who knows?
1366 	if (index > node->NumKeys())
1367 		return;
1368 
1369 	off_t* values = node->Values();
1370 	uint16* keyLengths = node->KeyLengths();
1371 	uint8* keys = node->Keys();
1372 
1373 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
1374 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
1375 		+ keyLength);
1376 
1377 	off_t* newValues = node->Values();
1378 	uint16* newKeyLengths = node->KeyLengths();
1379 
1380 	// move values and copy new value into them
1381 	memmove(newValues + index + 1, values + index,
1382 		sizeof(off_t) * (node->NumKeys() - 1 - index));
1383 	memmove(newValues, values, sizeof(off_t) * index);
1384 
1385 	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
1386 
1387 	// move and update key length index
1388 	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
1389 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1390 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
1391 	}
1392 	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
1393 
1394 	int32 keyStart;
1395 	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
1396 		+ (keyStart = index > 0
1397 			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
1398 
1399 	// move keys and copy new key into them
1400 	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
1401 	int32 size = node->AllKeyLength() - length;
1402 	if (size > 0)
1403 		memmove(keys + length, keys + length - keyLength, size);
1404 
1405 	memcpy(keys + keyStart, key, keyLength);
1406 }
1407 
1408 
1409 /*!	Splits the \a node into two halves - the other half will be put into
1410 	\a other. It also takes care to create a new overflow link if the node
1411 	to split is an index node.
1412 */
1413 status_t
1414 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
1415 	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
1416 	uint16* _keyLength, off_t* _value)
1417 {
1418 	if (*_keyIndex > node->NumKeys() + 1)
1419 		return B_BAD_VALUE;
1420 
1421 	uint16* inKeyLengths = node->KeyLengths();
1422 	off_t* inKeyValues = node->Values();
1423 	uint8* inKeys = node->Keys();
1424 	uint8* outKeys = other->Keys();
1425 	int32 keyIndex = *_keyIndex;	// can become less than zero!
1426 
1427 	if (keyIndex > node->NumKeys()) {
1428 		FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
1429 			"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
1430 		return B_BAD_VALUE;
1431 	}
1432 
1433 	// how many keys will fit in one (half) page?
1434 	// that loop will find the answer to this question and
1435 	// change the key lengths indices for their new home
1436 
1437 	// "bytes" is the number of bytes written for the new key,
1438 	// "bytesBefore" are the bytes before that key
1439 	// "bytesAfter" are the bytes after the new key, if any
1440 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1441 
1442 	size_t size = fNodeSize >> 1;
1443 	int32 out, in;
1444 	for (in = out = 0; in < node->NumKeys() + 1;) {
1445 		if (!bytes) {
1446 			bytesBefore = in > 0
1447 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1448 		}
1449 
1450 		if (in == keyIndex && !bytes) {
1451 			bytes = *_keyLength;
1452 		} else {
1453 			if (keyIndex < out) {
1454 				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1455 					- bytesBefore;
1456 			}
1457 
1458 			in++;
1459 		}
1460 		out++;
1461 
1462 		if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
1463 				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1464 			// we have found the number of keys in the new node!
1465 			break;
1466 		}
1467 	}
1468 
1469 	// if the new key was not inserted, set the length of the keys
1470 	// that can be copied directly
1471 	if (keyIndex >= out && in > 0)
1472 		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1473 
1474 	if (bytesBefore < 0 || bytesAfter < 0)
1475 		return B_BAD_DATA;
1476 
1477 	other->left_link = node->left_link;
1478 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1479 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1480 		+ bytesAfter);
1481 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1482 
1483 	uint16* outKeyLengths = other->KeyLengths();
1484 	off_t* outKeyValues = other->Values();
1485 	int32 keys = out > keyIndex ? keyIndex : out;
1486 
1487 	if (bytesBefore) {
1488 		// copy the keys
1489 		memcpy(outKeys, inKeys, bytesBefore);
1490 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1491 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1492 	}
1493 	if (bytes) {
1494 		// copy the newly inserted key
1495 		memcpy(outKeys + bytesBefore, key, bytes);
1496 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1497 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1498 
1499 		if (bytesAfter) {
1500 			// copy the keys after the new key
1501 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1502 				bytesAfter);
1503 			keys = out - keyIndex - 1;
1504 			for (int32 i = 0;i < keys;i++) {
1505 				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1506 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
1507 						+ bytes);
1508 			}
1509 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1510 				keys * sizeof(off_t));
1511 		}
1512 	}
1513 
1514 	// if the new key was already inserted, we shouldn't use it again
1515 	if (in != out)
1516 		keyIndex--;
1517 
1518 	int32 total = bytesBefore + bytesAfter;
1519 
1520 	// these variables are for the key that will be returned
1521 	// to the parent node
1522 	uint8* newKey = NULL;
1523 	uint16 newLength;
1524 	bool newAllocated = false;
1525 
1526 	// If we have split an index node, we have to drop the first key
1527 	// of the next node (which can also be the new key to insert).
1528 	// The dropped key is also the one which has to be inserted in
1529 	// the parent node, so we will set the "newKey" already here.
1530 	if (node->OverflowLink() != BPLUSTREE_NULL) {
1531 		if (in == keyIndex) {
1532 			newKey = key;
1533 			newLength = *_keyLength;
1534 
1535 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1536 			keyIndex--;
1537 		} else {
1538 			// If a key is dropped (is not the new key), we have to copy
1539 			// it, because it would be lost if not.
1540 			uint8* droppedKey = node->KeyAt(in, &newLength);
1541 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1542 					> (uint8*)node + fNodeSize
1543 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1544 				fStream->GetVolume()->Panic();
1545 				RETURN_ERROR(B_BAD_DATA);
1546 			}
1547 			newKey = (uint8*)malloc(newLength);
1548 			if (newKey == NULL)
1549 				return B_NO_MEMORY;
1550 
1551 			newAllocated = true;
1552 			memcpy(newKey, droppedKey, newLength);
1553 
1554 			other->overflow_link = inKeyValues[in];
1555 			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1556 		}
1557 	}
1558 
1559 	// and now the same game for the other page and the rest of the keys
1560 	// (but with memmove() instead of memcpy(), because they may overlap)
1561 
1562 	bytesBefore = bytesAfter = bytes = 0;
1563 	out = 0;
1564 	int32 skip = in;
1565 	while (in < node->NumKeys() + 1) {
1566 		if (in == keyIndex && !bytes) {
1567 			// it's enough to set bytesBefore once here, because we do
1568 			// not need to know the exact length of all keys in this
1569 			// loop
1570 			bytesBefore = in > skip
1571 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1572 			bytes = *_keyLength;
1573 			out++;
1574 		} else {
1575 			if (in < node->NumKeys()) {
1576 				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1577 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1578 
1579 				if (bytes) {
1580 					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1581 						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1582 
1583 					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1584 						- bytesBefore - bytes;
1585 				}
1586 				out++;
1587 			}
1588 			in++;
1589 		}
1590 	}
1591 
1592 	// adjust the byte counts (since we were a bit lazy in the loop)
1593 	if (keyIndex < skip)
1594 		bytesBefore = node->AllKeyLength() - total;
1595 
1596 	if (bytesBefore < 0 || bytesAfter < 0) {
1597 		if (newAllocated)
1598 			free(newKey);
1599 		return B_BAD_DATA;
1600 	}
1601 
1602 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1603 		// right link, and overflow link can stay the same
1604 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1605 		+ bytesAfter);
1606 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1607 
1608 	// array positions have changed
1609 	outKeyLengths = node->KeyLengths();
1610 	outKeyValues = node->Values();
1611 
1612 	// move the keys in the old node: the order is important here,
1613 	// because we don't want to overwrite any contents
1614 
1615 	keys = keyIndex <= skip ? out : keyIndex - skip;
1616 	keyIndex -= skip;
1617 	in = out - keyIndex - 1;
1618 		// Note: keyIndex and in will contain invalid values when the new key
1619 		// went to the other node. But in this case bytes and bytesAfter are
1620 		// 0 and subsequently we never use keyIndex and in.
1621 
1622 	if (bytesBefore)
1623 		memmove(inKeys, inKeys + total, bytesBefore);
1624 	if (bytesAfter) {
1625 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1626 			bytesAfter);
1627 	}
1628 
1629 	if (bytesBefore)
1630 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1631 	if (bytesAfter) {
1632 		// if byteAfter is > 0, keyIndex is larger than skip
1633 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1634 			in * sizeof(uint16));
1635 	}
1636 
1637 	if (bytesBefore)
1638 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1639 	if (bytesAfter) {
1640 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1641 			in * sizeof(off_t));
1642 	}
1643 
1644 	if (bytes) {
1645 		// finally, copy the newly inserted key (don't overwrite anything)
1646 		memcpy(inKeys + bytesBefore, key, bytes);
1647 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1648 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1649 	}
1650 
1651 	// Prepare the key that will be inserted in the parent node which
1652 	// is either the dropped key or the last of the other node.
1653 	// If it's the dropped key, "newKey" was already set earlier.
1654 
1655 	if (newKey == NULL)
1656 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1657 
1658 	memcpy(key, newKey, newLength);
1659 	*_keyLength = newLength;
1660 	*_value = otherOffset;
1661 
1662 	if (newAllocated)
1663 		free(newKey);
1664 
1665 	return B_OK;
1666 }
1667 
1668 
1669 /*!	This inserts a key into the tree. The changes made to the tree will
1670 	all be part of the \a transaction.
1671 	You need to have the inode write locked.
1672 */
1673 status_t
1674 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
1675 	off_t value)
1676 {
1677 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1678 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1679 		RETURN_ERROR(B_BAD_VALUE);
1680 #ifdef DEBUG
1681 	if (value < 0)
1682 		panic("tried to insert invalid value %Ld!\n", value);
1683 #endif
1684 
1685 	ASSERT_WRITE_LOCKED_INODE(fStream);
1686 
1687 	Stack<node_and_key> stack;
1688 	if (_SeekDown(stack, key, keyLength) != B_OK)
1689 		RETURN_ERROR(B_ERROR);
1690 
1691 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
1692 
1693 	memcpy(keyBuffer, key, keyLength);
1694 	keyBuffer[keyLength] = 0;
1695 
1696 	node_and_key nodeAndKey;
1697 	const bplustree_node* node;
1698 
1699 	CachedNode cached(this);
1700 	while (stack.Pop(&nodeAndKey)
1701 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1702 #ifdef DEBUG
1703 		NodeChecker checker(node, fNodeSize, "insert");
1704 #endif
1705 		if (node->IsLeaf()) {
1706 			// first round, check for duplicate entries
1707 			status_t status = _FindKey(node, key, keyLength,
1708 				&nodeAndKey.keyIndex);
1709 
1710 			// is this a duplicate entry?
1711 			if (status == B_OK) {
1712 				if (fAllowDuplicates) {
1713 					status = _InsertDuplicate(transaction, cached, node,
1714 						nodeAndKey.keyIndex, value);
1715 					if (status != B_OK)
1716 						RETURN_ERROR(status);
1717 					return B_OK;
1718 				}
1719 
1720 				return B_NAME_IN_USE;
1721 			}
1722 		}
1723 
1724 		bplustree_node* writableNode = cached.MakeWritable(transaction);
1725 		if (writableNode == NULL)
1726 			return B_IO_ERROR;
1727 
1728 		// is the node big enough to hold the pair?
1729 		if (int32(key_align(sizeof(bplustree_node)
1730 				+ writableNode->AllKeyLength() + keyLength)
1731 				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
1732 				+ sizeof(off_t))) < fNodeSize) {
1733 			_InsertKey(writableNode, nodeAndKey.keyIndex,
1734 				keyBuffer, keyLength, value);
1735 			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1736 				nodeAndKey.keyIndex, 0, 1);
1737 
1738 			return B_OK;
1739 		} else {
1740 			CachedNode cachedNewRoot(this);
1741 			CachedNode cachedOther(this);
1742 
1743 			// do we need to allocate a new root node? if so, then do
1744 			// it now
1745 			off_t newRoot = BPLUSTREE_NULL;
1746 			if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
1747 				bplustree_node* root;
1748 				status_t status = cachedNewRoot.Allocate(transaction, &root,
1749 					&newRoot);
1750 				if (status != B_OK) {
1751 					// The tree is most likely corrupted!
1752 					// But it's still sane at leaf level - we could set
1753 					// a flag in the header that forces the tree to be
1754 					// rebuild next time...
1755 					// But since we will have journaling, that's not a big
1756 					// problem anyway.
1757 					RETURN_ERROR(status);
1758 				}
1759 			}
1760 
1761 			// reserve space for the other node
1762 			bplustree_node* other;
1763 			off_t otherOffset;
1764 			status_t status = cachedOther.Allocate(transaction, &other,
1765 				&otherOffset);
1766 			if (status != B_OK) {
1767 				cachedNewRoot.Free(transaction, newRoot);
1768 				RETURN_ERROR(status);
1769 			}
1770 
1771 			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1772 					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1773 					&value) != B_OK) {
1774 				// free root node & other node here
1775 				cachedOther.Free(transaction, otherOffset);
1776 				cachedNewRoot.Free(transaction, newRoot);
1777 
1778 				RETURN_ERROR(B_ERROR);
1779 			}
1780 #ifdef DEBUG
1781 			checker.Check("insert split");
1782 			NodeChecker otherChecker(other, fNodeSize, "insert split other");
1783 #endif
1784 
1785 			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1786 				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1787 
1788 			// update the right link of the node in the left of the new node
1789 			if ((other = cachedOther.SetToWritable(transaction,
1790 					other->LeftLink())) != NULL) {
1791 				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1792 			}
1793 
1794 			// create a new root if necessary
1795 			if (newRoot != BPLUSTREE_NULL) {
1796 				bplustree_node* root = cachedNewRoot.Node();
1797 
1798 				_InsertKey(root, 0, keyBuffer, keyLength,
1799 					writableNode->LeftLink());
1800 				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1801 					nodeAndKey.nodeOffset);
1802 
1803 				CachedNode cached(this);
1804 				bplustree_header* header
1805 					= cached.SetToWritableHeader(transaction);
1806 				if (header == NULL)
1807 					return B_IO_ERROR;
1808 
1809 				// finally, update header to point to the new root
1810 				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1811 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1812 					header->MaxNumberOfLevels() + 1);
1813 
1814 				return B_OK;
1815 			}
1816 		}
1817 	}
1818 	RETURN_ERROR(B_ERROR);
1819 }
1820 
1821 
1822 /*!	Removes the duplicate index/value pair from the tree.
1823 	It's part of the private tree interface.
1824 */
1825 status_t
1826 BPlusTree::_RemoveDuplicate(Transaction& transaction,
1827 	const bplustree_node* node, CachedNode& cached, uint16 index,
1828 	off_t value)
1829 {
1830 	off_t* values = node->Values();
1831 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1832 
1833 	CachedNode cachedDuplicate(this);
1834 	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1835 	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
1836 		duplicateOffset, false);
1837 	if (duplicate == NULL)
1838 		return B_IO_ERROR;
1839 
1840 	// if it's a duplicate fragment, remove the entry from there
1841 	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1842 		duplicate_array* array = duplicate->FragmentAt(
1843 			bplustree_node::FragmentIndex(oldValue));
1844 		int32 arrayCount = array->Count();
1845 
1846 		if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
1847 			FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
1848 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1849 				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
1850 				arrayCount, fStream->ID()));
1851 			return B_BAD_DATA;
1852 		}
1853 		if (!array->Remove(value)) {
1854 			FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
1855 				"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
1856 				duplicateOffset, fStream->ID()));
1857 			return B_ENTRY_NOT_FOUND;
1858 		}
1859 
1860 		// remove the array from the fragment node if it is empty
1861 		if (--arrayCount == 1) {
1862 			// set the link to the remaining value
1863 			if (cached.MakeWritable(transaction) == NULL)
1864 				return B_IO_ERROR;
1865 
1866 			values[index] = array->values[0];
1867 
1868 			// Remove the whole fragment node, if this was the only array,
1869 			// otherwise free just the array
1870 			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1871 				status_t status = cachedDuplicate.Free(transaction,
1872 					duplicateOffset);
1873 				if (status != B_OK)
1874 					return status;
1875 			} else
1876 				array->count = 0;
1877 		}
1878 		return B_OK;
1879 	}
1880 
1881 	// Remove value from a duplicate node!
1882 
1883 	duplicate_array* array = NULL;
1884 	int32 arrayCount = 0;
1885 
1886 	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1887 		FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
1888 			", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
1889 		return B_BAD_DATA;
1890 	}
1891 
1892 	// Search the duplicate nodes until the entry could be found (and removed)
1893 	while (duplicate != NULL) {
1894 		array = duplicate->DuplicateArray();
1895 		arrayCount = array->Count();
1896 
1897 		if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1898 			FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
1899 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1900 				duplicateOffset, arrayCount, fStream->ID()));
1901 			return B_BAD_DATA;
1902 		}
1903 
1904 		if (array->Remove(value)) {
1905 			arrayCount--;
1906 			break;
1907 		}
1908 
1909 		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1910 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1911 
1912 		cachedDuplicate.UnsetUnchanged(transaction);
1913 		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
1914 			false);
1915 	}
1916 	if (duplicate == NULL)
1917 		RETURN_ERROR(B_IO_ERROR);
1918 
1919 	// The entry got removed from the duplicate node, but we might want to free
1920 	// it now in case it's empty
1921 
1922 	while (true) {
1923 		off_t left = duplicate->LeftLink();
1924 		off_t right = duplicate->RightLink();
1925 		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1926 
1927 		if ((isLast && arrayCount == 1) || arrayCount == 0) {
1928 			// Free empty duplicate page, link their siblings together, and
1929 			// update the duplicate link if needed (ie. when we either remove
1930 			// the last duplicate node or have a new first one)
1931 
1932 			if (left == BPLUSTREE_NULL) {
1933 				// the duplicate link points to us
1934 				if (cached.MakeWritable(transaction) == NULL)
1935 					return B_IO_ERROR;
1936 
1937 				if (arrayCount == 1) {
1938 					// This is the last node, and there is only one value left;
1939 					// replace the duplicate link with that value, it's no
1940 					// duplicate anymore
1941 					values[index] = array->values[0];
1942 				} else {
1943 					// Move the duplicate link to the next node
1944 					values[index] = HOST_ENDIAN_TO_BFS_INT64(
1945 						bplustree_node::MakeLink(
1946 							BPLUSTREE_DUPLICATE_NODE, right));
1947 				}
1948 			}
1949 
1950 			status_t status = cachedDuplicate.Free(transaction,
1951 				duplicateOffset);
1952 			if (status != B_OK)
1953 				return status;
1954 
1955 			if (left != BPLUSTREE_NULL
1956 				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
1957 						false)) != NULL) {
1958 				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1959 
1960 				// If the next node is the last node, we need to free that node
1961 				// and convert the duplicate entry back into a normal entry
1962 				array = duplicate->DuplicateArray();
1963 				arrayCount = array->Count();
1964 				if (right == BPLUSTREE_NULL
1965 					&& duplicate->LeftLink() == BPLUSTREE_NULL
1966 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1967 					duplicateOffset = left;
1968 					continue;
1969 				}
1970 			}
1971 			if (right != BPLUSTREE_NULL
1972 				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
1973 						right, false)) != NULL) {
1974 				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1975 
1976 				// Again, we may need to turn the duplicate entry back into a
1977 				// normal entry
1978 				array = duplicate->DuplicateArray();
1979 				arrayCount = array->Count();
1980 				if (left == BPLUSTREE_NULL
1981 					&& duplicate->RightLink() == BPLUSTREE_NULL
1982 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1983 					duplicateOffset = right;
1984 					continue;
1985 				}
1986 			}
1987 			return B_OK;
1988 		}
1989 		if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
1990 			// If the number of entries fits in a duplicate fragment, then
1991 			// either find a free fragment node, or convert this node to a
1992 			// fragment node.
1993 			CachedNode cachedOther(this);
1994 
1995 			bplustree_node* fragment = NULL;
1996 			uint32 fragmentIndex = 0;
1997 			off_t offset;
1998 			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
1999 					&offset, &fragment, &fragmentIndex) == B_OK) {
2000 				// move to other node
2001 				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
2002 				memcpy(target, array,
2003 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2004 
2005 				cachedDuplicate.Free(transaction, duplicateOffset);
2006 				duplicateOffset = offset;
2007 			} else {
2008 				// convert node
2009 				memmove(duplicate, array,
2010 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2011 				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
2012 					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2013 			}
2014 
2015 			if (cached.MakeWritable(transaction) == NULL)
2016 				return B_IO_ERROR;
2017 
2018 			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
2019 				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
2020 		}
2021 		return B_OK;
2022 	}
2023 }
2024 
2025 
2026 /*!	Removes the key with the given index from the specified node.
2027 	Since it has to get the key from the node anyway (to obtain it's
2028 	pointer), it's not needed to pass the key & its length, although
2029 	the calling method (BPlusTree::Remove()) have this data.
2030 */
2031 void
2032 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
2033 {
2034 	// should never happen, but who knows?
2035 	if (index > node->NumKeys() && node->NumKeys() > 0) {
2036 		FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF
2037 			"\n", index, fStream->ID()));
2038 		return;
2039 	}
2040 
2041 	off_t* values = node->Values();
2042 
2043 	// if we would have to drop the overflow link, drop
2044 	// the last key instead and update the overflow link
2045 	// to the value of that one
2046 	if (!node->IsLeaf() && index == node->NumKeys())
2047 		node->overflow_link = values[--index];
2048 
2049 	uint16 length;
2050 	uint8* key = node->KeyAt(index, &length);
2051 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
2052 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2053 		FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length,
2054 			fStream->ID()));
2055 		fStream->GetVolume()->Panic();
2056 		return;
2057 	}
2058 
2059 	uint16* keyLengths = node->KeyLengths();
2060 	uint8* keys = node->Keys();
2061 
2062 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
2063 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
2064 		node->AllKeyLength() - length);
2065 
2066 	off_t* newValues = node->Values();
2067 	uint16* newKeyLengths = node->KeyLengths();
2068 
2069 	// move key data
2070 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
2071 
2072 	// move and update key lengths
2073 	if (index > 0 && newKeyLengths != keyLengths)
2074 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
2075 	for (uint16 i = index; i < node->NumKeys(); i++) {
2076 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
2077 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
2078 	}
2079 
2080 	// move values
2081 	if (index > 0)
2082 		memmove(newValues, values, index * sizeof(off_t));
2083 	if (node->NumKeys() > index) {
2084 		memmove(newValues + index, values + index + 1,
2085 			(node->NumKeys() - index) * sizeof(off_t));
2086 	}
2087 }
2088 
2089 
2090 /*!	Removes the specified key from the tree. The "value" parameter is only used
2091 	for trees which allow duplicates, so you may safely ignore it.
2092 	It's not an optional parameter, so at least you have to think about it.
2093 	You need to have the inode write locked.
2094 */
2095 status_t
2096 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
2097 	off_t value)
2098 {
2099 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2100 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2101 		RETURN_ERROR(B_BAD_VALUE);
2102 
2103 	ASSERT_WRITE_LOCKED_INODE(fStream);
2104 
2105 	Stack<node_and_key> stack;
2106 	if (_SeekDown(stack, key, keyLength) != B_OK)
2107 		RETURN_ERROR(B_ERROR);
2108 
2109 	node_and_key nodeAndKey;
2110 	const bplustree_node* node;
2111 
2112 	CachedNode cached(this);
2113 	while (stack.Pop(&nodeAndKey)
2114 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
2115 #ifdef DEBUG
2116 		NodeChecker checker(node, fNodeSize, "remove");
2117 #endif
2118 		if (node->IsLeaf()) {
2119 			// first round, check for duplicate entries
2120 			status_t status = _FindKey(node, key, keyLength,
2121 				&nodeAndKey.keyIndex);
2122 			if (status != B_OK)
2123 				RETURN_ERROR(status);
2124 
2125 			// Is this a duplicate entry?
2126 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
2127 					node->Values()[nodeAndKey.keyIndex]))) {
2128 				if (fAllowDuplicates) {
2129 					return _RemoveDuplicate(transaction, node, cached,
2130 						nodeAndKey.keyIndex, value);
2131 				}
2132 
2133 				FATAL(("dupliate node found where no duplicates are "
2134 					"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
2135 				RETURN_ERROR(B_ERROR);
2136 			} else {
2137 				if (node->Values()[nodeAndKey.keyIndex] != value)
2138 					return B_ENTRY_NOT_FOUND;
2139 
2140 				// If we will remove the last key, the iterator will be set
2141 				// to the next node after the current - if there aren't any
2142 				// more nodes, we need a way to prevent the TreeIterators to
2143 				// touch the old node again, we use BPLUSTREE_FREE for this
2144 				off_t next = node->RightLink() == BPLUSTREE_NULL
2145 					? BPLUSTREE_FREE : node->RightLink();
2146 				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
2147 					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
2148 			}
2149 		}
2150 
2151 		bplustree_node* writableNode = cached.MakeWritable(transaction);
2152 		if (writableNode == NULL)
2153 			return B_IO_ERROR;
2154 
2155 		// if it's an empty root node, we have to convert it
2156 		// to a leaf node by dropping the overflow link, or,
2157 		// if it's already a leaf node, just empty it
2158 		if (nodeAndKey.nodeOffset == fHeader.RootNode()
2159 			&& (node->NumKeys() == 0
2160 				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
2161 			writableNode->overflow_link
2162 				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2163 			writableNode->all_key_count = 0;
2164 			writableNode->all_key_length = 0;
2165 
2166 			// if we've made a leaf node out of the root node, we need
2167 			// to reset the maximum number of levels in the header
2168 			if (fHeader.MaxNumberOfLevels() != 1) {
2169 				CachedNode cached(this);
2170 				bplustree_header* header
2171 					= cached.SetToWritableHeader(transaction);
2172 				if (header == NULL)
2173 					return B_IO_ERROR;
2174 
2175 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
2176 			}
2177 			return B_OK;
2178 		}
2179 
2180 		// if there is only one key left, we don't have to remove
2181 		// it, we can just dump the node (index nodes still have
2182 		// the overflow link, so we have to drop the last key)
2183 		if (writableNode->NumKeys() > 1
2184 			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
2185 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
2186 			return B_OK;
2187 		}
2188 
2189 		// when we are here, we can just free the node, but
2190 		// we have to update the right/left link of the
2191 		// siblings first
2192 		CachedNode otherCached(this);
2193 		bplustree_node* other = otherCached.SetToWritable(transaction,
2194 			writableNode->LeftLink());
2195 		if (other != NULL)
2196 			other->right_link = writableNode->right_link;
2197 
2198 		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
2199 				!= NULL) {
2200 			other->left_link = writableNode->left_link;
2201 		}
2202 
2203 		cached.Free(transaction, nodeAndKey.nodeOffset);
2204 	}
2205 	RETURN_ERROR(B_ERROR);
2206 }
2207 
2208 
2209 /*!	Replaces the value for the key in the tree.
2210 	Returns B_OK if the key could be found and its value replaced,
2211 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
2212 	to indicate that something went terribly wrong.
2213 	Note that this doesn't work with duplicates - it will just
2214 	return B_BAD_TYPE if you call this function on a tree where
2215 	duplicates are allowed.
2216 	You need to have the inode write locked.
2217 */
2218 status_t
2219 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
2220 	off_t value)
2221 {
2222 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2223 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2224 		|| key == NULL)
2225 		RETURN_ERROR(B_BAD_VALUE);
2226 
2227 	if (fAllowDuplicates)
2228 		RETURN_ERROR(B_BAD_TYPE);
2229 
2230 	ASSERT_WRITE_LOCKED_INODE(fStream);
2231 
2232 	off_t nodeOffset = fHeader.RootNode();
2233 	CachedNode cached(this);
2234 	const bplustree_node* node;
2235 
2236 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2237 		uint16 keyIndex = 0;
2238 		off_t nextOffset;
2239 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2240 			&nextOffset);
2241 
2242 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2243 			if (status == B_OK) {
2244 				bplustree_node* writableNode = cached.MakeWritable(transaction);
2245 				if (writableNode != NULL) {
2246 					writableNode->Values()[keyIndex]
2247 						= HOST_ENDIAN_TO_BFS_INT64(value);
2248 				} else
2249 					status = B_IO_ERROR;
2250 			}
2251 
2252 			return status;
2253 		} else if (nextOffset == nodeOffset)
2254 			RETURN_ERROR(B_ERROR);
2255 
2256 		nodeOffset = nextOffset;
2257 	}
2258 	RETURN_ERROR(B_ERROR);
2259 }
2260 #endif // !_BOOT_MODE
2261 
2262 
2263 /*!	Searches the key in the tree, and stores the offset found in _value,
2264 	if successful.
2265 	It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
2266 	while it descends the tree.
2267 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
2268 	It can also return other errors to indicate that something went wrong.
2269 	Note that this doesn't work with duplicates - it will just return
2270 	B_BAD_TYPE if you call this function on a tree where duplicates are
2271 	allowed.
2272 	You need to have the inode read or write locked.
2273 */
2274 status_t
2275 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
2276 {
2277 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2278 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2279 		|| key == NULL)
2280 		RETURN_ERROR(B_BAD_VALUE);
2281 
2282 	if (fAllowDuplicates)
2283 		RETURN_ERROR(B_BAD_TYPE);
2284 
2285 #if !_BOOT_MODE
2286 	ASSERT_READ_LOCKED_INODE(fStream);
2287 #endif
2288 
2289 	off_t nodeOffset = fHeader.RootNode();
2290 	CachedNode cached(this);
2291 	const bplustree_node* node;
2292 
2293 #ifdef DEBUG
2294 	int32 levels = 0;
2295 #endif
2296 
2297 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2298 		uint16 keyIndex = 0;
2299 		off_t nextOffset;
2300 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2301 			&nextOffset);
2302 
2303 #ifdef DEBUG
2304 		levels++;
2305 #endif
2306 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2307 			if (status == B_OK && _value != NULL)
2308 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
2309 
2310 #ifdef DEBUG
2311 			if (levels != (int32)fHeader.MaxNumberOfLevels())
2312 				DEBUGGER(("levels don't match"));
2313 #endif
2314 			return status;
2315 		} else if (nextOffset == nodeOffset)
2316 			RETURN_ERROR(B_ERROR);
2317 
2318 		nodeOffset = nextOffset;
2319 	}
2320 	FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
2321 		B_PRIdOFF "\n", nodeOffset, fStream->ID()));
2322 	RETURN_ERROR(B_ERROR);
2323 }
2324 
2325 
2326 #if !_BOOT_MODE
2327 status_t
2328 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
2329 	const uint8* largestKey, uint16 largestKeyLength,
2330 	const bplustree_node* parent)
2331 {
2332 	if (parent->CheckIntegrity(fNodeSize) != B_OK) {
2333 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
2334 			"failed!\n", fStream->ID(), offset);
2335 		check.FoundError();
2336 		return B_OK;
2337 	}
2338 	if (level >= fHeader.MaxNumberOfLevels()) {
2339 		dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
2340 			"!\n", fStream->ID(), offset);
2341 		check.FoundError();
2342 		return B_OK;
2343 	}
2344 
2345 	check.SetLevel(level);
2346 
2347 	if (check.Visited(offset)) {
2348 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
2349 			fStream->ID(), offset);
2350 		check.FoundError();
2351 		return B_OK;
2352 	}
2353 
2354 	check.SetVisited(offset);
2355 
2356 	uint32 count = parent->NumKeys();
2357 	off_t* values = parent->Values();
2358 	off_t lastOffset = check.PreviousOffset(level);
2359 	CachedNode cached(this);
2360 
2361 	for (uint32 i = 0; i < count; i++) {
2362 		uint16 keyLength;
2363 		uint8* key = parent->KeyAt(i, &keyLength);
2364 		if (largestKey != NULL) {
2365 			int result = compareKeys(fHeader.DataType(), key, keyLength,
2366 				largestKey, largestKeyLength);
2367 			if (result >= 0) {
2368 				dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
2369 					B_PRIu32 " larger than it should!\n",
2370 					fStream->ID(), offset, i);
2371 				check.FoundError();
2372 			}
2373 		}
2374 
2375 		off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
2376 		if (bplustree_node::IsDuplicate(childOffset)) {
2377 			// Walk the duplicate nodes
2378 			off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
2379 			off_t lastDuplicateOffset = BPLUSTREE_NULL;
2380 
2381 			while (duplicateOffset != BPLUSTREE_NULL) {
2382 				const bplustree_node* node;
2383 				status_t status = cached.SetTo(duplicateOffset, &node, false);
2384 				if (status != B_OK) {
2385 					if (status == B_IO_ERROR)
2386 						return B_IO_ERROR;
2387 
2388 					dprintf("inode %" B_PRIdOFF ": duplicate node at %"
2389 						B_PRIdOFF " could not be read: %s\n", fStream->ID(),
2390 						duplicateOffset, strerror(status));
2391 					check.FoundError();
2392 					break;
2393 				}
2394 
2395 				bool isFragmentNode = bplustree_node::LinkType(childOffset)
2396 					== BPLUSTREE_DUPLICATE_FRAGMENT;
2397 				bool isKnownFragment = isFragmentNode
2398 					&& check.VisitedFragment(duplicateOffset);
2399 
2400 				if (!isKnownFragment && check.Visited(duplicateOffset)) {
2401 					dprintf("inode %" B_PRIdOFF ": %s node at %"
2402 						B_PRIdOFF " already visited, referenced from %"
2403 						B_PRIdOFF "!\n", fStream->ID(),
2404 						isFragmentNode ? "fragment" : "duplicate",
2405 						duplicateOffset, offset);
2406 					check.FoundError();
2407 					break;
2408 				}
2409 
2410 				// Fragment nodes may be visited more than once from different
2411 				// places
2412 				if (!check.Visited(duplicateOffset))
2413 					check.SetVisited(duplicateOffset);
2414 				if (!isKnownFragment && isFragmentNode)
2415 					check.SetVisitedFragment(duplicateOffset);
2416 
2417 				duplicate_array* array;
2418 				int32 minSize;
2419 				int32 maxSize;
2420 				if (isFragmentNode) {
2421 					array = node->FragmentAt(
2422 						bplustree_node::FragmentIndex(childOffset));
2423 					minSize = 2;
2424 					maxSize = NUM_FRAGMENT_VALUES;
2425 				} else {
2426 					array = node->DuplicateArray();
2427 					minSize = 1;
2428 					maxSize = NUM_DUPLICATE_VALUES;
2429 				}
2430 				int32 arrayCount = array->Count();
2431 
2432 				if (arrayCount < minSize || arrayCount > maxSize) {
2433 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2434 						" has invalid array size %" B_PRId32 "!\n",
2435 						fStream->ID(), duplicateOffset, arrayCount);
2436 					check.FoundError();
2437 				} else {
2438 					// Simple check if the values in the array may be valid
2439 					for (int32 j = 0; j < arrayCount; j++) {
2440 						if (!fStream->GetVolume()->IsValidInodeBlock(
2441 								array->ValueAt(j))) {
2442 							dprintf("inode %" B_PRIdOFF ": duplicate at %"
2443 								B_PRIdOFF " contains invalid block %" B_PRIdOFF
2444 								" at %" B_PRId32 "!\n", fStream->ID(),
2445 								duplicateOffset, array->ValueAt(j), j);
2446 							check.FoundError();
2447 							break;
2448 						}
2449 					}
2450 				}
2451 
2452 				// A fragment node is not linked (and does not have valid links)
2453 				if (isFragmentNode)
2454 					break;
2455 
2456 				if (node->LeftLink() != lastDuplicateOffset) {
2457 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2458 						" has wrong left link %" B_PRIdOFF ", expected %"
2459 						B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
2460 						node->LeftLink(), lastDuplicateOffset);
2461 					check.FoundError();
2462 				}
2463 
2464 				lastDuplicateOffset = duplicateOffset;
2465 				duplicateOffset = node->RightLink();
2466 			}
2467 		} else if (!parent->IsLeaf()) {
2468 			// Test a regular child node recursively
2469 			off_t nextOffset = parent->OverflowLink();
2470 			if (i < count - 1)
2471 				nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
2472 
2473 			if (i == 0 && lastOffset != BPLUSTREE_NULL) {
2474 				// Test right link of the previous node
2475 				const bplustree_node* previous = cached.SetTo(lastOffset, true);
2476 				if (previous == NULL)
2477 					return B_IO_ERROR;
2478 
2479 				if (previous->RightLink() != childOffset) {
2480 					dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2481 						"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2482 						"!\n", fStream->ID(), lastOffset, previous->RightLink(),
2483 						childOffset);
2484 					check.FoundError();
2485 				}
2486 			}
2487 
2488 			status_t status = _ValidateChild(check, cached, level, childOffset,
2489 				lastOffset, nextOffset, key, keyLength);
2490 			if (status != B_OK)
2491 				return status;
2492 		} else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
2493 			dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
2494 				"invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
2495 				fStream->ID(), offset, childOffset, i);
2496 			check.FoundError();
2497 		}
2498 
2499 		lastOffset = childOffset;
2500 	}
2501 
2502 	if (parent->OverflowLink() != BPLUSTREE_NULL) {
2503 		off_t childOffset = parent->OverflowLink();
2504 		status_t status = _ValidateChild(check, cached, level, childOffset,
2505 			lastOffset, 0, NULL, 0);
2506 		if (status != B_OK)
2507 			return status;
2508 
2509 		lastOffset = childOffset;
2510 	}
2511 
2512 	check.SetPreviousOffset(level, lastOffset);
2513 	return B_OK;
2514 }
2515 
2516 
2517 status_t
2518 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
2519 	off_t offset, off_t lastOffset, off_t nextOffset,
2520 	const uint8* key, uint16 keyLength)
2521 {
2522 	const bplustree_node* node;
2523 	status_t status = cached.SetTo(offset, &node, true);
2524 	if (status != B_OK) {
2525 		if (status == B_IO_ERROR)
2526 			return B_IO_ERROR;
2527 
2528 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
2529 			"read: %s\n", fStream->ID(), offset, strerror(status));
2530 		check.FoundError();
2531 		return B_OK;
2532 	}
2533 
2534 	if (node->LeftLink() != lastOffset) {
2535 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2536 			"wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
2537 			"!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
2538 		check.FoundError();
2539 	}
2540 
2541 	if (nextOffset != 0 && node->RightLink() != nextOffset) {
2542 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2543 			"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2544 			"!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
2545 		check.FoundError();
2546 	}
2547 
2548 	return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
2549 }
2550 #endif // !_BOOT_MODE
2551 
2552 
2553 //	#pragma mark -
2554 
2555 
2556 TreeIterator::TreeIterator(BPlusTree* tree)
2557 	:
2558 	fTree(tree),
2559 	fCurrentNodeOffset(BPLUSTREE_NULL)
2560 {
2561 #if !_BOOT_MODE
2562 	tree->_AddIterator(this);
2563 #endif
2564 }
2565 
2566 
2567 TreeIterator::~TreeIterator()
2568 {
2569 #if !_BOOT_MODE
2570 	if (fTree)
2571 		fTree->_RemoveIterator(this);
2572 #endif
2573 }
2574 
2575 
2576 status_t
2577 TreeIterator::Goto(int8 to)
2578 {
2579 	if (fTree == NULL || fTree->fStream == NULL)
2580 		RETURN_ERROR(B_BAD_VALUE);
2581 
2582 #if !_BOOT_MODE
2583 	// lock access to stream
2584 	InodeReadLocker locker(fTree->fStream);
2585 #endif
2586 
2587 	off_t nodeOffset = fTree->fHeader.RootNode();
2588 	CachedNode cached(fTree);
2589 	const bplustree_node* node;
2590 
2591 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2592 		// is the node a leaf node?
2593 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2594 			fCurrentNodeOffset = nodeOffset;
2595 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
2596 			fDuplicateNode = BPLUSTREE_NULL;
2597 
2598 			return B_OK;
2599 		}
2600 
2601 		// get the next node offset depending on the direction (and if there
2602 		// are any keys in that node at all)
2603 		off_t nextOffset;
2604 		if (to == BPLUSTREE_END || node->all_key_count == 0)
2605 			nextOffset = node->OverflowLink();
2606 		else {
2607 			if (node->AllKeyLength() > fTree->fNodeSize
2608 				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
2609 					- 8 * node->NumKeys())
2610 				RETURN_ERROR(B_ERROR);
2611 
2612 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
2613 		}
2614 		if (nextOffset == nodeOffset)
2615 			break;
2616 
2617 		nodeOffset = nextOffset;
2618 	}
2619 	FATAL(("%s fails\n", __FUNCTION__));
2620 
2621 	RETURN_ERROR(B_ERROR);
2622 }
2623 
2624 
2625 /*!	Iterates through the tree in the specified direction.
2626 	When it iterates through duplicates, the "key" is only updated for the
2627 	first entry - if you need to know when this happens, use the "duplicate"
2628 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
2629 	the other duplicates.
2630 	That's not too nice, but saves the 256 bytes that would be needed to
2631 	store the last key - if this will ever become an issue, it will be
2632 	easy to change.
2633 	The other advantage of this is, that the queries can skip all duplicates
2634 	at once when they are not relevant to them.
2635 */
2636 status_t
2637 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
2638 	uint16 maxLength, off_t* value, uint16* duplicate)
2639 {
2640 	if (fTree == NULL)
2641 		return B_INTERRUPTED;
2642 
2643 	bool forward = direction == BPLUSTREE_FORWARD;
2644 
2645 	if (fCurrentNodeOffset == BPLUSTREE_NULL
2646 		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
2647 		RETURN_ERROR(B_ERROR);
2648 
2649 	// if the tree was emptied since the last call
2650 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
2651 		return B_ENTRY_NOT_FOUND;
2652 
2653 #if !_BOOT_MODE
2654 	// lock access to stream
2655 	InodeReadLocker locker(fTree->fStream);
2656 #endif
2657 
2658 	CachedNode cached(fTree);
2659 	const bplustree_node* node;
2660 
2661 	if (fDuplicateNode != BPLUSTREE_NULL) {
2662 		// regardless of traverse direction the duplicates are always presented
2663 		// in the same order; since they are all considered as equal, this
2664 		// shouldn't cause any problems
2665 
2666 		if (!fIsFragment || fDuplicate < fNumDuplicates) {
2667 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2668 				false);
2669 		} else
2670 			node = NULL;
2671 
2672 		if (node != NULL) {
2673 			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
2674 				// If the node is out of duplicates, we go directly to the next
2675 				// one
2676 				fDuplicateNode = node->RightLink();
2677 				if (fDuplicateNode != BPLUSTREE_NULL
2678 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
2679 					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
2680 						false);
2681 					fDuplicate = 0;
2682 				}
2683 			}
2684 			if (fDuplicate < fNumDuplicates) {
2685 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
2686 					fDuplicate++);
2687 				if (duplicate)
2688 					*duplicate = 2;
2689 				return B_OK;
2690 			}
2691 		}
2692 		fDuplicateNode = BPLUSTREE_NULL;
2693 	}
2694 
2695 	off_t savedNodeOffset = fCurrentNodeOffset;
2696 	int32 savedKey = fCurrentKey;
2697 
2698 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
2699 		RETURN_ERROR(B_ERROR);
2700 
2701 	if (duplicate)
2702 		*duplicate = 0;
2703 
2704 	fCurrentKey += direction;
2705 
2706 	// is the current key in the current node?
2707 	while ((forward && fCurrentKey >= node->NumKeys())
2708 			|| (!forward && fCurrentKey < 0)) {
2709 		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2710 
2711 		// are there any more nodes?
2712 		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2713 			node = cached.SetTo(fCurrentNodeOffset);
2714 			if (!node)
2715 				RETURN_ERROR(B_ERROR);
2716 
2717 			// reset current key
2718 			fCurrentKey = forward ? 0 : node->NumKeys() - 1;
2719 		} else {
2720 			// there are no nodes left, so turn back to the last key
2721 			fCurrentNodeOffset = savedNodeOffset;
2722 			fCurrentKey = savedKey;
2723 
2724 			return B_ENTRY_NOT_FOUND;
2725 		}
2726 	}
2727 
2728 	if (node->all_key_count == 0)
2729 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
2730 
2731 	uint16 length;
2732 	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
2733 	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
2734 			> (uint8*)node + fTree->fNodeSize
2735 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2736 #if !_BOOT_MODE
2737 		fTree->fStream->GetVolume()->Panic();
2738 #endif
2739 		RETURN_ERROR(B_BAD_DATA);
2740 	}
2741 
2742 	// include the termination for string types
2743 	bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE;
2744 	if (length + (needsTermination ? 1 : 0) > maxLength) {
2745 		// the buffer is too small, restore the last key and return an error
2746 		fCurrentNodeOffset = savedNodeOffset;
2747 		fCurrentKey = savedKey;
2748 		return B_BUFFER_OVERFLOW;
2749 	}
2750 
2751 	memcpy(key, keyStart, length);
2752 
2753 	if (needsTermination)
2754 		((char*)key)[length] = '\0';
2755 
2756 	*keyLength = length;
2757 
2758 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2759 
2760 	// duplicate fragments?
2761 	uint8 type = bplustree_node::LinkType(offset);
2762 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
2763 		|| type == BPLUSTREE_DUPLICATE_NODE) {
2764 		fDuplicateNode = offset;
2765 
2766 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2767 			false);
2768 		if (node == NULL)
2769 			RETURN_ERROR(B_ERROR);
2770 
2771 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2772 
2773 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2774 		if (fNumDuplicates) {
2775 			offset = node->DuplicateAt(offset, fIsFragment, 0);
2776 			fDuplicate = 1;
2777 			if (duplicate)
2778 				*duplicate = 1;
2779 		} else {
2780 			// Shouldn't happen, but we're dealing here with potentially
2781 			// corrupt disks...
2782 			fDuplicateNode = BPLUSTREE_NULL;
2783 			offset = 0;
2784 		}
2785 	}
2786 	*value = offset;
2787 
2788 	return B_OK;
2789 }
2790 
2791 
2792 /*!	This is more or less a copy of BPlusTree::Find() - but it just
2793 	sets the current position in the iterator, regardless of if the
2794 	key could be found or not.
2795 */
2796 status_t
2797 TreeIterator::Find(const uint8* key, uint16 keyLength)
2798 {
2799 	if (fTree == NULL)
2800 		return B_INTERRUPTED;
2801 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2802 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2803 		|| key == NULL)
2804 		RETURN_ERROR(B_BAD_VALUE);
2805 
2806 #if !_BOOT_MODE
2807 	// lock access to stream
2808 	InodeReadLocker locker(fTree->fStream);
2809 #endif
2810 
2811 	off_t nodeOffset = fTree->fHeader.RootNode();
2812 
2813 	CachedNode cached(fTree);
2814 	const bplustree_node* node;
2815 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2816 		uint16 keyIndex = 0;
2817 		off_t nextOffset;
2818 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2819 			&nextOffset);
2820 
2821 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2822 			fCurrentNodeOffset = nodeOffset;
2823 			fCurrentKey = keyIndex - 1;
2824 			fDuplicateNode = BPLUSTREE_NULL;
2825 
2826 			return status;
2827 		} else if (nextOffset == nodeOffset)
2828 			RETURN_ERROR(B_ERROR);
2829 
2830 		nodeOffset = nextOffset;
2831 	}
2832 	RETURN_ERROR(B_ERROR);
2833 }
2834 
2835 
2836 void
2837 TreeIterator::SkipDuplicates()
2838 {
2839 	fDuplicateNode = BPLUSTREE_NULL;
2840 }
2841 
2842 
2843 void
2844 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
2845 	uint16 splitAt, int8 change)
2846 {
2847 	if (offset != fCurrentNodeOffset)
2848 		return;
2849 
2850 	if (nextOffset != BPLUSTREE_NULL) {
2851 		fCurrentNodeOffset = nextOffset;
2852 		if (splitAt <= fCurrentKey) {
2853 			fCurrentKey -= splitAt;
2854 			keyIndex -= splitAt;
2855 		}
2856 	}
2857 
2858 	// Adjust fCurrentKey to point to the same key as before.
2859 	// Note, that if a key is inserted at the current position
2860 	// it won't be included in this tree transition.
2861 	if (keyIndex <= fCurrentKey)
2862 		fCurrentKey += change;
2863 
2864 	// TODO: duplicate handling!
2865 }
2866 
2867 
2868 void
2869 TreeIterator::Stop()
2870 {
2871 	fTree = NULL;
2872 }
2873 
2874 
2875 #ifdef DEBUG
2876 void
2877 TreeIterator::Dump()
2878 {
2879 	__out("TreeIterator at %p:\n", this);
2880 	__out("\tfTree = %p\n", fTree);
2881 	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
2882 	__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
2883 	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n",
2884 		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
2885 		fDuplicateNode);
2886 	__out("\tfDuplicate = %u\n", fDuplicate);
2887 	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
2888 	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
2889 }
2890 #endif
2891 
2892 
2893 // #pragma mark -
2894 
2895 
2896 bool
2897 bplustree_header::IsValid() const
2898 {
2899 	return Magic() == BPLUSTREE_MAGIC
2900 		&& (RootNode() % NodeSize()) == 0
2901 		&& IsValidLink(RootNode())
2902 		&& IsValidLink(FreeNode());
2903 }
2904 
2905 
2906 // #pragma mark -
2907 
2908 
2909 void
2910 bplustree_node::Initialize()
2911 {
2912 	left_link = right_link = overflow_link
2913 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2914 	all_key_count = 0;
2915 	all_key_length = 0;
2916 }
2917 
2918 
2919 uint8*
2920 bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2921 {
2922 	if (index < 0 || index > NumKeys())
2923 		return NULL;
2924 
2925 	uint8* keyStart = Keys();
2926 	uint16* keyLengths = KeyLengths();
2927 
2928 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2929 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2930 	if (index > 0)
2931 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2932 
2933 	return keyStart;
2934 }
2935 
2936 
2937 uint8
2938 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2939 {
2940 	// the duplicate fragment handling is currently hard-coded to a node size
2941 	// of 1024 bytes - with future versions of BFS, this may be a problem
2942 
2943 	if (isFragment) {
2944 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2945 
2946 		return ((off_t*)this)[fragment];
2947 	}
2948 	return OverflowLink();
2949 }
2950 
2951 
2952 off_t
2953 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2954 {
2955 	uint32 start;
2956 	if (isFragment)
2957 		start = 8 * ((uint64)offset & 0x3ff);
2958 	else
2959 		start = 2;
2960 
2961 	return ((off_t*)this)[start + 1 + index];
2962 }
2963 
2964 
2965 /*!	Although the name suggests it, this function doesn't return the real
2966 	used fragment count; at least, it can only count to two: it returns
2967 	0, if there is no fragment used, 1 if there is only one fragment
2968 	used, and 2 if there are at least 2 fragments used.
2969 */
2970 uint32
2971 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2972 {
2973 	uint32 used = 0;
2974 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2975 		duplicate_array* array = FragmentAt(i);
2976 		if (array->Count() > 0 && ++used > 1)
2977 			return used;
2978 	}
2979 	return used;
2980 }
2981 
2982 
2983 status_t
2984 bplustree_node::CheckIntegrity(uint32 nodeSize) const
2985 {
2986 	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2987 		DEBUGGER(("invalid node: key/length count"));
2988 
2989 	for (int32 i = 0; i < NumKeys(); i++) {
2990 		uint16 length;
2991 		uint8* key = KeyAt(i, &length);
2992 		if (key + length + sizeof(off_t) + sizeof(uint16)
2993 				> (uint8*)this + nodeSize
2994 			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2995 			dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
2996 			return B_BAD_DATA;
2997 		}
2998 		if (Values()[i] == -1) {
2999 			dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values "
3000 				"corrupted\n", this, (int)i, Values()[i]);
3001 			return B_BAD_DATA;
3002 		}
3003 	}
3004 	return B_OK;
3005 }
3006 
3007 
3008 // #pragma mark -
3009 
3010 
3011 #if !_BOOT_MODE
3012 BitmapArray::BitmapArray(size_t numBits)
3013 {
3014 	fSize = (numBits + 7) / 8;
3015 	fBitmap = (uint8*)calloc(fSize, 1);
3016 	fCountSet = 0;
3017 }
3018 
3019 
3020 BitmapArray::~BitmapArray()
3021 {
3022 	free(fBitmap);
3023 }
3024 
3025 
3026 status_t
3027 BitmapArray::InitCheck() const
3028 {
3029 	return fBitmap != NULL ? B_OK : B_NO_MEMORY;
3030 }
3031 
3032 
3033 bool
3034 BitmapArray::IsSet(size_t index) const
3035 {
3036 	uint32 byteIndex = index / 8;
3037 	if (byteIndex >= fSize)
3038 		return false;
3039 
3040 	return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
3041 }
3042 
3043 
3044 void
3045 BitmapArray::Set(size_t index, bool set)
3046 {
3047 	uint32 byteIndex = index / 8;
3048 	if (byteIndex >= fSize)
3049 		return;
3050 
3051 	if (set) {
3052 		fBitmap[byteIndex] |= 1UL << (index & 0x7);
3053 		fCountSet++;
3054 	} else {
3055 		fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
3056 		fCountSet--;
3057 	}
3058 }
3059 #endif // !_BOOT_MODE
3060 
3061 
3062 // #pragma mark -
3063 
3064 
3065 bool
3066 duplicate_array::_FindInternal(off_t value, int32& index) const
3067 {
3068 	int32 min = 0, max = Count() - 1;
3069 	off_t cmp;
3070 	while (min <= max) {
3071 		index = (min + max) / 2;
3072 
3073 		cmp = ValueAt(index) - value;
3074 		if (cmp < 0)
3075 			min = index + 1;
3076 		else if (cmp > 0)
3077 			max = index - 1;
3078 		else
3079 			return true;
3080 	}
3081 	return false;
3082 }
3083 
3084 
3085 void
3086 duplicate_array::Insert(off_t value)
3087 {
3088 	// if there are more than 8 values in this array, use a
3089 	// binary search, if not, just iterate linearly to find
3090 	// the insertion point
3091 	int32 size = Count();
3092 	int32 i = 0;
3093 	if (size > 8 ) {
3094 		if (!_FindInternal(value, i) && ValueAt(i) <= value)
3095 			i++;
3096 	} else {
3097 		for (i = 0; i < size; i++) {
3098 			if (ValueAt(i) > value)
3099 				break;
3100 		}
3101 	}
3102 
3103 	memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
3104 	values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
3105 	count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
3106 }
3107 
3108 
3109 bool
3110 duplicate_array::Remove(off_t value)
3111 {
3112 	int32 index = Find(value);
3113 	if (index == -1)
3114 		return false;
3115 
3116 	int32 newSize = Count() - 1;
3117 	memmove(&values[index], &values[index + 1],
3118 		(newSize - index) * sizeof(off_t));
3119 	count = HOST_ENDIAN_TO_BFS_INT64(newSize);
3120 
3121 	return true;
3122 }
3123 
3124 
3125 // #pragma mark -
3126 
3127 
3128 int32
3129 compareKeys(type_code type, const void* key1, int keyLength1,
3130 	const void* key2, int keyLength2)
3131 {
3132 	// if one of the keys is NULL, bail out gracefully
3133 	if (key1 == NULL || key2 == NULL) {
3134 		// even if it's most probably a bug in the calling code, we try to
3135 		// give a meaningful result
3136 		if (key1 == NULL && key2 != NULL)
3137 			return 1;
3138 		if (key2 == NULL && key1 != NULL)
3139 			return -1;
3140 
3141 		return 0;
3142 	}
3143 
3144 	switch (type) {
3145 	    case B_STRING_TYPE:
3146     	{
3147 			int result = memcmp(key1, key2, min_c(keyLength1, keyLength2));
3148 			if (result == 0) {
3149 				// ignore trailing null bytes
3150 				if ((keyLength1 == keyLength2 + 1
3151 						&& ((uint8*)key1)[keyLength2] == '\0')
3152 					|| (keyLength2 == keyLength1 + 1
3153 						&& ((uint8*)key2)[keyLength1] == '\0'))
3154 					return 0;
3155 
3156 				result = keyLength1 - keyLength2;
3157 			}
3158 
3159 			return result;
3160 		}
3161 
3162 		case B_SSIZE_T_TYPE:
3163 		case B_INT32_TYPE:
3164 			return *(int32*)key1 - *(int32*)key2;
3165 
3166 		case B_SIZE_T_TYPE:
3167 		case B_UINT32_TYPE:
3168 			if (*(uint32*)key1 == *(uint32*)key2)
3169 				return 0;
3170 			if (*(uint32*)key1 > *(uint32*)key2)
3171 				return 1;
3172 
3173 			return -1;
3174 
3175 		case B_OFF_T_TYPE:
3176 		case B_INT64_TYPE:
3177 			if (*(int64*)key1 == *(int64*)key2)
3178 				return 0;
3179 			if (*(int64*)key1 > *(int64*)key2)
3180 				return 1;
3181 
3182 			return -1;
3183 
3184 		case B_UINT64_TYPE:
3185 			if (*(uint64*)key1 == *(uint64*)key2)
3186 				return 0;
3187 			if (*(uint64*)key1 > *(uint64*)key2)
3188 				return 1;
3189 
3190 			return -1;
3191 
3192 		case B_FLOAT_TYPE:
3193 		{
3194 			float result = *(float*)key1 - *(float*)key2;
3195 			if (result == 0.0f)
3196 				return 0;
3197 
3198 			return (result < 0.0f) ? -1 : 1;
3199 		}
3200 
3201 		case B_DOUBLE_TYPE:
3202 		{
3203 			double result = *(double*)key1 - *(double*)key2;
3204 			if (result == 0.0)
3205 				return 0;
3206 
3207 			return (result < 0.0) ? -1 : 1;
3208 		}
3209 	}
3210 
3211 	// if the type is unknown, the entries don't match...
3212 	return -1;
3213 }
3214 
3215 
3216 #if _BOOT_MODE
3217 } // namespace BFS
3218 #endif
3219