xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision a5a3b2d9a3d95cbae71eaf371708c73a1780ac0d)
1 /*
2  * Copyright 2001-2015, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 
10 //! B+Tree implementation
11 
12 
13 #include "BPlusTree.h"
14 
15 #include <file_systems/QueryParserUtils.h>
16 
17 #include "Debug.h"
18 #include "Utility.h"
19 
20 #if !_BOOT_MODE
21 #	include "Inode.h"
22 #else
23 #	include "Stream.h"
24 
25 // BFS::Stream from the bootloader has the same API as Inode.
26 #	define Inode BFS::Stream
27 
28 #	define strerror(x)		"error message unavailable"
29 
30 namespace BFS {
31 #endif
32 
33 
34 /*!	Simple array used for the duplicate handling in the B+Tree. This is an
35 	on disk structure.
36 */
37 struct duplicate_array {
38 	off_t	count;
39 	off_t	values[0];
40 
41 	inline bool IsEmpty() const
42 	{
43 		return count == 0;
44 	}
45 
46 	inline int32 Count() const
47 	{
48 		return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
49 	}
50 
51 	inline off_t ValueAt(uint32 index) const
52 	{
53 		return BFS_ENDIAN_TO_HOST_INT64(values[index]);
54 	}
55 
56 	inline void SetValueAt(uint32 index, off_t value)
57 	{
58 		values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
59 	}
60 
61 	inline int32 Find(off_t value) const
62 	{
63 		int32 i;
64 		return _FindInternal(value, i) ? i : -1;
65 	}
66 
67 	void Insert(off_t value);
68 	bool Remove(off_t value);
69 
70 private:
71 	bool _FindInternal(off_t value, int32& index) const;
72 } _PACKED;
73 
74 
75 #ifdef DEBUG
76 class NodeChecker {
77 public:
78 	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
79 		:
80 		fNode(node),
81 		fSize(nodeSize),
82 		fText(text)
83 	{
84 		Check("integrity check failed on construction.");
85 	}
86 
87 	~NodeChecker()
88 	{
89 		Check("integrity check failed on destruction.");
90 	}
91 
92 	void
93 	Check(const char* message)
94 	{
95 		if (fNode->CheckIntegrity(fSize) != B_OK) {
96 			dprintf("%s: %s\n", fText, message);
97 			DEBUGGER(("NodeChecker integrity check failed!"));
98 		}
99 	}
100 
101 private:
102 	const bplustree_node*	fNode;
103 	int32					fSize;
104 	const char*				fText;
105 };
106 #endif // DEBUG
107 
108 
109 #if !_BOOT_MODE
110 class BitmapArray {
111 public:
112 								BitmapArray(size_t numBits);
113 								~BitmapArray();
114 
115 			status_t			InitCheck() const;
116 
117 			bool				IsSet(size_t index) const;
118 			void				Set(size_t index, bool set);
119 
120 			size_t				CountSet() const { return fCountSet; }
121 
122 private:
123 			uint8*				fBitmap;
124 			size_t				fSize;
125 			size_t				fCountSet;
126 };
127 
128 
129 struct TreeCheck {
130 	TreeCheck(BPlusTree* tree)
131 		:
132 		fLevelCount(0),
133 		fFreeCount(0),
134 		fNodeSize(tree->NodeSize()),
135 		fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
136 		fFoundErrors(0),
137 		fVisited(tree->Stream()->Size() / tree->NodeSize()),
138 		fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
139 	{
140 		fPreviousOffsets = (off_t*)malloc(
141 			sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
142 		if (fPreviousOffsets != NULL) {
143 			for (size_t i = 0; i < fMaxLevels; i++)
144 				fPreviousOffsets[i] = BPLUSTREE_NULL;
145 		}
146 	}
147 
148 	~TreeCheck()
149 	{
150 		free(fPreviousOffsets);
151 	}
152 
153 	status_t InitCheck() const
154 	{
155 		if (fPreviousOffsets == NULL)
156 			return B_NO_MEMORY;
157 
158 		status_t status = fVisited.InitCheck();
159 		if (status != B_OK)
160 			return status;
161 
162 		return fVisitedFragment.InitCheck();
163 	}
164 
165 	bool Visited(off_t offset) const
166 	{
167 		return fVisited.IsSet(offset / fNodeSize);
168 	}
169 
170 	void SetVisited(off_t offset)
171 	{
172 		fVisited.Set(offset / fNodeSize, true);
173 	}
174 
175 	size_t VisitedCount() const
176 	{
177 		return fVisited.CountSet();
178 	}
179 
180 	bool VisitedFragment(off_t offset) const
181 	{
182 		return fVisitedFragment.IsSet(offset / fNodeSize);
183 	}
184 
185 	void SetVisitedFragment(off_t offset)
186 	{
187 		fVisitedFragment.Set(offset / fNodeSize, true);
188 	}
189 
190 	uint32 MaxLevels() const
191 	{
192 		return fLevelCount;
193 	}
194 
195 	void SetLevel(uint32 level)
196 	{
197 		if (fLevelCount < level)
198 			fLevelCount = level;
199 	}
200 
201 	off_t PreviousOffset(uint32 level)
202 	{
203 		return fPreviousOffsets[level];
204 	}
205 
206 	void SetPreviousOffset(uint32 level, off_t offset)
207 	{
208 		fPreviousOffsets[level] = offset;
209 	}
210 
211 	void FoundError()
212 	{
213 		fFoundErrors++;
214 	}
215 
216 	bool ErrorsFound()
217 	{
218 		return fFoundErrors != 0;
219 	}
220 
221 private:
222 			uint32				fLevelCount;
223 			uint32				fFreeCount;
224 			uint32				fNodeSize;
225 			uint32				fMaxLevels;
226 			uint32				fFoundErrors;
227 			BitmapArray			fVisited;
228 			BitmapArray			fVisitedFragment;
229 			off_t*				fPreviousOffsets;
230 };
231 
232 
233 // #pragma mark -
234 
235 
236 // Node Caching for the BPlusTree class
237 //
238 // With write support, there is the need for a function that allocates new
239 // nodes by either returning empty nodes, or by growing the file's data stream
240 //
241 // !! The CachedNode class assumes that you have properly locked the stream
242 // !! before asking for nodes.
243 //
244 // Note: This code will fail if the block size is smaller than the node size!
245 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
246 // is hard-coded to 1024 bytes, that's not an issue now.
247 
248 void
249 CachedNode::UnsetUnchanged(Transaction& transaction)
250 {
251 	if (fTree == NULL || fTree->fStream == NULL)
252 		return;
253 
254 	if (fNode != NULL) {
255 		void* cache = fTree->fStream->GetVolume()->BlockCache();
256 
257 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
258 		block_cache_put(cache, fBlockNumber);
259 		fNode = NULL;
260 	}
261 }
262 #endif // !_BOOT_MODE
263 
264 
265 void
266 CachedNode::Unset()
267 {
268 	if (fTree == NULL || fTree->fStream == NULL)
269 		return;
270 
271 	if (fNode != NULL) {
272 #if !_BOOT_MODE
273 		if (fWritable && fOffset == 0) {
274 			// The B+tree header has been updated - we need to update the
275 			// BPlusTrees copy of it, as well.
276 			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
277 		}
278 
279 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
280 			fBlockNumber);
281 #endif // !_BOOT_MODE
282 
283 		fNode = NULL;
284 	}
285 }
286 
287 
288 const bplustree_node*
289 CachedNode::SetTo(off_t offset, bool check)
290 {
291 	const bplustree_node* node;
292 	if (SetTo(offset, &node, check) == B_OK)
293 		return node;
294 
295 	return NULL;
296 }
297 
298 
299 status_t
300 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
301 {
302 	if (fTree == NULL || fTree->fStream == NULL)
303 		RETURN_ERROR(B_BAD_VALUE);
304 
305 	Unset();
306 
307 	// You can only ask for nodes at valid positions - you can't
308 	// even access the b+tree header with this method (use SetToHeader()
309 	// instead)
310 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
311 		|| offset <= 0
312 		|| (offset % fTree->fNodeSize) != 0) {
313 		RETURN_ERROR(B_BAD_VALUE);
314 	}
315 
316 	if (InternalSetTo(NULL, offset) != NULL && check) {
317 		// sanity checks (links, all_key_count)
318 		if (!fTree->fHeader.CheckNode(fNode)) {
319 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
320 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
321 				fBlockNumber, fTree->fStream->ID()));
322 			return B_BAD_DATA;
323 		}
324 	}
325 
326 	*_node = fNode;
327 	return B_OK;
328 }
329 
330 
331 #if !_BOOT_MODE
332 bplustree_node*
333 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
334 {
335 	if (fTree == NULL || fTree->fStream == NULL) {
336 		REPORT_ERROR(B_BAD_VALUE);
337 		return NULL;
338 	}
339 
340 	Unset();
341 
342 	// You can only ask for nodes at valid positions - you can't
343 	// even access the b+tree header with this method (use SetToHeader()
344 	// instead)
345 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
346 		|| offset <= 0
347 		|| (offset % fTree->fNodeSize) != 0)
348 		return NULL;
349 
350 	if (InternalSetTo(&transaction, offset) != NULL && check) {
351 		// sanity checks (links, all_key_count)
352 		if (!fTree->fHeader.CheckNode(fNode)) {
353 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
354 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
355 				fBlockNumber, fTree->fStream->ID()));
356 			return NULL;
357 		}
358 	}
359 	return fNode;
360 }
361 
362 
363 bplustree_node*
364 CachedNode::MakeWritable(Transaction& transaction)
365 {
366 	if (fNode == NULL)
367 		return NULL;
368 
369 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
370 			fBlockNumber, transaction.ID()) == B_OK) {
371 		return fNode;
372 	}
373 
374 	return NULL;
375 }
376 #endif // !_BOOT_MODE
377 
378 
379 const bplustree_header*
380 CachedNode::SetToHeader()
381 {
382 	if (fTree == NULL || fTree->fStream == NULL) {
383 		REPORT_ERROR(B_BAD_VALUE);
384 		return NULL;
385 	}
386 
387 	Unset();
388 
389 	InternalSetTo(NULL, 0LL);
390 	return (bplustree_header*)fNode;
391 }
392 
393 
394 #if !_BOOT_MODE
395 bplustree_header*
396 CachedNode::SetToWritableHeader(Transaction& transaction)
397 {
398 	if (fTree == NULL || fTree->fStream == NULL) {
399 		REPORT_ERROR(B_BAD_VALUE);
400 		return NULL;
401 	}
402 
403 	Unset();
404 
405 	InternalSetTo(&transaction, 0LL);
406 
407 	if (fNode != NULL && !fTree->fInTransaction) {
408 		transaction.AddListener(fTree);
409 		fTree->fInTransaction = true;
410 
411 		if (!transaction.GetVolume()->IsInitializing()) {
412 			acquire_vnode(transaction.GetVolume()->FSVolume(),
413 				fTree->fStream->ID());
414 		}
415 	}
416 
417 	return (bplustree_header*)fNode;
418 }
419 #endif // !_BOOT_MODE
420 
421 
422 bplustree_node*
423 CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
424 {
425 	fNode = NULL;
426 	fOffset = offset;
427 
428 	off_t fileOffset;
429 	block_run run;
430 	if (offset < fTree->fStream->Size()
431 		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
432 
433 #if !_BOOT_MODE
434 		Volume* volume = fTree->fStream->GetVolume();
435 #else
436 		Volume* volume = &fTree->fStream->GetVolume();
437 #endif
438 
439 		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
440 		fBlockNumber = volume->ToBlock(run) + blockOffset;
441 		uint8* block = NULL;
442 
443 #if !_BOOT_MODE
444 		if (transaction != NULL) {
445 			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
446 				fBlockNumber, transaction->ID());
447 			fWritable = true;
448 		} else {
449 			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
450 			fWritable = false;
451 		}
452 #else // !_BOOT_MODE
453 		if (fBlock == NULL) {
454 			fBlock = (uint8*)malloc(volume->BlockSize());
455 			if (fBlock == NULL)
456 				return NULL;
457 		}
458 
459 		if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(),
460 				fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
461 			block = fBlock;
462 		}
463 
464 		fWritable = false;
465 #endif // _BOOT_MODE
466 
467 		if (block != NULL) {
468 			// The node is somewhere in that block...
469 			// (confusing offset calculation)
470 			fNode = (bplustree_node*)(block + offset
471 				- (fileOffset + (blockOffset << volume->BlockShift())));
472 		} else
473 			REPORT_ERROR(B_IO_ERROR);
474 	}
475 	return fNode;
476 }
477 
478 
479 #if !_BOOT_MODE
480 status_t
481 CachedNode::Free(Transaction& transaction, off_t offset)
482 {
483 	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
484 		RETURN_ERROR(B_BAD_VALUE);
485 
486 	// TODO: scan the free nodes list and remove all nodes at the end
487 	// of the tree - perhaps that shouldn't be done everytime that
488 	// function is called, perhaps it should be done when the directory
489 	// inode is closed or based on some calculation or whatever...
490 
491 	CachedNode cached(fTree);
492 	bplustree_header* header = cached.SetToWritableHeader(transaction);
493 	if (header == NULL)
494 		return B_IO_ERROR;
495 
496 #if 0
497 	// TODO: temporarily disabled because CheckNode() doesn't like this...
498 	// 		Also, it's such an edge case that it's almost useless, anyway.
499 	// if the node is the last one in the tree, we shrink
500 	// the tree and file size by one node
501 	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
502 	if (offset == lastOffset) {
503 		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
504 		if (status != B_OK)
505 			return status;
506 
507 		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
508 		return B_OK;
509 	}
510 #endif
511 
512 	// add the node to the free nodes list
513 	fNode->left_link = header->free_node_pointer;
514 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
515 
516 	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
517 	return B_OK;
518 }
519 
520 
521 status_t
522 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
523 	off_t* _offset)
524 {
525 	if (fTree == NULL || fTree->fStream == NULL)
526 		RETURN_ERROR(B_BAD_VALUE);
527 
528 	Unset();
529 
530 	bplustree_header* header;
531 	status_t status;
532 
533 	// if there are any free nodes, recycle them
534 	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
535 		CachedNode cached(fTree);
536 		header = cached.SetToWritableHeader(transaction);
537 		if (header == NULL)
538 			return B_IO_ERROR;
539 
540 		*_offset = header->FreeNode();
541 		*_node = fNode;
542 
543 		// set new free node pointer
544 		header->free_node_pointer = fNode->left_link;
545 		fNode->Initialize();
546 		return B_OK;
547 	}
548 
549 	// allocate space for a new node
550 	Inode* stream = fTree->fStream;
551 	if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
552 		return status;
553 
554 	CachedNode cached(fTree);
555 	header = cached.SetToWritableHeader(transaction);
556 	if (header == NULL)
557 		return B_IO_ERROR;
558 
559 	// the maximum_size has to be changed before the call to SetTo() - or
560 	// else it will fail because the requested node is out of bounds
561 	off_t offset = header->MaximumSize();
562 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
563 		+ fTree->fNodeSize);
564 
565 	cached.Unset();
566 		// SetToWritable() below needs the new values in the tree's header
567 
568 	if (SetToWritable(transaction, offset, false) == NULL)
569 		RETURN_ERROR(B_ERROR);
570 
571 	fNode->Initialize();
572 
573 	*_offset = offset;
574 	*_node = fNode;
575 	return B_OK;
576 }
577 
578 
579 //	#pragma mark -
580 
581 
582 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
583 	:
584 	fStream(NULL),
585 	fInTransaction(false)
586 {
587 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
588 	SetTo(transaction, stream);
589 }
590 #endif // !_BOOT_MODE
591 
592 
593 BPlusTree::BPlusTree(Inode* stream)
594 	:
595 	fStream(NULL),
596 	fInTransaction(false)
597 {
598 #if !_BOOT_MODE
599 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
600 #endif
601 
602 	SetTo(stream);
603 }
604 
605 
606 BPlusTree::BPlusTree()
607 	:
608 	fStream(NULL),
609 	fNodeSize(BPLUSTREE_NODE_SIZE),
610 	fAllowDuplicates(true),
611 	fInTransaction(false),
612 	fStatus(B_NO_INIT)
613 {
614 #if !_BOOT_MODE
615 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
616 #endif
617 }
618 
619 
620 BPlusTree::~BPlusTree()
621 {
622 #if !_BOOT_MODE
623 	// if there are any TreeIterators left, we need to stop them
624 	// (can happen when the tree's inode gets deleted while
625 	// traversing the tree - a TreeIterator doesn't lock the inode)
626 	mutex_lock(&fIteratorLock);
627 
628 	SinglyLinkedList<TreeIterator>::Iterator iterator
629 		= fIterators.GetIterator();
630 	while (iterator.HasNext())
631 		iterator.Next()->Stop();
632 
633 	mutex_destroy(&fIteratorLock);
634 
635 	ASSERT(!fInTransaction);
636 #endif // !_BOOT_MODE
637 }
638 
639 
640 #if !_BOOT_MODE
641 /*! Create a new B+Tree on the specified stream */
642 status_t
643 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
644 {
645 	// initializes in-memory B+Tree
646 
647 	fStream = stream;
648 
649 	CachedNode cached(this);
650 	bplustree_header* header = cached.SetToWritableHeader(transaction);
651 	if (header == NULL) {
652 		// allocate space for new header + node!
653 		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
654 		if (fStatus != B_OK)
655 			RETURN_ERROR(fStatus);
656 
657 		header = cached.SetToWritableHeader(transaction);
658 		if (header == NULL)
659 			RETURN_ERROR(fStatus = B_ERROR);
660 	}
661 
662 	fAllowDuplicates = stream->IsIndex()
663 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
664 
665 	fNodeSize = nodeSize;
666 
667 	// initialize b+tree header
668  	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
669  	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
670  	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
671  	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
672  	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
673  	header->free_node_pointer
674  		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
675  	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
676 
677 	cached.Unset();
678 
679 	// initialize b+tree root node
680 	cached.SetToWritable(transaction, fHeader.RootNode(), false);
681 	if (cached.Node() == NULL)
682 		RETURN_ERROR(B_IO_ERROR);
683 
684 	cached.Node()->Initialize();
685 
686 	return fStatus = B_OK;
687 }
688 #endif // !_BOOT_MODE
689 
690 
691 status_t
692 BPlusTree::SetTo(Inode* stream)
693 {
694 	if (stream == NULL)
695 		RETURN_ERROR(fStatus = B_BAD_VALUE);
696 
697 	fStream = stream;
698 
699 	// get on-disk B+Tree header
700 
701 	CachedNode cached(this);
702 	const bplustree_header* header = cached.SetToHeader();
703 	if (header != NULL)
704 		memcpy(&fHeader, header, sizeof(bplustree_header));
705 	else
706 		RETURN_ERROR(fStatus = B_IO_ERROR);
707 
708 	// is header valid?
709 
710 	if (fHeader.MaximumSize() != stream->Size()) {
711 		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
712 			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
713 		// we can't change the header since we don't have a transaction
714 		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
715 	}
716 	if (!fHeader.IsValid()) {
717 #ifdef DEBUG
718 		dump_bplustree_header(&fHeader);
719 		dump_block((const char*)&fHeader, 128);
720 #endif
721 		RETURN_ERROR(fStatus = B_BAD_DATA);
722 	}
723 
724 	fNodeSize = fHeader.NodeSize();
725 
726 	// validity check
727 	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
728 		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
729 		S_DOUBLE_INDEX};
730 	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
731 		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
732 		| S_FLOAT_INDEX | S_DOUBLE_INDEX);
733 
734 	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
735 		|| ((stream->Mode() & S_INDEX_DIR) != 0
736 			&& kToMode[fHeader.DataType()] != mode)
737 		|| !stream->IsContainer()) {
738 		D(	dump_bplustree_header(&fHeader);
739 			dump_inode(&stream->Node());
740 		);
741 		RETURN_ERROR(fStatus = B_BAD_TYPE);
742 	}
743 
744 	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
745 	// in the original BFS code - we will honour it nevertheless
746 	fAllowDuplicates = stream->IsIndex()
747 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
748 
749 	cached.SetTo(fHeader.RootNode());
750 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
751 }
752 
753 
754 status_t
755 BPlusTree::InitCheck()
756 {
757 	return fStatus;
758 }
759 
760 
761 #if !_BOOT_MODE
762 status_t
763 BPlusTree::Validate(bool repair, bool& _errorsFound)
764 {
765 	TreeCheck check(this);
766 	if (check.InitCheck() != B_OK)
767 		return B_NO_MEMORY;
768 
769 	check.SetVisited(0);
770 
771 	// Walk the free nodes
772 
773 	CachedNode cached(this);
774 	off_t freeOffset = fHeader.FreeNode();
775 	while (freeOffset > 0) {
776 		const bplustree_node* node;
777 		status_t status = cached.SetTo(freeOffset, &node, false);
778 		if (status != B_OK) {
779 			if (status == B_IO_ERROR)
780 				return B_IO_ERROR;
781 
782 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
783 				"not be read: %s\n", fStream->ID(), freeOffset,
784 				strerror(status));
785 			check.FoundError();
786 			break;
787 		}
788 
789 		if (check.Visited(freeOffset)) {
790 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
791 				" circular!\n", fStream->ID(), freeOffset);
792 			// TODO: if 'repair' is true, we could collect all unvisited nodes
793 			// at the end, and put the into the free list
794 			check.FoundError();
795 			break;
796 		}
797 
798 		check.SetVisited(freeOffset);
799 
800 		if (node->OverflowLink() != BPLUSTREE_FREE) {
801 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
802 				" misses free mark!\n", fStream->ID(), freeOffset);
803 		}
804 		freeOffset = node->LeftLink();
805 	}
806 
807 	// Iterate over the complete tree recursively
808 
809 	const bplustree_node* root;
810 	status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
811 	if (status != B_OK)
812 		return status;
813 
814 	status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
815 
816 	if (check.ErrorsFound())
817 		_errorsFound = true;
818 
819 	if (status != B_OK)
820 		return status;
821 
822 	if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
823 		dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
824 			"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
825 			fHeader.MaxNumberOfLevels());
826 	}
827 
828 	if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
829 		dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
830 			" nodes.\n", fStream->ID(), check.VisitedCount(),
831 			fHeader.MaximumSize() / fNodeSize);
832 	}
833 
834 	return B_OK;
835 }
836 
837 
838 status_t
839 BPlusTree::MakeEmpty()
840 {
841 	// Put all nodes into the free list in order
842 	Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
843 
844 	// Reset the header, and root node
845 	CachedNode cached(this);
846 	bplustree_header* header = cached.SetToWritableHeader(transaction);
847 	if (header == NULL)
848 		return B_IO_ERROR;
849 
850 	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
851 	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
852 	if (fStream->Size() > (off_t)NodeSize() * 2)
853 		header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
854 	else {
855 		header->free_node_pointer
856 			= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
857 	}
858 
859 	bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
860 	if (node == NULL)
861 		return B_IO_ERROR;
862 
863 	node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
864 	node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
865 	node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
866 	node->all_key_count = 0;
867 	node->all_key_length = 0;
868 
869 	for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
870 			offset += NodeSize()) {
871 		bplustree_node* node = cached.SetToWritable(transaction, offset, false);
872 		if (node == NULL) {
873 			dprintf("--> could not open %" B_PRIdOFF "\n", offset);
874 			return B_IO_ERROR;
875 		}
876 		if (offset < fStream->Size() - (off_t)NodeSize())
877 			node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
878 		else
879 			node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
880 
881 		node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
882 
883 		// It's not important to write it out in a single transaction; just
884 		// make sure it doesn't get too large
885 		if (offset % (1024 * 1024) == 0) {
886 			transaction.Done();
887 			transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
888 		}
889 	}
890 
891 	return transaction.Done();
892 }
893 
894 
895 int32
896 BPlusTree::TypeCodeToKeyType(type_code code)
897 {
898 	switch (code) {
899 		case B_STRING_TYPE:
900 			return BPLUSTREE_STRING_TYPE;
901 		case B_SSIZE_T_TYPE:
902 		case B_INT32_TYPE:
903 			return BPLUSTREE_INT32_TYPE;
904 		case B_SIZE_T_TYPE:
905 		case B_UINT32_TYPE:
906 			return BPLUSTREE_UINT32_TYPE;
907 		case B_OFF_T_TYPE:
908 		case B_INT64_TYPE:
909 			return BPLUSTREE_INT64_TYPE;
910 		case B_UINT64_TYPE:
911 			return BPLUSTREE_UINT64_TYPE;
912 		case B_FLOAT_TYPE:
913 			return BPLUSTREE_FLOAT_TYPE;
914 		case B_DOUBLE_TYPE:
915 			return BPLUSTREE_DOUBLE_TYPE;
916 	}
917 	return -1;
918 }
919 
920 
921 int32
922 BPlusTree::ModeToKeyType(mode_t mode)
923 {
924 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
925 			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
926 		case S_INT_INDEX:
927 			return BPLUSTREE_INT32_TYPE;
928 		case S_UINT_INDEX:
929 			return BPLUSTREE_UINT32_TYPE;
930 		case S_LONG_LONG_INDEX:
931 			return BPLUSTREE_INT64_TYPE;
932 		case S_ULONG_LONG_INDEX:
933 			return BPLUSTREE_UINT64_TYPE;
934 		case S_FLOAT_INDEX:
935 			return BPLUSTREE_FLOAT_TYPE;
936 		case S_DOUBLE_INDEX:
937 			return BPLUSTREE_DOUBLE_TYPE;
938 		case S_STR_INDEX:
939 		default:
940 			// default is for standard directories
941 			return BPLUSTREE_STRING_TYPE;
942 	}
943 }
944 #endif // !_BOOT_MODE
945 
946 
947 //	#pragma mark - TransactionListener implementation
948 
949 
950 #if !_BOOT_MODE
951 void
952 BPlusTree::TransactionDone(bool success)
953 {
954 	if (!success) {
955 		// update header from disk
956 		CachedNode cached(this);
957 		const bplustree_header* header = cached.SetToHeader();
958 		if (header != NULL)
959 			memcpy(&fHeader, header, sizeof(bplustree_header));
960 	}
961 }
962 
963 
964 void
965 BPlusTree::RemovedFromTransaction()
966 {
967 	fInTransaction = false;
968 
969 	if (!fStream->GetVolume()->IsInitializing())
970 		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
971 }
972 
973 
974 //	#pragma mark -
975 
976 
977 void
978 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
979 	uint16 splitAt, int8 change)
980 {
981 	// Although every iterator which is affected by this update currently
982 	// waits on a semaphore, other iterators could be added/removed at
983 	// any time, so we need to protect this loop
984 	MutexLocker _(fIteratorLock);
985 
986 	SinglyLinkedList<TreeIterator>::Iterator iterator
987 		= fIterators.GetIterator();
988 	while (iterator.HasNext())
989 		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
990 }
991 
992 
993 void
994 BPlusTree::_AddIterator(TreeIterator* iterator)
995 {
996 	MutexLocker _(fIteratorLock);
997 	fIterators.Add(iterator);
998 }
999 
1000 
1001 void
1002 BPlusTree::_RemoveIterator(TreeIterator* iterator)
1003 {
1004 	MutexLocker _(fIteratorLock);
1005 	fIterators.Remove(iterator);
1006 }
1007 #endif // !_BOOT_MODE
1008 
1009 
1010 int32
1011 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
1012 	int keyLength2)
1013 {
1014 	type_code type = 0;
1015 	switch (fHeader.DataType()) {
1016 	    case BPLUSTREE_STRING_TYPE:
1017 	    	type = B_STRING_TYPE;
1018 	    	break;
1019 		case BPLUSTREE_INT32_TYPE:
1020 	    	type = B_INT32_TYPE;
1021 	    	break;
1022 		case BPLUSTREE_UINT32_TYPE:
1023 	    	type = B_UINT32_TYPE;
1024 	    	break;
1025 		case BPLUSTREE_INT64_TYPE:
1026 	    	type = B_INT64_TYPE;
1027 	    	break;
1028 		case BPLUSTREE_UINT64_TYPE:
1029 	    	type = B_UINT64_TYPE;
1030 	    	break;
1031 		case BPLUSTREE_FLOAT_TYPE:
1032 	    	type = B_FLOAT_TYPE;
1033 	    	break;
1034 		case BPLUSTREE_DOUBLE_TYPE:
1035 	    	type = B_DOUBLE_TYPE;
1036 	    	break;
1037 	}
1038 	return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2);
1039 }
1040 
1041 
1042 status_t
1043 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
1044 	uint16 keyLength, uint16* _index, off_t* _next)
1045 {
1046 #ifdef DEBUG
1047 	NodeChecker checker(node, fNodeSize, "find");
1048 #endif
1049 
1050 	if (node->all_key_count == 0) {
1051 		if (_index)
1052 			*_index = 0;
1053 		if (_next)
1054 			*_next = node->OverflowLink();
1055 		return B_ENTRY_NOT_FOUND;
1056 	}
1057 
1058 	off_t* values = node->Values();
1059 	int16 saveIndex = -1;
1060 
1061 	// binary search in the key array
1062 	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
1063 		uint16 i = (first + last) >> 1;
1064 
1065 		uint16 searchLength = 0;
1066 		uint8* searchKey = node->KeyAt(i, &searchLength);
1067 		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
1068 				> (uint8*)node + fNodeSize
1069 			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
1070 #if !_BOOT_MODE
1071 			fStream->GetVolume()->Panic();
1072 #endif
1073 			RETURN_ERROR(B_BAD_DATA);
1074 		}
1075 
1076 		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
1077 		if (cmp < 0) {
1078 			last = i - 1;
1079 			saveIndex = i;
1080 		} else if (cmp > 0) {
1081 			saveIndex = first = i + 1;
1082 		} else {
1083 			if (_index)
1084 				*_index = i;
1085 			if (_next)
1086 				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1087 			return B_OK;
1088 		}
1089 	}
1090 
1091 	if (_index)
1092 		*_index = saveIndex;
1093 	if (_next) {
1094 		if (saveIndex == node->NumKeys())
1095 			*_next = node->OverflowLink();
1096 		else
1097 			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
1098 	}
1099 	return B_ENTRY_NOT_FOUND;
1100 }
1101 
1102 
1103 #if !_BOOT_MODE
1104 /*!	Prepares the stack to contain all nodes that were passed while
1105 	following the key, from the root node to the leaf node that could
1106 	or should contain that key.
1107 */
1108 status_t
1109 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
1110 	uint16 keyLength)
1111 {
1112 	// set the root node to begin with
1113 	node_and_key nodeAndKey;
1114 	nodeAndKey.nodeOffset = fHeader.RootNode();
1115 
1116 	CachedNode cached(this);
1117 	const bplustree_node* node;
1118 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1119 		// if we are already on leaf level, we're done
1120 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1121 			// node that the keyIndex is not properly set here (but it's not
1122 			// needed in the calling functions anyway)!
1123 			nodeAndKey.keyIndex = 0;
1124 			stack.Push(nodeAndKey);
1125 			return B_OK;
1126 		}
1127 
1128 		off_t nextOffset;
1129 		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
1130 			&nextOffset);
1131 
1132 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
1133 			RETURN_ERROR(B_ERROR);
1134 
1135 		if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
1136 			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
1137 			break;
1138 		}
1139 
1140 		// put the node offset & the correct keyIndex on the stack
1141 		stack.Push(nodeAndKey);
1142 
1143 		nodeAndKey.nodeOffset = nextOffset;
1144 	}
1145 
1146 	FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
1147 		B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
1148 	return B_ERROR;
1149 }
1150 
1151 
1152 /*!	This will find a free duplicate fragment in the given bplustree_node.
1153 	The CachedNode will be set to the writable fragment on success.
1154 */
1155 status_t
1156 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
1157 	const bplustree_node* node, CachedNode& cached,
1158 	off_t* _offset, bplustree_node** _fragment, uint32* _index)
1159 {
1160 	off_t* values = node->Values();
1161 	for (int32 i = 0; i < node->NumKeys(); i++) {
1162 		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1163 
1164 		// does the value link to a duplicate fragment?
1165 		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
1166 			continue;
1167 
1168 		const bplustree_node* fragment = cached.SetTo(
1169 			bplustree_node::FragmentOffset(value), false);
1170 		if (fragment == NULL) {
1171 			FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
1172 				B_PRIdOFF "\n", value, fStream->ID()));
1173 			continue;
1174 		}
1175 
1176 		// see if there is some space left for us
1177 		uint32 num = bplustree_node::MaxFragments(fNodeSize);
1178 		for (uint32 j = 0; j < num; j++) {
1179 			duplicate_array* array = fragment->FragmentAt(j);
1180 
1181 			if (array->IsEmpty()) {
1182 				// found an unused fragment
1183 				*_fragment = cached.MakeWritable(transaction);
1184 				if (*_fragment == NULL)
1185 					return B_IO_ERROR;
1186 
1187 				*_offset = bplustree_node::FragmentOffset(value);
1188 				*_index = j;
1189 				return B_OK;
1190 			}
1191 		}
1192 	}
1193 	return B_ENTRY_NOT_FOUND;
1194 }
1195 
1196 
1197 status_t
1198 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
1199 	const bplustree_node* node, uint16 index, off_t value)
1200 {
1201 	CachedNode cachedDuplicate(this);
1202 	off_t* values = node->Values();
1203 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1204 	status_t status;
1205 	off_t offset;
1206 
1207 	if (bplustree_node::IsDuplicate(oldValue)) {
1208 		// If it's a duplicate fragment, try to insert it into that, or if it
1209 		// doesn't fit anymore, create a new duplicate node
1210 
1211 		if (bplustree_node::LinkType(oldValue)
1212 				== BPLUSTREE_DUPLICATE_FRAGMENT) {
1213 			bplustree_node* duplicate = cachedDuplicate.SetToWritable(
1214 				transaction, bplustree_node::FragmentOffset(oldValue), false);
1215 			if (duplicate == NULL)
1216 				return B_IO_ERROR;
1217 
1218 			duplicate_array* array = duplicate->FragmentAt(
1219 				bplustree_node::FragmentIndex(oldValue));
1220 			int32 arrayCount = array->Count();
1221 			if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
1222 				FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
1223 					"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1224 					(int)bplustree_node::FragmentIndex(oldValue),
1225 					bplustree_node::FragmentOffset(oldValue), arrayCount,
1226 					fStream->ID()));
1227 				return B_BAD_DATA;
1228 			}
1229 
1230 			if (arrayCount < NUM_FRAGMENT_VALUES) {
1231 				array->Insert(value);
1232 			} else {
1233 				// Test if the fragment will be empty if we remove this key's
1234 				// values
1235 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
1236 					// The node will be empty without our values, so let us
1237 					// reuse it as a duplicate node
1238 					offset = bplustree_node::FragmentOffset(oldValue);
1239 
1240 					memmove(duplicate->DuplicateArray(), array,
1241 						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1242 					duplicate->left_link = duplicate->right_link
1243 						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1244 
1245 					array = duplicate->DuplicateArray();
1246 					array->Insert(value);
1247 				} else {
1248 					// Create a new duplicate node
1249 
1250 					cachedDuplicate.UnsetUnchanged(transaction);
1251 						// The old duplicate has not been touched, so we can
1252 						// reuse it
1253 
1254 					bplustree_node* newDuplicate;
1255 					status = cachedDuplicate.Allocate(transaction,
1256 						&newDuplicate, &offset);
1257 					if (status != B_OK)
1258 						RETURN_ERROR(status);
1259 
1260 					// Copy the array from the fragment node to the duplicate
1261 					// node and free the old entry (by zero'ing all values)
1262 					newDuplicate->overflow_link = array->count;
1263 					memcpy(&newDuplicate->all_key_count, &array->values[0],
1264 						array->Count() * sizeof(off_t));
1265 					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1266 
1267 					array = newDuplicate->DuplicateArray();
1268 					array->Insert(value);
1269 				}
1270 
1271 				// Update the main pointer to link to a duplicate node
1272 				if (cached.MakeWritable(transaction) == NULL)
1273 					return B_IO_ERROR;
1274 
1275 				values[index]
1276 					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1277 						BPLUSTREE_DUPLICATE_NODE, offset));
1278 			}
1279 
1280 			return B_OK;
1281 		}
1282 
1283 		// Put the value into a dedicated duplicate node
1284 
1285 		// search for free space in the duplicate nodes of that key
1286 		duplicate_array* array;
1287 		int32 arrayCount;
1288 		const bplustree_node* duplicate;
1289 		off_t duplicateOffset;
1290 		do {
1291 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1292 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1293 			if (duplicate == NULL)
1294 				return B_IO_ERROR;
1295 
1296 			array = duplicate->DuplicateArray();
1297 			arrayCount =array->Count();
1298 			if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1299 				FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
1300 					B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1301 					duplicateOffset, arrayCount, fStream->ID()));
1302 				return B_BAD_DATA;
1303 			}
1304 		} while (arrayCount >= NUM_DUPLICATE_VALUES
1305 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
1306 
1307 		bplustree_node* writableDuplicate
1308 			= cachedDuplicate.MakeWritable(transaction);
1309 		if (writableDuplicate == NULL)
1310 			return B_IO_ERROR;
1311 
1312 		if (arrayCount < NUM_DUPLICATE_VALUES) {
1313 			array = writableDuplicate->DuplicateArray();
1314 			array->Insert(value);
1315 		} else {
1316 			// no space left - add a new duplicate node
1317 
1318 			bplustree_node* newDuplicate;
1319 			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
1320 				&offset);
1321 			if (status != B_OK)
1322 				RETURN_ERROR(status);
1323 
1324 			// link the two nodes together
1325 			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
1326 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
1327 
1328 			array = newDuplicate->DuplicateArray();
1329 			array->count = 0;
1330 			array->Insert(value);
1331 		}
1332 		return B_OK;
1333 	}
1334 
1335 	// Search for a free duplicate fragment or create a new one
1336 	// to insert the duplicate value into
1337 
1338 	uint32 fragmentIndex = 0;
1339 	bplustree_node* fragment;
1340 	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
1341 			&offset, &fragment, &fragmentIndex) != B_OK) {
1342 		// allocate a new duplicate fragment node
1343 		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
1344 		if (status != B_OK)
1345 			RETURN_ERROR(status);
1346 
1347 		memset(fragment, 0, fNodeSize);
1348 	}
1349 
1350 	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
1351 	array->Insert(oldValue);
1352 	array->Insert(value);
1353 
1354 	if (cached.MakeWritable(transaction) == NULL)
1355 		return B_IO_ERROR;
1356 
1357 	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1358 		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
1359 
1360 	return B_OK;
1361 }
1362 
1363 
1364 void
1365 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
1366 	uint16 keyLength, off_t value)
1367 {
1368 	// should never happen, but who knows?
1369 	if (index > node->NumKeys())
1370 		return;
1371 
1372 	off_t* values = node->Values();
1373 	uint16* keyLengths = node->KeyLengths();
1374 	uint8* keys = node->Keys();
1375 
1376 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
1377 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
1378 		+ keyLength);
1379 
1380 	off_t* newValues = node->Values();
1381 	uint16* newKeyLengths = node->KeyLengths();
1382 
1383 	// move values and copy new value into them
1384 	memmove(newValues + index + 1, values + index,
1385 		sizeof(off_t) * (node->NumKeys() - 1 - index));
1386 	memmove(newValues, values, sizeof(off_t) * index);
1387 
1388 	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
1389 
1390 	// move and update key length index
1391 	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
1392 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1393 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
1394 	}
1395 	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
1396 
1397 	int32 keyStart;
1398 	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
1399 		+ (keyStart = index > 0
1400 			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
1401 
1402 	// move keys and copy new key into them
1403 	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
1404 	int32 size = node->AllKeyLength() - length;
1405 	if (size > 0)
1406 		memmove(keys + length, keys + length - keyLength, size);
1407 
1408 	memcpy(keys + keyStart, key, keyLength);
1409 }
1410 
1411 
1412 /*!	Splits the \a node into two halves - the other half will be put into
1413 	\a other. It also takes care to create a new overflow link if the node
1414 	to split is an index node.
1415 */
1416 status_t
1417 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
1418 	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
1419 	uint16* _keyLength, off_t* _value)
1420 {
1421 	if (*_keyIndex > node->NumKeys() + 1)
1422 		return B_BAD_VALUE;
1423 
1424 	uint16* inKeyLengths = node->KeyLengths();
1425 	off_t* inKeyValues = node->Values();
1426 	uint8* inKeys = node->Keys();
1427 	uint8* outKeys = other->Keys();
1428 	int32 keyIndex = *_keyIndex;	// can become less than zero!
1429 
1430 	if (keyIndex > node->NumKeys()) {
1431 		FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
1432 			"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
1433 		return B_BAD_VALUE;
1434 	}
1435 
1436 	// How many keys will fit in one (half) page?
1437 	// The following loop will find the answer to this question and
1438 	// change the key lengths indices for their new home
1439 
1440 	// "bytes" is the number of bytes written for the new key,
1441 	// "bytesBefore" are the bytes before that key
1442 	// "bytesAfter" are the bytes after the new key, if any
1443 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1444 
1445 	size_t size = fNodeSize >> 1;
1446 	int32 out, in;
1447 	size_t keyLengths = 0;
1448 	for (in = out = 0; in < node->NumKeys() + 1;) {
1449 		keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]);
1450 
1451 		if (in == keyIndex && !bytes) {
1452 			bytes = *_keyLength;
1453 			bytesBefore = in > 0
1454 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1455 		} else {
1456 			if (keyIndex < out)
1457 				bytesAfter = keyLengths - bytesBefore;
1458 
1459 			in++;
1460 		}
1461 		out++;
1462 
1463 		if (key_align(sizeof(bplustree_node) + bytes + keyLengths)
1464 				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1465 			// we have found the number of keys in the new node!
1466 			break;
1467 		}
1468 	}
1469 
1470 	// if the new key was not inserted, set the length of the keys
1471 	// that can be copied directly
1472 
1473 	if (keyIndex >= out && in > 0)
1474 		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1475 	else if (keyIndex + 1 < out)
1476 		bytesAfter = keyLengths - bytesBefore;
1477 
1478 	if (bytesBefore < 0 || bytesAfter < 0)
1479 		return B_BAD_DATA;
1480 
1481 	other->left_link = node->left_link;
1482 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1483 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1484 		+ bytesAfter);
1485 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1486 
1487 	uint16* outKeyLengths = other->KeyLengths();
1488 	off_t* outKeyValues = other->Values();
1489 	int32 keys = out > keyIndex ? keyIndex : out;
1490 
1491 	if (bytesBefore) {
1492 		// copy the keys
1493 		memcpy(outKeys, inKeys, bytesBefore);
1494 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1495 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1496 	}
1497 	if (bytes) {
1498 		// copy the newly inserted key
1499 		memcpy(outKeys + bytesBefore, key, bytes);
1500 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1501 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1502 
1503 		if (bytesAfter) {
1504 			// copy the keys after the new key
1505 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1506 				bytesAfter);
1507 			keys = out - keyIndex - 1;
1508 			for (int32 i = 0;i < keys;i++) {
1509 				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1510 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
1511 						+ bytes);
1512 			}
1513 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1514 				keys * sizeof(off_t));
1515 		}
1516 	}
1517 
1518 	// if the new key was already inserted, we shouldn't use it again
1519 	if (in != out)
1520 		keyIndex--;
1521 
1522 	int32 total = bytesBefore + bytesAfter;
1523 
1524 	// these variables are for the key that will be returned
1525 	// to the parent node
1526 	uint8* newKey = NULL;
1527 	uint16 newLength;
1528 	bool newAllocated = false;
1529 
1530 	// If we have split an index node, we have to drop the first key
1531 	// of the next node (which can also be the new key to insert).
1532 	// The dropped key is also the one which has to be inserted in
1533 	// the parent node, so we will set the "newKey" already here.
1534 	if (node->OverflowLink() != BPLUSTREE_NULL) {
1535 		if (in == keyIndex) {
1536 			newKey = key;
1537 			newLength = *_keyLength;
1538 
1539 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1540 			keyIndex--;
1541 		} else {
1542 			// If a key is dropped (is not the new key), we have to copy
1543 			// it, because it would be lost if not.
1544 			uint8* droppedKey = node->KeyAt(in, &newLength);
1545 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1546 					> (uint8*)node + fNodeSize
1547 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1548 				fStream->GetVolume()->Panic();
1549 				RETURN_ERROR(B_BAD_DATA);
1550 			}
1551 			newKey = (uint8*)malloc(newLength);
1552 			if (newKey == NULL)
1553 				return B_NO_MEMORY;
1554 
1555 			newAllocated = true;
1556 			memcpy(newKey, droppedKey, newLength);
1557 
1558 			other->overflow_link = inKeyValues[in];
1559 			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1560 		}
1561 	}
1562 
1563 	// and now the same game for the other page and the rest of the keys
1564 	// (but with memmove() instead of memcpy(), because they may overlap)
1565 
1566 	bytesBefore = bytesAfter = bytes = 0;
1567 	out = 0;
1568 	int32 skip = in;
1569 	while (in < node->NumKeys() + 1) {
1570 		if (in == keyIndex && !bytes) {
1571 			// it's enough to set bytesBefore once here, because we do
1572 			// not need to know the exact length of all keys in this
1573 			// loop
1574 			bytesBefore = in > skip
1575 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1576 			bytes = *_keyLength;
1577 			out++;
1578 		} else {
1579 			if (in < node->NumKeys()) {
1580 				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1581 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1582 
1583 				if (bytes) {
1584 					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1585 						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1586 
1587 					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1588 						- bytesBefore - bytes;
1589 				}
1590 				out++;
1591 			}
1592 			in++;
1593 		}
1594 	}
1595 
1596 	// adjust the byte counts (since we were a bit lazy in the loop)
1597 	if (keyIndex < skip)
1598 		bytesBefore = node->AllKeyLength() - total;
1599 
1600 	if (bytesBefore < 0 || bytesAfter < 0) {
1601 		if (newAllocated)
1602 			free(newKey);
1603 		return B_BAD_DATA;
1604 	}
1605 
1606 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1607 		// right link, and overflow link can stay the same
1608 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1609 		+ bytesAfter);
1610 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1611 
1612 	// array positions have changed
1613 	outKeyLengths = node->KeyLengths();
1614 	outKeyValues = node->Values();
1615 
1616 	// move the keys in the old node: the order is important here,
1617 	// because we don't want to overwrite any contents
1618 
1619 	keys = keyIndex <= skip ? out : keyIndex - skip;
1620 	keyIndex -= skip;
1621 	in = out - keyIndex - 1;
1622 		// Note: keyIndex and in will contain invalid values when the new key
1623 		// went to the other node. But in this case bytes and bytesAfter are
1624 		// 0 and subsequently we never use keyIndex and in.
1625 
1626 	if (bytesBefore)
1627 		memmove(inKeys, inKeys + total, bytesBefore);
1628 	if (bytesAfter) {
1629 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1630 			bytesAfter);
1631 	}
1632 
1633 	if (bytesBefore)
1634 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1635 	if (bytesAfter) {
1636 		// if byteAfter is > 0, keyIndex is larger than skip
1637 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1638 			in * sizeof(uint16));
1639 	}
1640 
1641 	if (bytesBefore)
1642 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1643 	if (bytesAfter) {
1644 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1645 			in * sizeof(off_t));
1646 	}
1647 
1648 	if (bytes) {
1649 		// finally, copy the newly inserted key (don't overwrite anything)
1650 		memcpy(inKeys + bytesBefore, key, bytes);
1651 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1652 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1653 	}
1654 
1655 	// Prepare the key that will be inserted in the parent node which
1656 	// is either the dropped key or the last of the other node.
1657 	// If it's the dropped key, "newKey" was already set earlier.
1658 
1659 	if (newKey == NULL)
1660 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1661 
1662 	memcpy(key, newKey, newLength);
1663 	*_keyLength = newLength;
1664 	*_value = otherOffset;
1665 
1666 	if (newAllocated)
1667 		free(newKey);
1668 
1669 	return B_OK;
1670 }
1671 
1672 
1673 /*!	This inserts a key into the tree. The changes made to the tree will
1674 	all be part of the \a transaction.
1675 	You need to have the inode write locked.
1676 */
1677 status_t
1678 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
1679 	off_t value)
1680 {
1681 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1682 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1683 		RETURN_ERROR(B_BAD_VALUE);
1684 #ifdef DEBUG
1685 	if (value < 0)
1686 		panic("tried to insert invalid value %" B_PRId64 "!\n", value);
1687 #endif
1688 
1689 	ASSERT_WRITE_LOCKED_INODE(fStream);
1690 
1691 	Stack<node_and_key> stack;
1692 	if (_SeekDown(stack, key, keyLength) != B_OK)
1693 		RETURN_ERROR(B_ERROR);
1694 
1695 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
1696 
1697 	memcpy(keyBuffer, key, keyLength);
1698 	keyBuffer[keyLength] = 0;
1699 
1700 	node_and_key nodeAndKey;
1701 	const bplustree_node* node;
1702 
1703 	CachedNode cached(this);
1704 	while (stack.Pop(&nodeAndKey)
1705 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1706 #ifdef DEBUG
1707 		NodeChecker checker(node, fNodeSize, "insert");
1708 #endif
1709 		if (node->IsLeaf()) {
1710 			// first round, check for duplicate entries
1711 			status_t status = _FindKey(node, key, keyLength,
1712 				&nodeAndKey.keyIndex);
1713 
1714 			// is this a duplicate entry?
1715 			if (status == B_OK) {
1716 				if (fAllowDuplicates) {
1717 					status = _InsertDuplicate(transaction, cached, node,
1718 						nodeAndKey.keyIndex, value);
1719 					if (status != B_OK)
1720 						RETURN_ERROR(status);
1721 					return B_OK;
1722 				}
1723 
1724 				return B_NAME_IN_USE;
1725 			}
1726 		}
1727 
1728 		bplustree_node* writableNode = cached.MakeWritable(transaction);
1729 		if (writableNode == NULL)
1730 			return B_IO_ERROR;
1731 
1732 		// is the node big enough to hold the pair?
1733 		if (int32(key_align(sizeof(bplustree_node)
1734 				+ writableNode->AllKeyLength() + keyLength)
1735 				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
1736 				+ sizeof(off_t))) < fNodeSize) {
1737 			_InsertKey(writableNode, nodeAndKey.keyIndex,
1738 				keyBuffer, keyLength, value);
1739 			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1740 				nodeAndKey.keyIndex, 0, 1);
1741 
1742 			return B_OK;
1743 		} else {
1744 			CachedNode cachedNewRoot(this);
1745 			CachedNode cachedOther(this);
1746 
1747 			// do we need to allocate a new root node? if so, then do
1748 			// it now
1749 			off_t newRoot = BPLUSTREE_NULL;
1750 			if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
1751 				bplustree_node* root;
1752 				status_t status = cachedNewRoot.Allocate(transaction, &root,
1753 					&newRoot);
1754 				if (status != B_OK) {
1755 					// The tree is most likely corrupted!
1756 					// But it's still sane at leaf level - we could set
1757 					// a flag in the header that forces the tree to be
1758 					// rebuild next time...
1759 					// But since we will have journaling, that's not a big
1760 					// problem anyway.
1761 					RETURN_ERROR(status);
1762 				}
1763 			}
1764 
1765 			// reserve space for the other node
1766 			bplustree_node* other;
1767 			off_t otherOffset;
1768 			status_t status = cachedOther.Allocate(transaction, &other,
1769 				&otherOffset);
1770 			if (status != B_OK) {
1771 				cachedNewRoot.Free(transaction, newRoot);
1772 				RETURN_ERROR(status);
1773 			}
1774 
1775 			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1776 					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1777 					&value) != B_OK) {
1778 				// free root node & other node here
1779 				cachedOther.Free(transaction, otherOffset);
1780 				cachedNewRoot.Free(transaction, newRoot);
1781 
1782 				RETURN_ERROR(B_ERROR);
1783 			}
1784 #ifdef DEBUG
1785 			checker.Check("insert split");
1786 			NodeChecker otherChecker(other, fNodeSize, "insert split other");
1787 #endif
1788 
1789 			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1790 				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1791 
1792 			// update the right link of the node in the left of the new node
1793 			if ((other = cachedOther.SetToWritable(transaction,
1794 					other->LeftLink())) != NULL) {
1795 				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1796 			}
1797 
1798 			// create a new root if necessary
1799 			if (newRoot != BPLUSTREE_NULL) {
1800 				bplustree_node* root = cachedNewRoot.Node();
1801 
1802 				_InsertKey(root, 0, keyBuffer, keyLength,
1803 					writableNode->LeftLink());
1804 				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1805 					nodeAndKey.nodeOffset);
1806 
1807 				CachedNode cached(this);
1808 				bplustree_header* header
1809 					= cached.SetToWritableHeader(transaction);
1810 				if (header == NULL)
1811 					return B_IO_ERROR;
1812 
1813 				// finally, update header to point to the new root
1814 				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1815 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1816 					header->MaxNumberOfLevels() + 1);
1817 
1818 				return B_OK;
1819 			}
1820 		}
1821 	}
1822 	RETURN_ERROR(B_ERROR);
1823 }
1824 
1825 
1826 /*!	Removes the duplicate index/value pair from the tree.
1827 	It's part of the private tree interface.
1828 */
1829 status_t
1830 BPlusTree::_RemoveDuplicate(Transaction& transaction,
1831 	const bplustree_node* node, CachedNode& cached, uint16 index,
1832 	off_t value)
1833 {
1834 	off_t* values = node->Values();
1835 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1836 
1837 	CachedNode cachedDuplicate(this);
1838 	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1839 	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
1840 		duplicateOffset, false);
1841 	if (duplicate == NULL)
1842 		return B_IO_ERROR;
1843 
1844 	// if it's a duplicate fragment, remove the entry from there
1845 	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1846 		duplicate_array* array = duplicate->FragmentAt(
1847 			bplustree_node::FragmentIndex(oldValue));
1848 		int32 arrayCount = array->Count();
1849 
1850 		if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
1851 			FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
1852 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1853 				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
1854 				arrayCount, fStream->ID()));
1855 			return B_BAD_DATA;
1856 		}
1857 		if (!array->Remove(value)) {
1858 			FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
1859 				"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
1860 				duplicateOffset, fStream->ID()));
1861 			return B_ENTRY_NOT_FOUND;
1862 		}
1863 
1864 		// remove the array from the fragment node if it is empty
1865 		if (--arrayCount == 1) {
1866 			// set the link to the remaining value
1867 			if (cached.MakeWritable(transaction) == NULL)
1868 				return B_IO_ERROR;
1869 
1870 			values[index] = array->values[0];
1871 
1872 			// Remove the whole fragment node, if this was the only array,
1873 			// otherwise free just the array
1874 			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1875 				status_t status = cachedDuplicate.Free(transaction,
1876 					duplicateOffset);
1877 				if (status != B_OK)
1878 					return status;
1879 			} else
1880 				array->count = 0;
1881 		}
1882 		return B_OK;
1883 	}
1884 
1885 	// Remove value from a duplicate node!
1886 
1887 	duplicate_array* array = NULL;
1888 	int32 arrayCount = 0;
1889 
1890 	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1891 		FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
1892 			", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
1893 		return B_BAD_DATA;
1894 	}
1895 
1896 	// Search the duplicate nodes until the entry could be found (and removed)
1897 	while (duplicate != NULL) {
1898 		array = duplicate->DuplicateArray();
1899 		arrayCount = array->Count();
1900 
1901 		if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1902 			FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
1903 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1904 				duplicateOffset, arrayCount, fStream->ID()));
1905 			return B_BAD_DATA;
1906 		}
1907 
1908 		if (array->Remove(value)) {
1909 			arrayCount--;
1910 			break;
1911 		}
1912 
1913 		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1914 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1915 
1916 		cachedDuplicate.UnsetUnchanged(transaction);
1917 		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
1918 			false);
1919 	}
1920 	if (duplicate == NULL)
1921 		RETURN_ERROR(B_IO_ERROR);
1922 
1923 	// The entry got removed from the duplicate node, but we might want to free
1924 	// it now in case it's empty
1925 
1926 	while (true) {
1927 		off_t left = duplicate->LeftLink();
1928 		off_t right = duplicate->RightLink();
1929 		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1930 
1931 		if ((isLast && arrayCount == 1) || arrayCount == 0) {
1932 			// Free empty duplicate page, link their siblings together, and
1933 			// update the duplicate link if needed (ie. when we either remove
1934 			// the last duplicate node or have a new first one)
1935 
1936 			if (left == BPLUSTREE_NULL) {
1937 				// the duplicate link points to us
1938 				if (cached.MakeWritable(transaction) == NULL)
1939 					return B_IO_ERROR;
1940 
1941 				if (arrayCount == 1) {
1942 					// This is the last node, and there is only one value left;
1943 					// replace the duplicate link with that value, it's no
1944 					// duplicate anymore
1945 					values[index] = array->values[0];
1946 				} else {
1947 					// Move the duplicate link to the next node
1948 					values[index] = HOST_ENDIAN_TO_BFS_INT64(
1949 						bplustree_node::MakeLink(
1950 							BPLUSTREE_DUPLICATE_NODE, right));
1951 				}
1952 			}
1953 
1954 			status_t status = cachedDuplicate.Free(transaction,
1955 				duplicateOffset);
1956 			if (status != B_OK)
1957 				return status;
1958 
1959 			if (left != BPLUSTREE_NULL
1960 				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
1961 						false)) != NULL) {
1962 				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1963 
1964 				// If the next node is the last node, we need to free that node
1965 				// and convert the duplicate entry back into a normal entry
1966 				array = duplicate->DuplicateArray();
1967 				arrayCount = array->Count();
1968 				if (right == BPLUSTREE_NULL
1969 					&& duplicate->LeftLink() == BPLUSTREE_NULL
1970 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1971 					duplicateOffset = left;
1972 					continue;
1973 				}
1974 			}
1975 			if (right != BPLUSTREE_NULL
1976 				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
1977 						right, false)) != NULL) {
1978 				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1979 
1980 				// Again, we may need to turn the duplicate entry back into a
1981 				// normal entry
1982 				array = duplicate->DuplicateArray();
1983 				arrayCount = array->Count();
1984 				if (left == BPLUSTREE_NULL
1985 					&& duplicate->RightLink() == BPLUSTREE_NULL
1986 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1987 					duplicateOffset = right;
1988 					continue;
1989 				}
1990 			}
1991 			return B_OK;
1992 		}
1993 		if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
1994 			// If the number of entries fits in a duplicate fragment, then
1995 			// either find a free fragment node, or convert this node to a
1996 			// fragment node.
1997 			CachedNode cachedOther(this);
1998 
1999 			bplustree_node* fragment = NULL;
2000 			uint32 fragmentIndex = 0;
2001 			off_t offset;
2002 			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
2003 					&offset, &fragment, &fragmentIndex) == B_OK) {
2004 				// move to other node
2005 				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
2006 				memcpy(target, array,
2007 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2008 
2009 				cachedDuplicate.Free(transaction, duplicateOffset);
2010 				duplicateOffset = offset;
2011 			} else {
2012 				// convert node
2013 				memmove(duplicate, array,
2014 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2015 				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
2016 					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2017 			}
2018 
2019 			if (cached.MakeWritable(transaction) == NULL)
2020 				return B_IO_ERROR;
2021 
2022 			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
2023 				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
2024 		}
2025 		return B_OK;
2026 	}
2027 }
2028 
2029 
2030 /*!	Removes the key with the given index from the specified node.
2031 	Since it has to get the key from the node anyway (to obtain it's
2032 	pointer), it's not needed to pass the key & its length, although
2033 	the calling method (BPlusTree::Remove()) have this data.
2034 */
2035 void
2036 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
2037 {
2038 	// should never happen, but who knows?
2039 	if (index > node->NumKeys() && node->NumKeys() > 0) {
2040 		FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF
2041 			"\n", index, fStream->ID()));
2042 		return;
2043 	}
2044 
2045 	off_t* values = node->Values();
2046 
2047 	// if we would have to drop the overflow link, drop
2048 	// the last key instead and update the overflow link
2049 	// to the value of that one
2050 	if (!node->IsLeaf() && index == node->NumKeys())
2051 		node->overflow_link = values[--index];
2052 
2053 	uint16 length;
2054 	uint8* key = node->KeyAt(index, &length);
2055 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
2056 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2057 		FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length,
2058 			fStream->ID()));
2059 		fStream->GetVolume()->Panic();
2060 		return;
2061 	}
2062 
2063 	uint16* keyLengths = node->KeyLengths();
2064 	uint8* keys = node->Keys();
2065 
2066 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
2067 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
2068 		node->AllKeyLength() - length);
2069 
2070 	off_t* newValues = node->Values();
2071 	uint16* newKeyLengths = node->KeyLengths();
2072 
2073 	// move key data
2074 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
2075 
2076 	// move and update key lengths
2077 	if (index > 0 && newKeyLengths != keyLengths)
2078 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
2079 	for (uint16 i = index; i < node->NumKeys(); i++) {
2080 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
2081 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
2082 	}
2083 
2084 	// move values
2085 	if (index > 0)
2086 		memmove(newValues, values, index * sizeof(off_t));
2087 	if (node->NumKeys() > index) {
2088 		memmove(newValues + index, values + index + 1,
2089 			(node->NumKeys() - index) * sizeof(off_t));
2090 	}
2091 }
2092 
2093 
2094 /*!	Removes the specified key from the tree. The "value" parameter is only used
2095 	for trees which allow duplicates, so you may safely ignore it.
2096 	It's not an optional parameter, so at least you have to think about it.
2097 	You need to have the inode write locked.
2098 */
2099 status_t
2100 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
2101 	off_t value)
2102 {
2103 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2104 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2105 		RETURN_ERROR(B_BAD_VALUE);
2106 
2107 	ASSERT_WRITE_LOCKED_INODE(fStream);
2108 
2109 	Stack<node_and_key> stack;
2110 	if (_SeekDown(stack, key, keyLength) != B_OK)
2111 		RETURN_ERROR(B_ERROR);
2112 
2113 	node_and_key nodeAndKey;
2114 	const bplustree_node* node;
2115 
2116 	CachedNode cached(this);
2117 	while (stack.Pop(&nodeAndKey)
2118 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
2119 #ifdef DEBUG
2120 		NodeChecker checker(node, fNodeSize, "remove");
2121 #endif
2122 		if (node->IsLeaf()) {
2123 			// first round, check for duplicate entries
2124 			status_t status = _FindKey(node, key, keyLength,
2125 				&nodeAndKey.keyIndex);
2126 			if (status != B_OK)
2127 				RETURN_ERROR(status);
2128 
2129 			// Is this a duplicate entry?
2130 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
2131 					node->Values()[nodeAndKey.keyIndex]))) {
2132 				if (fAllowDuplicates) {
2133 					return _RemoveDuplicate(transaction, node, cached,
2134 						nodeAndKey.keyIndex, value);
2135 				}
2136 
2137 				FATAL(("dupliate node found where no duplicates are "
2138 					"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
2139 				RETURN_ERROR(B_ERROR);
2140 			} else {
2141 				if (node->Values()[nodeAndKey.keyIndex] != value)
2142 					return B_ENTRY_NOT_FOUND;
2143 
2144 				// If we will remove the last key, the iterator will be set
2145 				// to the next node after the current - if there aren't any
2146 				// more nodes, we need a way to prevent the TreeIterators to
2147 				// touch the old node again, we use BPLUSTREE_FREE for this
2148 				off_t next = node->RightLink() == BPLUSTREE_NULL
2149 					? BPLUSTREE_FREE : node->RightLink();
2150 				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
2151 					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
2152 			}
2153 		}
2154 
2155 		bplustree_node* writableNode = cached.MakeWritable(transaction);
2156 		if (writableNode == NULL)
2157 			return B_IO_ERROR;
2158 
2159 		// if it's an empty root node, we have to convert it
2160 		// to a leaf node by dropping the overflow link, or,
2161 		// if it's already a leaf node, just empty it
2162 		if (nodeAndKey.nodeOffset == fHeader.RootNode()
2163 			&& (node->NumKeys() == 0
2164 				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
2165 			writableNode->overflow_link
2166 				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2167 			writableNode->all_key_count = 0;
2168 			writableNode->all_key_length = 0;
2169 
2170 			// if we've made a leaf node out of the root node, we need
2171 			// to reset the maximum number of levels in the header
2172 			if (fHeader.MaxNumberOfLevels() != 1) {
2173 				CachedNode cached(this);
2174 				bplustree_header* header
2175 					= cached.SetToWritableHeader(transaction);
2176 				if (header == NULL)
2177 					return B_IO_ERROR;
2178 
2179 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
2180 			}
2181 			return B_OK;
2182 		}
2183 
2184 		// if there is only one key left, we don't have to remove
2185 		// it, we can just dump the node (index nodes still have
2186 		// the overflow link, so we have to drop the last key)
2187 		if (writableNode->NumKeys() > 1
2188 			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
2189 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
2190 			return B_OK;
2191 		}
2192 
2193 		// when we are here, we can just free the node, but
2194 		// we have to update the right/left link of the
2195 		// siblings first
2196 		CachedNode otherCached(this);
2197 		bplustree_node* other = otherCached.SetToWritable(transaction,
2198 			writableNode->LeftLink());
2199 		if (other != NULL)
2200 			other->right_link = writableNode->right_link;
2201 
2202 		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
2203 				!= NULL) {
2204 			other->left_link = writableNode->left_link;
2205 		}
2206 
2207 		cached.Free(transaction, nodeAndKey.nodeOffset);
2208 	}
2209 	RETURN_ERROR(B_ERROR);
2210 }
2211 
2212 
2213 /*!	Replaces the value for the key in the tree.
2214 	Returns B_OK if the key could be found and its value replaced,
2215 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
2216 	to indicate that something went terribly wrong.
2217 	Note that this doesn't work with duplicates - it will just
2218 	return B_BAD_TYPE if you call this function on a tree where
2219 	duplicates are allowed.
2220 	You need to have the inode write locked.
2221 */
2222 status_t
2223 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
2224 	off_t value)
2225 {
2226 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2227 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2228 		|| key == NULL)
2229 		RETURN_ERROR(B_BAD_VALUE);
2230 
2231 	if (fAllowDuplicates)
2232 		RETURN_ERROR(B_BAD_TYPE);
2233 
2234 	ASSERT_WRITE_LOCKED_INODE(fStream);
2235 
2236 	off_t nodeOffset = fHeader.RootNode();
2237 	CachedNode cached(this);
2238 	const bplustree_node* node;
2239 
2240 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2241 		uint16 keyIndex = 0;
2242 		off_t nextOffset;
2243 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2244 			&nextOffset);
2245 
2246 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2247 			if (status == B_OK) {
2248 				bplustree_node* writableNode = cached.MakeWritable(transaction);
2249 				if (writableNode != NULL) {
2250 					writableNode->Values()[keyIndex]
2251 						= HOST_ENDIAN_TO_BFS_INT64(value);
2252 				} else
2253 					status = B_IO_ERROR;
2254 			}
2255 
2256 			return status;
2257 		} else if (nextOffset == nodeOffset)
2258 			RETURN_ERROR(B_ERROR);
2259 
2260 		nodeOffset = nextOffset;
2261 	}
2262 	RETURN_ERROR(B_ERROR);
2263 }
2264 #endif // !_BOOT_MODE
2265 
2266 
2267 /*!	Searches the key in the tree, and stores the offset found in _value,
2268 	if successful.
2269 	It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
2270 	while it descends the tree.
2271 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
2272 	It can also return other errors to indicate that something went wrong.
2273 	Note that this doesn't work with duplicates - it will just return
2274 	B_BAD_TYPE if you call this function on a tree where duplicates are
2275 	allowed.
2276 	You need to have the inode read or write locked.
2277 */
2278 status_t
2279 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
2280 {
2281 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2282 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2283 		|| key == NULL)
2284 		RETURN_ERROR(B_BAD_VALUE);
2285 
2286 	if (fAllowDuplicates)
2287 		RETURN_ERROR(B_BAD_TYPE);
2288 
2289 #if !_BOOT_MODE
2290 	ASSERT_READ_LOCKED_INODE(fStream);
2291 #endif
2292 
2293 	off_t nodeOffset = fHeader.RootNode();
2294 	CachedNode cached(this);
2295 	const bplustree_node* node;
2296 
2297 #ifdef DEBUG
2298 	int32 levels = 0;
2299 #endif
2300 
2301 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2302 		uint16 keyIndex = 0;
2303 		off_t nextOffset;
2304 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2305 			&nextOffset);
2306 
2307 #ifdef DEBUG
2308 		levels++;
2309 #endif
2310 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2311 			if (status == B_OK && _value != NULL)
2312 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
2313 
2314 #ifdef DEBUG
2315 			if (levels != (int32)fHeader.MaxNumberOfLevels())
2316 				DEBUGGER(("levels don't match"));
2317 #endif
2318 			return status;
2319 		} else if (nextOffset == nodeOffset)
2320 			RETURN_ERROR(B_ERROR);
2321 
2322 		nodeOffset = nextOffset;
2323 	}
2324 	FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
2325 		B_PRIdOFF "\n", nodeOffset, fStream->ID()));
2326 	RETURN_ERROR(B_ERROR);
2327 }
2328 
2329 
2330 #if !_BOOT_MODE
2331 status_t
2332 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
2333 	const uint8* largestKey, uint16 largestKeyLength,
2334 	const bplustree_node* parent)
2335 {
2336 	if (parent->CheckIntegrity(fNodeSize) != B_OK) {
2337 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
2338 			"failed!\n", fStream->ID(), offset);
2339 		check.FoundError();
2340 		return B_OK;
2341 	}
2342 	if (level >= fHeader.MaxNumberOfLevels()) {
2343 		dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
2344 			"!\n", fStream->ID(), offset);
2345 		check.FoundError();
2346 		return B_OK;
2347 	}
2348 
2349 	check.SetLevel(level);
2350 
2351 	if (check.Visited(offset)) {
2352 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
2353 			fStream->ID(), offset);
2354 		check.FoundError();
2355 		return B_OK;
2356 	}
2357 
2358 	check.SetVisited(offset);
2359 
2360 	uint32 count = parent->NumKeys();
2361 	off_t* values = parent->Values();
2362 	off_t lastOffset = check.PreviousOffset(level);
2363 	CachedNode cached(this);
2364 
2365 	for (uint32 i = 0; i < count; i++) {
2366 		uint16 keyLength;
2367 		uint8* key = parent->KeyAt(i, &keyLength);
2368 		if (largestKey != NULL) {
2369 			int result = _CompareKeys(key, keyLength, largestKey,
2370 				largestKeyLength);
2371 			if (result > 0 || (result == 0 && i != count - 1)) {
2372 				dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
2373 					B_PRIu32 " larger than it should!\n",
2374 					fStream->ID(), offset, i);
2375 				check.FoundError();
2376 			}
2377 		}
2378 
2379 		off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
2380 		if (bplustree_node::IsDuplicate(childOffset)) {
2381 			// Walk the duplicate nodes
2382 			off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
2383 			off_t lastDuplicateOffset = BPLUSTREE_NULL;
2384 
2385 			while (duplicateOffset != BPLUSTREE_NULL) {
2386 				const bplustree_node* node;
2387 				status_t status = cached.SetTo(duplicateOffset, &node, false);
2388 				if (status != B_OK) {
2389 					if (status == B_IO_ERROR)
2390 						return B_IO_ERROR;
2391 
2392 					dprintf("inode %" B_PRIdOFF ": duplicate node at %"
2393 						B_PRIdOFF " could not be read: %s\n", fStream->ID(),
2394 						duplicateOffset, strerror(status));
2395 					check.FoundError();
2396 					break;
2397 				}
2398 
2399 				bool isFragmentNode = bplustree_node::LinkType(childOffset)
2400 					== BPLUSTREE_DUPLICATE_FRAGMENT;
2401 				bool isKnownFragment = isFragmentNode
2402 					&& check.VisitedFragment(duplicateOffset);
2403 
2404 				if (!isKnownFragment && check.Visited(duplicateOffset)) {
2405 					dprintf("inode %" B_PRIdOFF ": %s node at %"
2406 						B_PRIdOFF " already visited, referenced from %"
2407 						B_PRIdOFF "!\n", fStream->ID(),
2408 						isFragmentNode ? "fragment" : "duplicate",
2409 						duplicateOffset, offset);
2410 					check.FoundError();
2411 					break;
2412 				}
2413 
2414 				// Fragment nodes may be visited more than once from different
2415 				// places
2416 				if (!check.Visited(duplicateOffset))
2417 					check.SetVisited(duplicateOffset);
2418 				if (!isKnownFragment && isFragmentNode)
2419 					check.SetVisitedFragment(duplicateOffset);
2420 
2421 				duplicate_array* array;
2422 				int32 minSize;
2423 				int32 maxSize;
2424 				if (isFragmentNode) {
2425 					array = node->FragmentAt(
2426 						bplustree_node::FragmentIndex(childOffset));
2427 					minSize = 2;
2428 					maxSize = NUM_FRAGMENT_VALUES;
2429 				} else {
2430 					array = node->DuplicateArray();
2431 					minSize = 1;
2432 					maxSize = NUM_DUPLICATE_VALUES;
2433 				}
2434 				int32 arrayCount = array->Count();
2435 
2436 				if (arrayCount < minSize || arrayCount > maxSize) {
2437 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2438 						" has invalid array size %" B_PRId32 "!\n",
2439 						fStream->ID(), duplicateOffset, arrayCount);
2440 					check.FoundError();
2441 				} else {
2442 					// Simple check if the values in the array may be valid
2443 					for (int32 j = 0; j < arrayCount; j++) {
2444 						if (!fStream->GetVolume()->IsValidInodeBlock(
2445 								array->ValueAt(j))) {
2446 							dprintf("inode %" B_PRIdOFF ": duplicate at %"
2447 								B_PRIdOFF " contains invalid block %" B_PRIdOFF
2448 								" at %" B_PRId32 "!\n", fStream->ID(),
2449 								duplicateOffset, array->ValueAt(j), j);
2450 							check.FoundError();
2451 							break;
2452 						}
2453 					}
2454 				}
2455 
2456 				// A fragment node is not linked (and does not have valid links)
2457 				if (isFragmentNode)
2458 					break;
2459 
2460 				if (node->LeftLink() != lastDuplicateOffset) {
2461 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2462 						" has wrong left link %" B_PRIdOFF ", expected %"
2463 						B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
2464 						node->LeftLink(), lastDuplicateOffset);
2465 					check.FoundError();
2466 				}
2467 
2468 				lastDuplicateOffset = duplicateOffset;
2469 				duplicateOffset = node->RightLink();
2470 			}
2471 		} else if (!parent->IsLeaf()) {
2472 			// Test a regular child node recursively
2473 			off_t nextOffset = parent->OverflowLink();
2474 			if (i < count - 1)
2475 				nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
2476 
2477 			if (i == 0 && lastOffset != BPLUSTREE_NULL) {
2478 				// Test right link of the previous node
2479 				const bplustree_node* previous = cached.SetTo(lastOffset, true);
2480 				if (previous == NULL)
2481 					return B_IO_ERROR;
2482 
2483 				if (previous->RightLink() != childOffset) {
2484 					dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2485 						"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2486 						"!\n", fStream->ID(), lastOffset, previous->RightLink(),
2487 						childOffset);
2488 					check.FoundError();
2489 				}
2490 			}
2491 
2492 			status_t status = _ValidateChild(check, cached, level, childOffset,
2493 				lastOffset, nextOffset, key, keyLength);
2494 			if (status != B_OK)
2495 				return status;
2496 		} else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
2497 			dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
2498 				"invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
2499 				fStream->ID(), offset, childOffset, i);
2500 			check.FoundError();
2501 		}
2502 
2503 		lastOffset = childOffset;
2504 	}
2505 
2506 	if (parent->OverflowLink() != BPLUSTREE_NULL) {
2507 		off_t childOffset = parent->OverflowLink();
2508 		status_t status = _ValidateChild(check, cached, level, childOffset,
2509 			lastOffset, 0, NULL, 0);
2510 		if (status != B_OK)
2511 			return status;
2512 
2513 		lastOffset = childOffset;
2514 	}
2515 
2516 	check.SetPreviousOffset(level, lastOffset);
2517 	return B_OK;
2518 }
2519 
2520 
2521 status_t
2522 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
2523 	off_t offset, off_t lastOffset, off_t nextOffset,
2524 	const uint8* key, uint16 keyLength)
2525 {
2526 	const bplustree_node* node;
2527 	status_t status = cached.SetTo(offset, &node, true);
2528 	if (status != B_OK) {
2529 		if (status == B_IO_ERROR)
2530 			return B_IO_ERROR;
2531 
2532 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
2533 			"read: %s\n", fStream->ID(), offset, strerror(status));
2534 		check.FoundError();
2535 		return B_OK;
2536 	}
2537 
2538 	if (node->LeftLink() != lastOffset) {
2539 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2540 			"wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
2541 			"!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
2542 		check.FoundError();
2543 	}
2544 
2545 	if (nextOffset != 0 && node->RightLink() != nextOffset) {
2546 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2547 			"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2548 			"!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
2549 		check.FoundError();
2550 	}
2551 
2552 	return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
2553 }
2554 #endif // !_BOOT_MODE
2555 
2556 
2557 //	#pragma mark -
2558 
2559 
2560 TreeIterator::TreeIterator(BPlusTree* tree)
2561 	:
2562 	fTree(tree),
2563 	fCurrentNodeOffset(BPLUSTREE_NULL)
2564 {
2565 #if !_BOOT_MODE
2566 	tree->_AddIterator(this);
2567 #endif
2568 }
2569 
2570 
2571 TreeIterator::~TreeIterator()
2572 {
2573 #if !_BOOT_MODE
2574 	if (fTree)
2575 		fTree->_RemoveIterator(this);
2576 #endif
2577 }
2578 
2579 
2580 status_t
2581 TreeIterator::Goto(int8 to)
2582 {
2583 	if (fTree == NULL || fTree->fStream == NULL)
2584 		RETURN_ERROR(B_BAD_VALUE);
2585 
2586 #if !_BOOT_MODE
2587 	// lock access to stream
2588 	InodeReadLocker locker(fTree->fStream);
2589 #endif
2590 
2591 	off_t nodeOffset = fTree->fHeader.RootNode();
2592 	CachedNode cached(fTree);
2593 	const bplustree_node* node;
2594 
2595 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2596 		// is the node a leaf node?
2597 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2598 			fCurrentNodeOffset = nodeOffset;
2599 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
2600 			fDuplicateNode = BPLUSTREE_NULL;
2601 
2602 			return B_OK;
2603 		}
2604 
2605 		// get the next node offset depending on the direction (and if there
2606 		// are any keys in that node at all)
2607 		off_t nextOffset;
2608 		if (to == BPLUSTREE_END || node->all_key_count == 0)
2609 			nextOffset = node->OverflowLink();
2610 		else {
2611 			if (node->AllKeyLength() > fTree->fNodeSize
2612 				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
2613 					- 8 * node->NumKeys())
2614 				RETURN_ERROR(B_ERROR);
2615 
2616 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
2617 		}
2618 		if (nextOffset == nodeOffset)
2619 			break;
2620 
2621 		nodeOffset = nextOffset;
2622 	}
2623 	FATAL(("%s fails\n", __FUNCTION__));
2624 
2625 	RETURN_ERROR(B_ERROR);
2626 }
2627 
2628 
2629 /*!	Iterates through the tree in the specified direction.
2630 	When it iterates through duplicates, the "key" is only updated for the
2631 	first entry - if you need to know when this happens, use the "duplicate"
2632 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
2633 	the other duplicates.
2634 	That's not too nice, but saves the 256 bytes that would be needed to
2635 	store the last key - if this will ever become an issue, it will be
2636 	easy to change.
2637 	The other advantage of this is, that the queries can skip all duplicates
2638 	at once when they are not relevant to them.
2639 */
2640 status_t
2641 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
2642 	uint16 maxLength, off_t* value, uint16* duplicate)
2643 {
2644 	if (fTree == NULL)
2645 		return B_INTERRUPTED;
2646 
2647 	bool forward = direction == BPLUSTREE_FORWARD;
2648 
2649 	if (fCurrentNodeOffset == BPLUSTREE_NULL
2650 		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
2651 		RETURN_ERROR(B_ERROR);
2652 
2653 	// if the tree was emptied since the last call
2654 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
2655 		return B_ENTRY_NOT_FOUND;
2656 
2657 #if !_BOOT_MODE
2658 	// lock access to stream
2659 	InodeReadLocker locker(fTree->fStream);
2660 #endif
2661 
2662 	CachedNode cached(fTree);
2663 	const bplustree_node* node;
2664 
2665 	if (fDuplicateNode != BPLUSTREE_NULL) {
2666 		// regardless of traverse direction the duplicates are always presented
2667 		// in the same order; since they are all considered as equal, this
2668 		// shouldn't cause any problems
2669 
2670 		if (!fIsFragment || fDuplicate < fNumDuplicates) {
2671 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2672 				false);
2673 		} else
2674 			node = NULL;
2675 
2676 		if (node != NULL) {
2677 			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
2678 				// If the node is out of duplicates, we go directly to the next
2679 				// one
2680 				fDuplicateNode = node->RightLink();
2681 				if (fDuplicateNode != BPLUSTREE_NULL
2682 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
2683 					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
2684 						false);
2685 					fDuplicate = 0;
2686 				}
2687 			}
2688 			if (fDuplicate < fNumDuplicates) {
2689 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
2690 					fDuplicate++);
2691 				if (duplicate)
2692 					*duplicate = 2;
2693 				return B_OK;
2694 			}
2695 		}
2696 		fDuplicateNode = BPLUSTREE_NULL;
2697 	}
2698 
2699 	off_t savedNodeOffset = fCurrentNodeOffset;
2700 	int32 savedKey = fCurrentKey;
2701 
2702 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
2703 		RETURN_ERROR(B_ERROR);
2704 
2705 	if (duplicate)
2706 		*duplicate = 0;
2707 
2708 	fCurrentKey += direction;
2709 
2710 	// is the current key in the current node?
2711 	while ((forward && fCurrentKey >= node->NumKeys())
2712 			|| (!forward && fCurrentKey < 0)) {
2713 		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2714 
2715 		// are there any more nodes?
2716 		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2717 			node = cached.SetTo(fCurrentNodeOffset);
2718 			if (!node)
2719 				RETURN_ERROR(B_ERROR);
2720 
2721 			// reset current key
2722 			fCurrentKey = forward ? 0 : node->NumKeys() - 1;
2723 		} else {
2724 			// there are no nodes left, so turn back to the last key
2725 			fCurrentNodeOffset = savedNodeOffset;
2726 			fCurrentKey = savedKey;
2727 
2728 			return B_ENTRY_NOT_FOUND;
2729 		}
2730 	}
2731 
2732 	if (node->all_key_count == 0)
2733 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
2734 
2735 	uint16 length = 0;
2736 	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
2737 	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
2738 			> (uint8*)node + fTree->fNodeSize
2739 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2740 #if !_BOOT_MODE
2741 		fTree->fStream->GetVolume()->Panic();
2742 #endif
2743 		RETURN_ERROR(B_BAD_DATA);
2744 	}
2745 
2746 	// include the termination for string types
2747 	bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE;
2748 	if (length + (needsTermination ? 1 : 0) > maxLength) {
2749 		// the buffer is too small, restore the last key and return an error
2750 		fCurrentNodeOffset = savedNodeOffset;
2751 		fCurrentKey = savedKey;
2752 		return B_BUFFER_OVERFLOW;
2753 	}
2754 
2755 	memcpy(key, keyStart, length);
2756 
2757 	if (needsTermination)
2758 		((char*)key)[length] = '\0';
2759 
2760 	*keyLength = length;
2761 
2762 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2763 
2764 	// duplicate fragments?
2765 	uint8 type = bplustree_node::LinkType(offset);
2766 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
2767 		|| type == BPLUSTREE_DUPLICATE_NODE) {
2768 		fDuplicateNode = offset;
2769 
2770 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2771 			false);
2772 		if (node == NULL)
2773 			RETURN_ERROR(B_ERROR);
2774 
2775 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2776 
2777 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2778 		if (fNumDuplicates) {
2779 			offset = node->DuplicateAt(offset, fIsFragment, 0);
2780 			fDuplicate = 1;
2781 			if (duplicate)
2782 				*duplicate = 1;
2783 		} else {
2784 			// Shouldn't happen, but we're dealing here with potentially
2785 			// corrupt disks...
2786 			fDuplicateNode = BPLUSTREE_NULL;
2787 			offset = 0;
2788 		}
2789 	}
2790 	*value = offset;
2791 
2792 	return B_OK;
2793 }
2794 
2795 
2796 /*!	This is more or less a copy of BPlusTree::Find() - but it just
2797 	sets the current position in the iterator, regardless of if the
2798 	key could be found or not.
2799 */
2800 status_t
2801 TreeIterator::Find(const uint8* key, uint16 keyLength)
2802 {
2803 	if (fTree == NULL)
2804 		return B_INTERRUPTED;
2805 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2806 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2807 		|| key == NULL)
2808 		RETURN_ERROR(B_BAD_VALUE);
2809 
2810 #if !_BOOT_MODE
2811 	// lock access to stream
2812 	InodeReadLocker locker(fTree->fStream);
2813 #endif
2814 
2815 	off_t nodeOffset = fTree->fHeader.RootNode();
2816 
2817 	CachedNode cached(fTree);
2818 	const bplustree_node* node;
2819 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2820 		uint16 keyIndex = 0;
2821 		off_t nextOffset;
2822 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2823 			&nextOffset);
2824 
2825 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2826 			fCurrentNodeOffset = nodeOffset;
2827 			fCurrentKey = keyIndex - 1;
2828 			fDuplicateNode = BPLUSTREE_NULL;
2829 
2830 			return status;
2831 		} else if (nextOffset == nodeOffset)
2832 			RETURN_ERROR(B_ERROR);
2833 
2834 		nodeOffset = nextOffset;
2835 	}
2836 	RETURN_ERROR(B_ERROR);
2837 }
2838 
2839 
2840 void
2841 TreeIterator::SkipDuplicates()
2842 {
2843 	fDuplicateNode = BPLUSTREE_NULL;
2844 }
2845 
2846 
2847 void
2848 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
2849 	uint16 splitAt, int8 change)
2850 {
2851 	if (offset != fCurrentNodeOffset)
2852 		return;
2853 
2854 	if (nextOffset != BPLUSTREE_NULL) {
2855 		fCurrentNodeOffset = nextOffset;
2856 		if (splitAt <= fCurrentKey) {
2857 			fCurrentKey -= splitAt;
2858 			keyIndex -= splitAt;
2859 		}
2860 	}
2861 
2862 	// Adjust fCurrentKey to point to the same key as before.
2863 	// Note, that if a key is inserted at the current position
2864 	// it won't be included in this tree transition.
2865 	if (keyIndex <= fCurrentKey)
2866 		fCurrentKey += change;
2867 
2868 	// TODO: duplicate handling!
2869 }
2870 
2871 
2872 void
2873 TreeIterator::Stop()
2874 {
2875 	fTree = NULL;
2876 }
2877 
2878 
2879 #ifdef DEBUG
2880 void
2881 TreeIterator::Dump()
2882 {
2883 	__out("TreeIterator at %p:\n", this);
2884 	__out("\tfTree = %p\n", fTree);
2885 	__out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset);
2886 	__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
2887 	__out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n",
2888 		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
2889 		fDuplicateNode);
2890 	__out("\tfDuplicate = %u\n", fDuplicate);
2891 	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
2892 	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
2893 }
2894 #endif
2895 
2896 
2897 // #pragma mark -
2898 
2899 
2900 bool
2901 bplustree_header::IsValid() const
2902 {
2903 	return Magic() == BPLUSTREE_MAGIC
2904 		&& (RootNode() % NodeSize()) == 0
2905 		&& IsValidLink(RootNode())
2906 		&& IsValidLink(FreeNode());
2907 }
2908 
2909 
2910 // #pragma mark -
2911 
2912 
2913 void
2914 bplustree_node::Initialize()
2915 {
2916 	left_link = right_link = overflow_link
2917 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2918 	all_key_count = 0;
2919 	all_key_length = 0;
2920 }
2921 
2922 
2923 uint8*
2924 bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2925 {
2926 	if (index < 0 || index > NumKeys())
2927 		return NULL;
2928 
2929 	uint8* keyStart = Keys();
2930 	uint16* keyLengths = KeyLengths();
2931 
2932 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2933 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2934 	if (index > 0)
2935 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2936 
2937 	return keyStart;
2938 }
2939 
2940 
2941 uint8
2942 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2943 {
2944 	// the duplicate fragment handling is currently hard-coded to a node size
2945 	// of 1024 bytes - with future versions of BFS, this may be a problem
2946 
2947 	if (isFragment) {
2948 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2949 
2950 		return ((off_t*)this)[fragment];
2951 	}
2952 	return OverflowLink();
2953 }
2954 
2955 
2956 off_t
2957 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2958 {
2959 	uint32 start;
2960 	if (isFragment)
2961 		start = 8 * ((uint64)offset & 0x3ff);
2962 	else
2963 		start = 2;
2964 
2965 	return ((off_t*)this)[start + 1 + index];
2966 }
2967 
2968 
2969 /*!	Although the name suggests it, this function doesn't return the real
2970 	used fragment count; at least, it can only count to two: it returns
2971 	0, if there is no fragment used, 1 if there is only one fragment
2972 	used, and 2 if there are at least 2 fragments used.
2973 */
2974 uint32
2975 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2976 {
2977 	uint32 used = 0;
2978 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2979 		duplicate_array* array = FragmentAt(i);
2980 		if (array->Count() > 0 && ++used > 1)
2981 			return used;
2982 	}
2983 	return used;
2984 }
2985 
2986 
2987 status_t
2988 bplustree_node::CheckIntegrity(uint32 nodeSize) const
2989 {
2990 	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2991 		DEBUGGER(("invalid node: key/length count"));
2992 
2993 	for (int32 i = 0; i < NumKeys(); i++) {
2994 		uint16 length = 0;
2995 		uint8* key = KeyAt(i, &length);
2996 		if (key + length + sizeof(off_t) + sizeof(uint16)
2997 				> (uint8*)this + nodeSize
2998 			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2999 			dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
3000 			return B_BAD_DATA;
3001 		}
3002 		if (Values()[i] == -1) {
3003 			dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values "
3004 				"corrupted\n", this, (int)i, Values()[i]);
3005 			return B_BAD_DATA;
3006 		}
3007 	}
3008 	return B_OK;
3009 }
3010 
3011 
3012 // #pragma mark -
3013 
3014 
3015 #if !_BOOT_MODE
3016 BitmapArray::BitmapArray(size_t numBits)
3017 {
3018 	fSize = (numBits + 7) / 8;
3019 	fBitmap = (uint8*)calloc(fSize, 1);
3020 	fCountSet = 0;
3021 }
3022 
3023 
3024 BitmapArray::~BitmapArray()
3025 {
3026 	free(fBitmap);
3027 }
3028 
3029 
3030 status_t
3031 BitmapArray::InitCheck() const
3032 {
3033 	return fBitmap != NULL ? B_OK : B_NO_MEMORY;
3034 }
3035 
3036 
3037 bool
3038 BitmapArray::IsSet(size_t index) const
3039 {
3040 	uint32 byteIndex = index / 8;
3041 	if (byteIndex >= fSize)
3042 		return false;
3043 
3044 	return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
3045 }
3046 
3047 
3048 void
3049 BitmapArray::Set(size_t index, bool set)
3050 {
3051 	uint32 byteIndex = index / 8;
3052 	if (byteIndex >= fSize)
3053 		return;
3054 
3055 	if (set) {
3056 		fBitmap[byteIndex] |= 1UL << (index & 0x7);
3057 		fCountSet++;
3058 	} else {
3059 		fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
3060 		fCountSet--;
3061 	}
3062 }
3063 #endif // !_BOOT_MODE
3064 
3065 
3066 // #pragma mark -
3067 
3068 
3069 bool
3070 duplicate_array::_FindInternal(off_t value, int32& index) const
3071 {
3072 	int32 min = 0, max = Count() - 1;
3073 	off_t cmp;
3074 	while (min <= max) {
3075 		index = (min + max) / 2;
3076 
3077 		cmp = ValueAt(index) - value;
3078 		if (cmp < 0)
3079 			min = index + 1;
3080 		else if (cmp > 0)
3081 			max = index - 1;
3082 		else
3083 			return true;
3084 	}
3085 	return false;
3086 }
3087 
3088 
3089 void
3090 duplicate_array::Insert(off_t value)
3091 {
3092 	// if there are more than 8 values in this array, use a
3093 	// binary search, if not, just iterate linearly to find
3094 	// the insertion point
3095 	int32 size = Count();
3096 	int32 i = 0;
3097 	if (size > 8 ) {
3098 		if (!_FindInternal(value, i) && ValueAt(i) <= value)
3099 			i++;
3100 	} else {
3101 		for (i = 0; i < size; i++) {
3102 			if (ValueAt(i) > value)
3103 				break;
3104 		}
3105 	}
3106 
3107 	memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
3108 	values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
3109 	count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
3110 }
3111 
3112 
3113 bool
3114 duplicate_array::Remove(off_t value)
3115 {
3116 	int32 index = Find(value);
3117 	if (index == -1)
3118 		return false;
3119 
3120 	int32 newSize = Count() - 1;
3121 	memmove(&values[index], &values[index + 1],
3122 		(newSize - index) * sizeof(off_t));
3123 	count = HOST_ENDIAN_TO_BFS_INT64(newSize);
3124 
3125 	return true;
3126 }
3127 
3128 
3129 #if _BOOT_MODE
3130 } // namespace BFS
3131 #endif
3132