xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 6c4a44e36ba846c54467103f884d65dfa13e7fcb)
1 /*
2  * Copyright 2001-2012, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 
10 //! B+Tree implementation
11 
12 
13 #include "Debug.h"
14 #include "BPlusTree.h"
15 #include "Inode.h"
16 #include "Utility.h"
17 
18 
19 /*!	Simple array used for the duplicate handling in the B+Tree. This is an
20 	on disk structure.
21 */
22 struct duplicate_array {
23 	off_t	count;
24 	off_t	values[0];
25 
26 	inline bool IsEmpty() const
27 	{
28 		return count == 0;
29 	}
30 
31 	inline int32 Count() const
32 	{
33 		return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
34 	}
35 
36 	inline off_t ValueAt(uint32 index) const
37 	{
38 		return BFS_ENDIAN_TO_HOST_INT64(values[index]);
39 	}
40 
41 	inline void SetValueAt(uint32 index, off_t value)
42 	{
43 		values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
44 	}
45 
46 	inline int32 Find(off_t value) const
47 	{
48 		int32 i;
49 		return _FindInternal(value, i) ? i : -1;
50 	}
51 
52 	void Insert(off_t value);
53 	bool Remove(off_t value);
54 
55 private:
56 	bool _FindInternal(off_t value, int32& index) const;
57 } _PACKED;
58 
59 
60 #ifdef DEBUG
61 class NodeChecker {
62 public:
63 	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
64 		:
65 		fNode(node),
66 		fSize(nodeSize),
67 		fText(text)
68 	{
69 		Check("integrity check failed on construction.");
70 	}
71 
72 	~NodeChecker()
73 	{
74 		Check("integrity check failed on destruction.");
75 	}
76 
77 	void
78 	Check(const char* message)
79 	{
80 		if (fNode->CheckIntegrity(fSize) != B_OK) {
81 			dprintf("%s: %s\n", fText, message);
82 			DEBUGGER(("NodeChecker integrity check failed!"));
83 		}
84 	}
85 
86 private:
87 	const bplustree_node*	fNode;
88 	int32					fSize;
89 	const char*				fText;
90 };
91 #endif
92 
93 
94 class BitmapArray {
95 public:
96 								BitmapArray(size_t numBits);
97 								~BitmapArray();
98 
99 			status_t			InitCheck() const;
100 
101 			bool				IsSet(size_t index) const;
102 			void				Set(size_t index, bool set);
103 
104 			size_t				CountSet() const { return fCountSet; }
105 
106 private:
107 			uint8*				fBitmap;
108 			size_t				fSize;
109 			size_t				fCountSet;
110 };
111 
112 
113 struct TreeCheck {
114 	TreeCheck(BPlusTree* tree)
115 		:
116 		fLevelCount(0),
117 		fFreeCount(0),
118 		fNodeSize(tree->NodeSize()),
119 		fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
120 		fFoundErrors(0),
121 		fVisited(tree->Stream()->Size() / tree->NodeSize()),
122 		fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
123 	{
124 		fPreviousOffsets = (off_t*)malloc(
125 			sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
126 		if (fPreviousOffsets != NULL) {
127 			for (size_t i = 0; i < fMaxLevels; i++)
128 				fPreviousOffsets[i] = BPLUSTREE_NULL;
129 		}
130 	}
131 
132 	~TreeCheck()
133 	{
134 		free(fPreviousOffsets);
135 	}
136 
137 	status_t InitCheck() const
138 	{
139 		if (fPreviousOffsets == NULL)
140 			return B_NO_MEMORY;
141 
142 		status_t status = fVisited.InitCheck();
143 		if (status != B_OK)
144 			return status;
145 
146 		return fVisitedFragment.InitCheck();
147 	}
148 
149 	bool Visited(off_t offset) const
150 	{
151 		return fVisited.IsSet(offset / fNodeSize);
152 	}
153 
154 	void SetVisited(off_t offset)
155 	{
156 		fVisited.Set(offset / fNodeSize, true);
157 	}
158 
159 	size_t VisitedCount() const
160 	{
161 		return fVisited.CountSet();
162 	}
163 
164 	bool VisitedFragment(off_t offset) const
165 	{
166 		return fVisitedFragment.IsSet(offset / fNodeSize);
167 	}
168 
169 	void SetVisitedFragment(off_t offset)
170 	{
171 		fVisitedFragment.Set(offset / fNodeSize, true);
172 	}
173 
174 	uint32 MaxLevels() const
175 	{
176 		return fLevelCount;
177 	}
178 
179 	void SetLevel(uint32 level)
180 	{
181 		if (fLevelCount < level)
182 			fLevelCount = level;
183 	}
184 
185 	off_t PreviousOffset(uint32 level)
186 	{
187 		return fPreviousOffsets[level];
188 	}
189 
190 	void SetPreviousOffset(uint32 level, off_t offset)
191 	{
192 		fPreviousOffsets[level] = offset;
193 	}
194 
195 	void FoundError()
196 	{
197 		fFoundErrors++;
198 	}
199 
200 	bool ErrorsFound()
201 	{
202 		return fFoundErrors != 0;
203 	}
204 
205 private:
206 			uint32				fLevelCount;
207 			uint32				fFreeCount;
208 			uint32				fNodeSize;
209 			uint32				fMaxLevels;
210 			uint32				fFoundErrors;
211 			BitmapArray			fVisited;
212 			BitmapArray			fVisitedFragment;
213 			off_t*				fPreviousOffsets;
214 };
215 
216 
217 // #pragma mark -
218 
219 
220 // Node Caching for the BPlusTree class
221 //
222 // With write support, there is the need for a function that allocates new
223 // nodes by either returning empty nodes, or by growing the file's data stream
224 //
225 // !! The CachedNode class assumes that you have properly locked the stream
226 // !! before asking for nodes.
227 //
228 // Note: This code will fail if the block size is smaller than the node size!
229 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
230 // is hard-coded to 1024 bytes, that's not an issue now.
231 
232 void
233 CachedNode::UnsetUnchanged(Transaction& transaction)
234 {
235 	if (fTree == NULL || fTree->fStream == NULL)
236 		return;
237 
238 	if (fNode != NULL) {
239 		void* cache = fTree->fStream->GetVolume()->BlockCache();
240 
241 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
242 		block_cache_put(cache, fBlockNumber);
243 		fNode = NULL;
244 	}
245 }
246 
247 
248 void
249 CachedNode::Unset()
250 {
251 	if (fTree == NULL || fTree->fStream == NULL)
252 		return;
253 
254 	if (fNode != NULL) {
255 		if (fWritable && fOffset == 0) {
256 			// The B+tree header has been updated - we need to update the
257 			// BPlusTrees copy of it, as well.
258 			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
259 		}
260 
261 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
262 			fBlockNumber);
263 		fNode = NULL;
264 	}
265 }
266 
267 
268 const bplustree_node*
269 CachedNode::SetTo(off_t offset, bool check)
270 {
271 	const bplustree_node* node;
272 	if (SetTo(offset, &node, check) == B_OK)
273 		return node;
274 
275 	return NULL;
276 }
277 
278 
279 status_t
280 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
281 {
282 	if (fTree == NULL || fTree->fStream == NULL)
283 		RETURN_ERROR(B_BAD_VALUE);
284 
285 	Unset();
286 
287 	// You can only ask for nodes at valid positions - you can't
288 	// even access the b+tree header with this method (use SetToHeader()
289 	// instead)
290 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
291 		|| offset <= 0
292 		|| (offset % fTree->fNodeSize) != 0)
293 		RETURN_ERROR(B_BAD_VALUE);
294 
295 	if (InternalSetTo(NULL, offset) != NULL && check) {
296 		// sanity checks (links, all_key_count)
297 		if (!fTree->fHeader.CheckNode(fNode)) {
298 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
299 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
300 				fBlockNumber, fTree->fStream->ID()));
301 			return B_BAD_DATA;
302 		}
303 	}
304 
305 	*_node = fNode;
306 	return B_OK;
307 }
308 
309 
310 bplustree_node*
311 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
312 {
313 	if (fTree == NULL || fTree->fStream == NULL) {
314 		REPORT_ERROR(B_BAD_VALUE);
315 		return NULL;
316 	}
317 
318 	Unset();
319 
320 	// You can only ask for nodes at valid positions - you can't
321 	// even access the b+tree header with this method (use SetToHeader()
322 	// instead)
323 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
324 		|| offset <= 0
325 		|| (offset % fTree->fNodeSize) != 0)
326 		return NULL;
327 
328 	if (InternalSetTo(&transaction, offset) != NULL && check) {
329 		// sanity checks (links, all_key_count)
330 		if (!fTree->fHeader.CheckNode(fNode)) {
331 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
332 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
333 				fBlockNumber, fTree->fStream->ID()));
334 			return NULL;
335 		}
336 	}
337 	return fNode;
338 }
339 
340 
341 bplustree_node*
342 CachedNode::MakeWritable(Transaction& transaction)
343 {
344 	if (fNode == NULL)
345 		return NULL;
346 
347 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
348 			fBlockNumber, transaction.ID()) == B_OK)
349 		return fNode;
350 
351 	return NULL;
352 }
353 
354 
355 const bplustree_header*
356 CachedNode::SetToHeader()
357 {
358 	if (fTree == NULL || fTree->fStream == NULL) {
359 		REPORT_ERROR(B_BAD_VALUE);
360 		return NULL;
361 	}
362 
363 	Unset();
364 
365 	InternalSetTo(NULL, 0LL);
366 	return (bplustree_header*)fNode;
367 }
368 
369 
370 bplustree_header*
371 CachedNode::SetToWritableHeader(Transaction& transaction)
372 {
373 	if (fTree == NULL || fTree->fStream == NULL) {
374 		REPORT_ERROR(B_BAD_VALUE);
375 		return NULL;
376 	}
377 
378 	Unset();
379 
380 	InternalSetTo(&transaction, 0LL);
381 
382 	if (fNode != NULL && !fTree->fInTransaction) {
383 		transaction.AddListener(fTree);
384 		fTree->fInTransaction = true;
385 
386 		if (!transaction.GetVolume()->IsInitializing()) {
387 			acquire_vnode(transaction.GetVolume()->FSVolume(),
388 				fTree->fStream->ID());
389 		}
390 	}
391 
392 	return (bplustree_header*)fNode;
393 }
394 
395 
396 bplustree_node*
397 CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
398 {
399 	fNode = NULL;
400 	fOffset = offset;
401 
402 	off_t fileOffset;
403 	block_run run;
404 	if (offset < fTree->fStream->Size()
405 		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
406 		Volume* volume = fTree->fStream->GetVolume();
407 
408 		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
409 		fBlockNumber = volume->ToBlock(run) + blockOffset;
410 		uint8* block;
411 
412 		if (transaction != NULL) {
413 			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
414 				fBlockNumber, transaction->ID());
415 			fWritable = true;
416 		} else {
417 			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
418 			fWritable = false;
419 		}
420 
421 		if (block != NULL) {
422 			// The node is somewhere in that block...
423 			// (confusing offset calculation)
424 			fNode = (bplustree_node*)(block + offset
425 				- (fileOffset + (blockOffset << volume->BlockShift())));
426 		} else
427 			REPORT_ERROR(B_IO_ERROR);
428 	}
429 	return fNode;
430 }
431 
432 
433 status_t
434 CachedNode::Free(Transaction& transaction, off_t offset)
435 {
436 	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
437 		RETURN_ERROR(B_BAD_VALUE);
438 
439 	// TODO: scan the free nodes list and remove all nodes at the end
440 	// of the tree - perhaps that shouldn't be done everytime that
441 	// function is called, perhaps it should be done when the directory
442 	// inode is closed or based on some calculation or whatever...
443 
444 	CachedNode cached(fTree);
445 	bplustree_header* header = cached.SetToWritableHeader(transaction);
446 	if (header == NULL)
447 		return B_IO_ERROR;
448 
449 #if 0
450 	// TODO: temporarily disabled because CheckNode() doesn't like this...
451 	// 		Also, it's such an edge case that it's almost useless, anyway.
452 	// if the node is the last one in the tree, we shrink
453 	// the tree and file size by one node
454 	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
455 	if (offset == lastOffset) {
456 		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
457 		if (status != B_OK)
458 			return status;
459 
460 		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
461 		return B_OK;
462 	}
463 #endif
464 
465 	// add the node to the free nodes list
466 	fNode->left_link = header->free_node_pointer;
467 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
468 
469 	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
470 	return B_OK;
471 }
472 
473 
474 status_t
475 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
476 	off_t* _offset)
477 {
478 	if (fTree == NULL || fTree->fStream == NULL)
479 		RETURN_ERROR(B_BAD_VALUE);
480 
481 	Unset();
482 
483 	bplustree_header* header;
484 	status_t status;
485 
486 	// if there are any free nodes, recycle them
487 	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
488 		CachedNode cached(fTree);
489 		header = cached.SetToWritableHeader(transaction);
490 		if (header == NULL)
491 			return B_IO_ERROR;
492 
493 		*_offset = header->FreeNode();
494 		*_node = fNode;
495 
496 		// set new free node pointer
497 		header->free_node_pointer = fNode->left_link;
498 		fNode->Initialize();
499 		return B_OK;
500 	}
501 
502 	// allocate space for a new node
503 	Inode* stream = fTree->fStream;
504 	if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
505 		return status;
506 
507 	CachedNode cached(fTree);
508 	header = cached.SetToWritableHeader(transaction);
509 	if (header == NULL)
510 		return B_IO_ERROR;
511 
512 	// the maximum_size has to be changed before the call to SetTo() - or
513 	// else it will fail because the requested node is out of bounds
514 	off_t offset = header->MaximumSize();
515 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
516 		+ fTree->fNodeSize);
517 
518 	cached.Unset();
519 		// SetToWritable() below needs the new values in the tree's header
520 
521 	if (SetToWritable(transaction, offset, false) == NULL)
522 		RETURN_ERROR(B_ERROR);
523 
524 	fNode->Initialize();
525 
526 	*_offset = offset;
527 	*_node = fNode;
528 	return B_OK;
529 }
530 
531 
532 //	#pragma mark -
533 
534 
535 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
536 	:
537 	fStream(NULL),
538 	fInTransaction(false)
539 {
540 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
541 	SetTo(transaction, stream);
542 }
543 
544 
545 BPlusTree::BPlusTree(Inode* stream)
546 	:
547 	fStream(NULL),
548 	fInTransaction(false)
549 {
550 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
551 	SetTo(stream);
552 }
553 
554 
555 BPlusTree::BPlusTree()
556 	:
557 	fStream(NULL),
558 	fNodeSize(BPLUSTREE_NODE_SIZE),
559 	fAllowDuplicates(true),
560 	fInTransaction(false),
561 	fStatus(B_NO_INIT)
562 {
563 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
564 }
565 
566 
567 BPlusTree::~BPlusTree()
568 {
569 	// if there are any TreeIterators left, we need to stop them
570 	// (can happen when the tree's inode gets deleted while
571 	// traversing the tree - a TreeIterator doesn't lock the inode)
572 	mutex_lock(&fIteratorLock);
573 
574 	SinglyLinkedList<TreeIterator>::Iterator iterator
575 		= fIterators.GetIterator();
576 	while (iterator.HasNext())
577 		iterator.Next()->Stop();
578 
579 	mutex_destroy(&fIteratorLock);
580 
581 	ASSERT(!fInTransaction);
582 }
583 
584 
585 /*! Create a new B+Tree on the specified stream */
586 status_t
587 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
588 {
589 	// initializes in-memory B+Tree
590 
591 	fStream = stream;
592 
593 	CachedNode cached(this);
594 	bplustree_header* header = cached.SetToWritableHeader(transaction);
595 	if (header == NULL) {
596 		// allocate space for new header + node!
597 		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
598 		if (fStatus != B_OK)
599 			RETURN_ERROR(fStatus);
600 
601 		header = cached.SetToWritableHeader(transaction);
602 		if (header == NULL)
603 			RETURN_ERROR(fStatus = B_ERROR);
604 	}
605 
606 	fAllowDuplicates = stream->IsIndex()
607 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
608 
609 	fNodeSize = nodeSize;
610 
611 	// initialize b+tree header
612  	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
613  	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
614  	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
615  	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
616  	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
617  	header->free_node_pointer
618  		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
619  	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
620 
621 	cached.Unset();
622 
623 	// initialize b+tree root node
624 	cached.SetToWritable(transaction, fHeader.RootNode(), false);
625 	if (cached.Node() == NULL)
626 		RETURN_ERROR(B_IO_ERROR);
627 
628 	cached.Node()->Initialize();
629 
630 	return fStatus = B_OK;
631 }
632 
633 
634 status_t
635 BPlusTree::SetTo(Inode* stream)
636 {
637 	if (stream == NULL)
638 		RETURN_ERROR(fStatus = B_BAD_VALUE);
639 
640 	fStream = stream;
641 
642 	// get on-disk B+Tree header
643 
644 	CachedNode cached(this);
645 	const bplustree_header* header = cached.SetToHeader();
646 	if (header != NULL)
647 		memcpy(&fHeader, header, sizeof(bplustree_header));
648 	else
649 		RETURN_ERROR(fStatus = B_IO_ERROR);
650 
651 	// is header valid?
652 
653 	if (fHeader.MaximumSize() != stream->Size()) {
654 		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
655 			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
656 		// we can't change the header since we don't have a transaction
657 		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
658 	}
659 	if (!fHeader.IsValid()) {
660 #ifdef DEBUG
661 		dump_bplustree_header(&fHeader);
662 		dump_block((const char*)&fHeader, 128);
663 #endif
664 		RETURN_ERROR(fStatus = B_BAD_DATA);
665 	}
666 
667 	fNodeSize = fHeader.NodeSize();
668 
669 	// validity check
670 	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
671 		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
672 		S_DOUBLE_INDEX};
673 	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
674 		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
675 		| S_FLOAT_INDEX | S_DOUBLE_INDEX);
676 
677 	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
678 		|| ((stream->Mode() & S_INDEX_DIR) != 0
679 			&& kToMode[fHeader.DataType()] != mode)
680 		|| !stream->IsContainer()) {
681 		D(	dump_bplustree_header(&fHeader);
682 			dump_inode(&stream->Node());
683 		);
684 		RETURN_ERROR(fStatus = B_BAD_TYPE);
685 	}
686 
687 	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
688 	// in the original BFS code - we will honour it nevertheless
689 	fAllowDuplicates = stream->IsIndex()
690 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
691 
692 	cached.SetTo(fHeader.RootNode());
693 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
694 }
695 
696 
697 status_t
698 BPlusTree::InitCheck()
699 {
700 	return fStatus;
701 }
702 
703 
704 status_t
705 BPlusTree::Validate(bool repair, bool& _errorsFound)
706 {
707 	TreeCheck check(this);
708 	if (check.InitCheck() != B_OK)
709 		return B_NO_MEMORY;
710 
711 	check.SetVisited(0);
712 
713 	// Walk the free nodes
714 
715 	CachedNode cached(this);
716 	off_t freeOffset = fHeader.FreeNode();
717 	while (freeOffset > 0) {
718 		const bplustree_node* node;
719 		status_t status = cached.SetTo(freeOffset, &node, false);
720 		if (status != B_OK) {
721 			if (status == B_IO_ERROR)
722 				return B_IO_ERROR;
723 
724 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
725 				"not be read: %s\n", fStream->ID(), freeOffset,
726 				strerror(status));
727 			check.FoundError();
728 			break;
729 		}
730 
731 		if (check.Visited(freeOffset)) {
732 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
733 				" circular!\n", fStream->ID(), freeOffset);
734 			// TODO: if 'repair' is true, we could collect all unvisited nodes
735 			// at the end, and put the into the free list
736 			check.FoundError();
737 			break;
738 		}
739 
740 		check.SetVisited(freeOffset);
741 
742 		if (node->OverflowLink() != BPLUSTREE_FREE) {
743 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
744 				" misses free mark!\n", fStream->ID(), freeOffset);
745 		}
746 		freeOffset = node->LeftLink();
747 	}
748 
749 	// Iterate over the complete tree recursively
750 
751 	const bplustree_node* root;
752 	status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
753 	if (status != B_OK)
754 		return status;
755 
756 	status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
757 
758 	if (check.ErrorsFound())
759 		_errorsFound = true;
760 
761 	if (status != B_OK)
762 		return status;
763 
764 	if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
765 		dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
766 			"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
767 			fHeader.MaxNumberOfLevels());
768 	}
769 
770 	if (check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
771 		dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
772 			" nodes.\n", fStream->ID(), check.VisitedCount(),
773 			fHeader.MaximumSize() / fNodeSize);
774 	}
775 
776 	return B_OK;
777 }
778 
779 
780 status_t
781 BPlusTree::MakeEmpty()
782 {
783 	// Put all nodes into the free list in order
784 	Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
785 
786 	// Reset the header, and root node
787 	CachedNode cached(this);
788 	bplustree_header* header = cached.SetToWritableHeader(transaction);
789 	if (header == NULL)
790 		return B_IO_ERROR;
791 
792 	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
793 	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
794 	if (fStream->Size() > NodeSize() * 2)
795 		header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
796 	else {
797 		header->free_node_pointer
798 			= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
799 	}
800 
801 	bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
802 	if (node == NULL)
803 		return B_IO_ERROR;
804 
805 	node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
806 	node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
807 	node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
808 	node->all_key_count = 0;
809 	node->all_key_length = 0;
810 
811 	for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
812 			offset += NodeSize()) {
813 		bplustree_node* node = cached.SetToWritable(transaction, offset, false);
814 		if (node == NULL) {
815 			dprintf("--> could not open %lld\n", offset);
816 			return B_IO_ERROR;
817 		}
818 		if (offset < fStream->Size() - NodeSize())
819 			node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
820 		else
821 			node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
822 
823 		node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
824 
825 		// It's not important to write it out in a single transaction; just
826 		// make sure it doesn't get too large
827 		if (offset % (1024 * 1024) == 0) {
828 			transaction.Done();
829 			transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
830 		}
831 	}
832 
833 	return transaction.Done();
834 }
835 
836 
837 int32
838 BPlusTree::TypeCodeToKeyType(type_code code)
839 {
840 	switch (code) {
841 		case B_STRING_TYPE:
842 			return BPLUSTREE_STRING_TYPE;
843 		case B_SSIZE_T_TYPE:
844 		case B_INT32_TYPE:
845 			return BPLUSTREE_INT32_TYPE;
846 		case B_SIZE_T_TYPE:
847 		case B_UINT32_TYPE:
848 			return BPLUSTREE_UINT32_TYPE;
849 		case B_OFF_T_TYPE:
850 		case B_INT64_TYPE:
851 			return BPLUSTREE_INT64_TYPE;
852 		case B_UINT64_TYPE:
853 			return BPLUSTREE_UINT64_TYPE;
854 		case B_FLOAT_TYPE:
855 			return BPLUSTREE_FLOAT_TYPE;
856 		case B_DOUBLE_TYPE:
857 			return BPLUSTREE_DOUBLE_TYPE;
858 	}
859 	return -1;
860 }
861 
862 
863 int32
864 BPlusTree::ModeToKeyType(mode_t mode)
865 {
866 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
867 			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
868 		case S_INT_INDEX:
869 			return BPLUSTREE_INT32_TYPE;
870 		case S_UINT_INDEX:
871 			return BPLUSTREE_UINT32_TYPE;
872 		case S_LONG_LONG_INDEX:
873 			return BPLUSTREE_INT64_TYPE;
874 		case S_ULONG_LONG_INDEX:
875 			return BPLUSTREE_UINT64_TYPE;
876 		case S_FLOAT_INDEX:
877 			return BPLUSTREE_FLOAT_TYPE;
878 		case S_DOUBLE_INDEX:
879 			return BPLUSTREE_DOUBLE_TYPE;
880 		case S_STR_INDEX:
881 		default:
882 			// default is for standard directories
883 			return BPLUSTREE_STRING_TYPE;
884 	}
885 }
886 
887 
888 //	#pragma mark - TransactionListener implementation
889 
890 
891 void
892 BPlusTree::TransactionDone(bool success)
893 {
894 	if (!success) {
895 		// update header from disk
896 		CachedNode cached(this);
897 		const bplustree_header* header = cached.SetToHeader();
898 		if (header != NULL)
899 			memcpy(&fHeader, header, sizeof(bplustree_header));
900 	}
901 }
902 
903 
904 void
905 BPlusTree::RemovedFromTransaction()
906 {
907 	fInTransaction = false;
908 
909 	if (!fStream->GetVolume()->IsInitializing())
910 		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
911 }
912 
913 
914 //	#pragma mark -
915 
916 
917 void
918 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
919 	uint16 splitAt, int8 change)
920 {
921 	// Although every iterator which is affected by this update currently
922 	// waits on a semaphore, other iterators could be added/removed at
923 	// any time, so we need to protect this loop
924 	MutexLocker _(fIteratorLock);
925 
926 	SinglyLinkedList<TreeIterator>::Iterator iterator
927 		= fIterators.GetIterator();
928 	while (iterator.HasNext())
929 		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
930 }
931 
932 
933 void
934 BPlusTree::_AddIterator(TreeIterator* iterator)
935 {
936 	MutexLocker _(fIteratorLock);
937 	fIterators.Add(iterator);
938 }
939 
940 
941 void
942 BPlusTree::_RemoveIterator(TreeIterator* iterator)
943 {
944 	MutexLocker _(fIteratorLock);
945 	fIterators.Remove(iterator);
946 }
947 
948 
949 int32
950 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
951 	int keyLength2)
952 {
953 	type_code type = 0;
954 	switch (fHeader.DataType()) {
955 	    case BPLUSTREE_STRING_TYPE:
956 	    	type = B_STRING_TYPE;
957 	    	break;
958 		case BPLUSTREE_INT32_TYPE:
959 	    	type = B_INT32_TYPE;
960 	    	break;
961 		case BPLUSTREE_UINT32_TYPE:
962 	    	type = B_UINT32_TYPE;
963 	    	break;
964 		case BPLUSTREE_INT64_TYPE:
965 	    	type = B_INT64_TYPE;
966 	    	break;
967 		case BPLUSTREE_UINT64_TYPE:
968 	    	type = B_UINT64_TYPE;
969 	    	break;
970 		case BPLUSTREE_FLOAT_TYPE:
971 	    	type = B_FLOAT_TYPE;
972 	    	break;
973 		case BPLUSTREE_DOUBLE_TYPE:
974 	    	type = B_DOUBLE_TYPE;
975 	    	break;
976 	}
977    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
978 }
979 
980 
981 status_t
982 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
983 	uint16 keyLength, uint16* _index, off_t* _next)
984 {
985 #ifdef DEBUG
986 	NodeChecker checker(node, fNodeSize, "find");
987 #endif
988 
989 	if (node->all_key_count == 0) {
990 		if (_index)
991 			*_index = 0;
992 		if (_next)
993 			*_next = node->OverflowLink();
994 		return B_ENTRY_NOT_FOUND;
995 	}
996 
997 	off_t* values = node->Values();
998 	int16 saveIndex = -1;
999 
1000 	// binary search in the key array
1001 	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
1002 		uint16 i = (first + last) >> 1;
1003 
1004 		uint16 searchLength;
1005 		uint8* searchKey = node->KeyAt(i, &searchLength);
1006 		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
1007 				> (uint8*)node + fNodeSize
1008 			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
1009 			fStream->GetVolume()->Panic();
1010 			RETURN_ERROR(B_BAD_DATA);
1011 		}
1012 
1013 		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
1014 		if (cmp < 0) {
1015 			last = i - 1;
1016 			saveIndex = i;
1017 		} else if (cmp > 0) {
1018 			saveIndex = first = i + 1;
1019 		} else {
1020 			if (_index)
1021 				*_index = i;
1022 			if (_next)
1023 				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1024 			return B_OK;
1025 		}
1026 	}
1027 
1028 	if (_index)
1029 		*_index = saveIndex;
1030 	if (_next) {
1031 		if (saveIndex == node->NumKeys())
1032 			*_next = node->OverflowLink();
1033 		else
1034 			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
1035 	}
1036 	return B_ENTRY_NOT_FOUND;
1037 }
1038 
1039 
1040 /*!	Prepares the stack to contain all nodes that were passed while
1041 	following the key, from the root node to the leaf node that could
1042 	or should contain that key.
1043 */
1044 status_t
1045 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
1046 	uint16 keyLength)
1047 {
1048 	// set the root node to begin with
1049 	node_and_key nodeAndKey;
1050 	nodeAndKey.nodeOffset = fHeader.RootNode();
1051 
1052 	CachedNode cached(this);
1053 	const bplustree_node* node;
1054 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1055 		// if we are already on leaf level, we're done
1056 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1057 			// node that the keyIndex is not properly set here (but it's not
1058 			// needed in the calling functions anyway)!
1059 			nodeAndKey.keyIndex = 0;
1060 			stack.Push(nodeAndKey);
1061 			return B_OK;
1062 		}
1063 
1064 		off_t nextOffset;
1065 		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
1066 			&nextOffset);
1067 
1068 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
1069 			RETURN_ERROR(B_ERROR);
1070 
1071 		if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
1072 			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
1073 			break;
1074 		}
1075 
1076 		// put the node offset & the correct keyIndex on the stack
1077 		stack.Push(nodeAndKey);
1078 
1079 		nodeAndKey.nodeOffset = nextOffset;
1080 	}
1081 
1082 	FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
1083 		B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
1084 	return B_ERROR;
1085 }
1086 
1087 
1088 /*!	This will find a free duplicate fragment in the given bplustree_node.
1089 	The CachedNode will be set to the writable fragment on success.
1090 */
1091 status_t
1092 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
1093 	const bplustree_node* node, CachedNode& cached,
1094 	off_t* _offset, bplustree_node** _fragment, uint32* _index)
1095 {
1096 	off_t* values = node->Values();
1097 	for (int32 i = 0; i < node->NumKeys(); i++) {
1098 		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1099 
1100 		// does the value link to a duplicate fragment?
1101 		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
1102 			continue;
1103 
1104 		const bplustree_node* fragment = cached.SetTo(
1105 			bplustree_node::FragmentOffset(value), false);
1106 		if (fragment == NULL) {
1107 			FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
1108 				B_PRIdOFF "\n", value, fStream->ID()));
1109 			continue;
1110 		}
1111 
1112 		// see if there is some space left for us
1113 		uint32 num = bplustree_node::MaxFragments(fNodeSize);
1114 		for (uint32 j = 0; j < num; j++) {
1115 			duplicate_array* array = fragment->FragmentAt(j);
1116 
1117 			if (array->IsEmpty()) {
1118 				// found an unused fragment
1119 				*_fragment = cached.MakeWritable(transaction);
1120 				if (*_fragment == NULL)
1121 					return B_IO_ERROR;
1122 
1123 				*_offset = bplustree_node::FragmentOffset(value);
1124 				*_index = j;
1125 				return B_OK;
1126 			}
1127 		}
1128 	}
1129 	return B_ENTRY_NOT_FOUND;
1130 }
1131 
1132 
1133 status_t
1134 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
1135 	const bplustree_node* node, uint16 index, off_t value)
1136 {
1137 	CachedNode cachedDuplicate(this);
1138 	off_t* values = node->Values();
1139 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1140 	status_t status;
1141 	off_t offset;
1142 
1143 	if (bplustree_node::IsDuplicate(oldValue)) {
1144 		// If it's a duplicate fragment, try to insert it into that, or if it
1145 		// doesn't fit anymore, create a new duplicate node
1146 
1147 		if (bplustree_node::LinkType(oldValue)
1148 				== BPLUSTREE_DUPLICATE_FRAGMENT) {
1149 			bplustree_node* duplicate = cachedDuplicate.SetToWritable(
1150 				transaction, bplustree_node::FragmentOffset(oldValue), false);
1151 			if (duplicate == NULL)
1152 				return B_IO_ERROR;
1153 
1154 			duplicate_array* array = duplicate->FragmentAt(
1155 				bplustree_node::FragmentIndex(oldValue));
1156 			int32 arrayCount = array->Count();
1157 			if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
1158 				FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
1159 					"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1160 					(int)bplustree_node::FragmentIndex(oldValue),
1161 					bplustree_node::FragmentOffset(oldValue), arrayCount,
1162 					fStream->ID()));
1163 				return B_BAD_DATA;
1164 			}
1165 
1166 			if (arrayCount < NUM_FRAGMENT_VALUES) {
1167 				array->Insert(value);
1168 			} else {
1169 				// Test if the fragment will be empty if we remove this key's
1170 				// values
1171 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
1172 					// The node will be empty without our values, so let us
1173 					// reuse it as a duplicate node
1174 					offset = bplustree_node::FragmentOffset(oldValue);
1175 
1176 					memmove(duplicate->DuplicateArray(), array,
1177 						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1178 					duplicate->left_link = duplicate->right_link
1179 						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1180 
1181 					array = duplicate->DuplicateArray();
1182 					array->Insert(value);
1183 				} else {
1184 					// Create a new duplicate node
1185 
1186 					cachedDuplicate.UnsetUnchanged(transaction);
1187 						// The old duplicate has not been touched, so we can
1188 						// reuse it
1189 
1190 					bplustree_node* newDuplicate;
1191 					status = cachedDuplicate.Allocate(transaction,
1192 						&newDuplicate, &offset);
1193 					if (status != B_OK)
1194 						RETURN_ERROR(status);
1195 
1196 					// Copy the array from the fragment node to the duplicate
1197 					// node and free the old entry (by zero'ing all values)
1198 					newDuplicate->overflow_link = array->count;
1199 					memcpy(&newDuplicate->all_key_count, &array->values[0],
1200 						array->Count() * sizeof(off_t));
1201 					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1202 
1203 					array = newDuplicate->DuplicateArray();
1204 					array->Insert(value);
1205 				}
1206 
1207 				// Update the main pointer to link to a duplicate node
1208 				if (cached.MakeWritable(transaction) == NULL)
1209 					return B_IO_ERROR;
1210 
1211 				values[index]
1212 					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1213 						BPLUSTREE_DUPLICATE_NODE, offset));
1214 			}
1215 
1216 			return B_OK;
1217 		}
1218 
1219 		// Put the value into a dedicated duplicate node
1220 
1221 		// search for free space in the duplicate nodes of that key
1222 		duplicate_array* array;
1223 		int32 arrayCount;
1224 		const bplustree_node* duplicate;
1225 		off_t duplicateOffset;
1226 		do {
1227 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1228 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1229 			if (duplicate == NULL)
1230 				return B_IO_ERROR;
1231 
1232 			array = duplicate->DuplicateArray();
1233 			arrayCount =array->Count();
1234 			if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1235 				FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
1236 					B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1237 					duplicateOffset, arrayCount, fStream->ID()));
1238 				return B_BAD_DATA;
1239 			}
1240 		} while (arrayCount >= NUM_DUPLICATE_VALUES
1241 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
1242 
1243 		bplustree_node* writableDuplicate
1244 			= cachedDuplicate.MakeWritable(transaction);
1245 		if (writableDuplicate == NULL)
1246 			return B_IO_ERROR;
1247 
1248 		if (arrayCount < NUM_DUPLICATE_VALUES) {
1249 			array = writableDuplicate->DuplicateArray();
1250 			array->Insert(value);
1251 		} else {
1252 			// no space left - add a new duplicate node
1253 
1254 			bplustree_node* newDuplicate;
1255 			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
1256 				&offset);
1257 			if (status != B_OK)
1258 				RETURN_ERROR(status);
1259 
1260 			// link the two nodes together
1261 			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
1262 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
1263 
1264 			array = newDuplicate->DuplicateArray();
1265 			array->count = 0;
1266 			array->Insert(value);
1267 		}
1268 		return B_OK;
1269 	}
1270 
1271 	// Search for a free duplicate fragment or create a new one
1272 	// to insert the duplicate value into
1273 
1274 	uint32 fragmentIndex = 0;
1275 	bplustree_node* fragment;
1276 	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
1277 			&offset, &fragment, &fragmentIndex) != B_OK) {
1278 		// allocate a new duplicate fragment node
1279 		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
1280 		if (status != B_OK)
1281 			RETURN_ERROR(status);
1282 
1283 		memset(fragment, 0, fNodeSize);
1284 	}
1285 
1286 	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
1287 	array->Insert(oldValue);
1288 	array->Insert(value);
1289 
1290 	if (cached.MakeWritable(transaction) == NULL)
1291 		return B_IO_ERROR;
1292 
1293 	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1294 		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
1295 
1296 	return B_OK;
1297 }
1298 
1299 
1300 void
1301 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
1302 	uint16 keyLength, off_t value)
1303 {
1304 	// should never happen, but who knows?
1305 	if (index > node->NumKeys())
1306 		return;
1307 
1308 	off_t* values = node->Values();
1309 	uint16* keyLengths = node->KeyLengths();
1310 	uint8* keys = node->Keys();
1311 
1312 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
1313 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
1314 		+ keyLength);
1315 
1316 	off_t* newValues = node->Values();
1317 	uint16* newKeyLengths = node->KeyLengths();
1318 
1319 	// move values and copy new value into them
1320 	memmove(newValues + index + 1, values + index,
1321 		sizeof(off_t) * (node->NumKeys() - 1 - index));
1322 	memmove(newValues, values, sizeof(off_t) * index);
1323 
1324 	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
1325 
1326 	// move and update key length index
1327 	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
1328 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1329 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
1330 	}
1331 	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
1332 
1333 	int32 keyStart;
1334 	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
1335 		+ (keyStart = index > 0
1336 			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
1337 
1338 	// move keys and copy new key into them
1339 	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
1340 	int32 size = node->AllKeyLength() - length;
1341 	if (size > 0)
1342 		memmove(keys + length, keys + length - keyLength, size);
1343 
1344 	memcpy(keys + keyStart, key, keyLength);
1345 }
1346 
1347 
1348 /*!	Splits the \a node into two halves - the other half will be put into
1349 	\a other. It also takes care to create a new overflow link if the node
1350 	to split is an index node.
1351 */
1352 status_t
1353 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
1354 	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
1355 	uint16* _keyLength, off_t* _value)
1356 {
1357 	if (*_keyIndex > node->NumKeys() + 1)
1358 		return B_BAD_VALUE;
1359 
1360 	uint16* inKeyLengths = node->KeyLengths();
1361 	off_t* inKeyValues = node->Values();
1362 	uint8* inKeys = node->Keys();
1363 	uint8* outKeys = other->Keys();
1364 	int32 keyIndex = *_keyIndex;	// can become less than zero!
1365 
1366 	if (keyIndex > node->NumKeys()) {
1367 		FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
1368 			"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
1369 		return B_BAD_VALUE;
1370 	}
1371 
1372 	// how many keys will fit in one (half) page?
1373 	// that loop will find the answer to this question and
1374 	// change the key lengths indices for their new home
1375 
1376 	// "bytes" is the number of bytes written for the new key,
1377 	// "bytesBefore" are the bytes before that key
1378 	// "bytesAfter" are the bytes after the new key, if any
1379 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1380 
1381 	size_t size = fNodeSize >> 1;
1382 	int32 out, in;
1383 	for (in = out = 0; in < node->NumKeys() + 1;) {
1384 		if (!bytes) {
1385 			bytesBefore = in > 0
1386 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1387 		}
1388 
1389 		if (in == keyIndex && !bytes) {
1390 			bytes = *_keyLength;
1391 		} else {
1392 			if (keyIndex < out) {
1393 				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1394 					- bytesBefore;
1395 			}
1396 
1397 			in++;
1398 		}
1399 		out++;
1400 
1401 		if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
1402 				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1403 			// we have found the number of keys in the new node!
1404 			break;
1405 		}
1406 	}
1407 
1408 	// if the new key was not inserted, set the length of the keys
1409 	// that can be copied directly
1410 	if (keyIndex >= out && in > 0)
1411 		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1412 
1413 	if (bytesBefore < 0 || bytesAfter < 0)
1414 		return B_BAD_DATA;
1415 
1416 	other->left_link = node->left_link;
1417 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1418 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1419 		+ bytesAfter);
1420 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1421 
1422 	uint16* outKeyLengths = other->KeyLengths();
1423 	off_t* outKeyValues = other->Values();
1424 	int32 keys = out > keyIndex ? keyIndex : out;
1425 
1426 	if (bytesBefore) {
1427 		// copy the keys
1428 		memcpy(outKeys, inKeys, bytesBefore);
1429 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1430 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1431 	}
1432 	if (bytes) {
1433 		// copy the newly inserted key
1434 		memcpy(outKeys + bytesBefore, key, bytes);
1435 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1436 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1437 
1438 		if (bytesAfter) {
1439 			// copy the keys after the new key
1440 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1441 				bytesAfter);
1442 			keys = out - keyIndex - 1;
1443 			for (int32 i = 0;i < keys;i++) {
1444 				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1445 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
1446 						+ bytes);
1447 			}
1448 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1449 				keys * sizeof(off_t));
1450 		}
1451 	}
1452 
1453 	// if the new key was already inserted, we shouldn't use it again
1454 	if (in != out)
1455 		keyIndex--;
1456 
1457 	int32 total = bytesBefore + bytesAfter;
1458 
1459 	// these variables are for the key that will be returned
1460 	// to the parent node
1461 	uint8* newKey = NULL;
1462 	uint16 newLength;
1463 	bool newAllocated = false;
1464 
1465 	// If we have split an index node, we have to drop the first key
1466 	// of the next node (which can also be the new key to insert).
1467 	// The dropped key is also the one which has to be inserted in
1468 	// the parent node, so we will set the "newKey" already here.
1469 	if (node->OverflowLink() != BPLUSTREE_NULL) {
1470 		if (in == keyIndex) {
1471 			newKey = key;
1472 			newLength = *_keyLength;
1473 
1474 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1475 			keyIndex--;
1476 		} else {
1477 			// If a key is dropped (is not the new key), we have to copy
1478 			// it, because it would be lost if not.
1479 			uint8* droppedKey = node->KeyAt(in, &newLength);
1480 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1481 					> (uint8*)node + fNodeSize
1482 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1483 				fStream->GetVolume()->Panic();
1484 				RETURN_ERROR(B_BAD_DATA);
1485 			}
1486 			newKey = (uint8*)malloc(newLength);
1487 			if (newKey == NULL)
1488 				return B_NO_MEMORY;
1489 
1490 			newAllocated = true;
1491 			memcpy(newKey, droppedKey, newLength);
1492 
1493 			other->overflow_link = inKeyValues[in];
1494 			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1495 		}
1496 	}
1497 
1498 	// and now the same game for the other page and the rest of the keys
1499 	// (but with memmove() instead of memcpy(), because they may overlap)
1500 
1501 	bytesBefore = bytesAfter = bytes = 0;
1502 	out = 0;
1503 	int32 skip = in;
1504 	while (in < node->NumKeys() + 1) {
1505 		if (in == keyIndex && !bytes) {
1506 			// it's enough to set bytesBefore once here, because we do
1507 			// not need to know the exact length of all keys in this
1508 			// loop
1509 			bytesBefore = in > skip
1510 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1511 			bytes = *_keyLength;
1512 			out++;
1513 		} else {
1514 			if (in < node->NumKeys()) {
1515 				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1516 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1517 
1518 				if (bytes) {
1519 					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1520 						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1521 
1522 					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1523 						- bytesBefore - bytes;
1524 				}
1525 				out++;
1526 			}
1527 			in++;
1528 		}
1529 	}
1530 
1531 	// adjust the byte counts (since we were a bit lazy in the loop)
1532 	if (keyIndex < skip)
1533 		bytesBefore = node->AllKeyLength() - total;
1534 
1535 	if (bytesBefore < 0 || bytesAfter < 0) {
1536 		if (newAllocated)
1537 			free(newKey);
1538 		return B_BAD_DATA;
1539 	}
1540 
1541 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1542 		// right link, and overflow link can stay the same
1543 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1544 		+ bytesAfter);
1545 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1546 
1547 	// array positions have changed
1548 	outKeyLengths = node->KeyLengths();
1549 	outKeyValues = node->Values();
1550 
1551 	// move the keys in the old node: the order is important here,
1552 	// because we don't want to overwrite any contents
1553 
1554 	keys = keyIndex <= skip ? out : keyIndex - skip;
1555 	keyIndex -= skip;
1556 	in = out - keyIndex - 1;
1557 		// Note: keyIndex and in will contain invalid values when the new key
1558 		// went to the other node. But in this case bytes and bytesAfter are
1559 		// 0 and subsequently we never use keyIndex and in.
1560 
1561 	if (bytesBefore)
1562 		memmove(inKeys, inKeys + total, bytesBefore);
1563 	if (bytesAfter) {
1564 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1565 			bytesAfter);
1566 	}
1567 
1568 	if (bytesBefore)
1569 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1570 	if (bytesAfter) {
1571 		// if byteAfter is > 0, keyIndex is larger than skip
1572 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1573 			in * sizeof(uint16));
1574 	}
1575 
1576 	if (bytesBefore)
1577 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1578 	if (bytesAfter) {
1579 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1580 			in * sizeof(off_t));
1581 	}
1582 
1583 	if (bytes) {
1584 		// finally, copy the newly inserted key (don't overwrite anything)
1585 		memcpy(inKeys + bytesBefore, key, bytes);
1586 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1587 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1588 	}
1589 
1590 	// Prepare the key that will be inserted in the parent node which
1591 	// is either the dropped key or the last of the other node.
1592 	// If it's the dropped key, "newKey" was already set earlier.
1593 
1594 	if (newKey == NULL)
1595 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1596 
1597 	memcpy(key, newKey, newLength);
1598 	*_keyLength = newLength;
1599 	*_value = otherOffset;
1600 
1601 	if (newAllocated)
1602 		free(newKey);
1603 
1604 	return B_OK;
1605 }
1606 
1607 
1608 /*!	This inserts a key into the tree. The changes made to the tree will
1609 	all be part of the \a transaction.
1610 	You need to have the inode write locked.
1611 */
1612 status_t
1613 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
1614 	off_t value)
1615 {
1616 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1617 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1618 		RETURN_ERROR(B_BAD_VALUE);
1619 #ifdef DEBUG
1620 	if (value < 0)
1621 		panic("tried to insert invalid value %Ld!\n", value);
1622 #endif
1623 
1624 	ASSERT_WRITE_LOCKED_INODE(fStream);
1625 
1626 	Stack<node_and_key> stack;
1627 	if (_SeekDown(stack, key, keyLength) != B_OK)
1628 		RETURN_ERROR(B_ERROR);
1629 
1630 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
1631 
1632 	memcpy(keyBuffer, key, keyLength);
1633 	keyBuffer[keyLength] = 0;
1634 
1635 	node_and_key nodeAndKey;
1636 	const bplustree_node* node;
1637 
1638 	CachedNode cached(this);
1639 	while (stack.Pop(&nodeAndKey)
1640 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1641 #ifdef DEBUG
1642 		NodeChecker checker(node, fNodeSize, "insert");
1643 #endif
1644 		if (node->IsLeaf()) {
1645 			// first round, check for duplicate entries
1646 			status_t status = _FindKey(node, key, keyLength,
1647 				&nodeAndKey.keyIndex);
1648 
1649 			// is this a duplicate entry?
1650 			if (status == B_OK) {
1651 				if (fAllowDuplicates) {
1652 					status = _InsertDuplicate(transaction, cached, node,
1653 						nodeAndKey.keyIndex, value);
1654 					if (status != B_OK)
1655 						RETURN_ERROR(status);
1656 					return B_OK;
1657 				}
1658 
1659 				return B_NAME_IN_USE;
1660 			}
1661 		}
1662 
1663 		bplustree_node* writableNode = cached.MakeWritable(transaction);
1664 		if (writableNode == NULL)
1665 			return B_IO_ERROR;
1666 
1667 		// is the node big enough to hold the pair?
1668 		if (int32(key_align(sizeof(bplustree_node)
1669 				+ writableNode->AllKeyLength() + keyLength)
1670 				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
1671 				+ sizeof(off_t))) < fNodeSize) {
1672 			_InsertKey(writableNode, nodeAndKey.keyIndex,
1673 				keyBuffer, keyLength, value);
1674 			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1675 				nodeAndKey.keyIndex, 0, 1);
1676 
1677 			return B_OK;
1678 		} else {
1679 			CachedNode cachedNewRoot(this);
1680 			CachedNode cachedOther(this);
1681 
1682 			// do we need to allocate a new root node? if so, then do
1683 			// it now
1684 			off_t newRoot = BPLUSTREE_NULL;
1685 			if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
1686 				bplustree_node* root;
1687 				status_t status = cachedNewRoot.Allocate(transaction, &root,
1688 					&newRoot);
1689 				if (status != B_OK) {
1690 					// The tree is most likely corrupted!
1691 					// But it's still sane at leaf level - we could set
1692 					// a flag in the header that forces the tree to be
1693 					// rebuild next time...
1694 					// But since we will have journaling, that's not a big
1695 					// problem anyway.
1696 					RETURN_ERROR(status);
1697 				}
1698 			}
1699 
1700 			// reserve space for the other node
1701 			bplustree_node* other;
1702 			off_t otherOffset;
1703 			status_t status = cachedOther.Allocate(transaction, &other,
1704 				&otherOffset);
1705 			if (status != B_OK) {
1706 				cachedNewRoot.Free(transaction, newRoot);
1707 				RETURN_ERROR(status);
1708 			}
1709 
1710 			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1711 					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1712 					&value) != B_OK) {
1713 				// free root node & other node here
1714 				cachedOther.Free(transaction, otherOffset);
1715 				cachedNewRoot.Free(transaction, newRoot);
1716 
1717 				RETURN_ERROR(B_ERROR);
1718 			}
1719 #ifdef DEBUG
1720 			checker.Check("insert split");
1721 			NodeChecker otherChecker(other, fNodeSize, "insert split other");
1722 #endif
1723 
1724 			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1725 				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1726 
1727 			// update the right link of the node in the left of the new node
1728 			if ((other = cachedOther.SetToWritable(transaction,
1729 					other->LeftLink())) != NULL) {
1730 				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1731 			}
1732 
1733 			// create a new root if necessary
1734 			if (newRoot != BPLUSTREE_NULL) {
1735 				bplustree_node* root = cachedNewRoot.Node();
1736 
1737 				_InsertKey(root, 0, keyBuffer, keyLength,
1738 					writableNode->LeftLink());
1739 				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1740 					nodeAndKey.nodeOffset);
1741 
1742 				CachedNode cached(this);
1743 				bplustree_header* header
1744 					= cached.SetToWritableHeader(transaction);
1745 				if (header == NULL)
1746 					return B_IO_ERROR;
1747 
1748 				// finally, update header to point to the new root
1749 				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1750 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1751 					header->MaxNumberOfLevels() + 1);
1752 
1753 				return B_OK;
1754 			}
1755 		}
1756 	}
1757 	RETURN_ERROR(B_ERROR);
1758 }
1759 
1760 
1761 /*!	Removes the duplicate index/value pair from the tree.
1762 	It's part of the private tree interface.
1763 */
1764 status_t
1765 BPlusTree::_RemoveDuplicate(Transaction& transaction,
1766 	const bplustree_node* node, CachedNode& cached, uint16 index,
1767 	off_t value)
1768 {
1769 	off_t* values = node->Values();
1770 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1771 
1772 	CachedNode cachedDuplicate(this);
1773 	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1774 	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
1775 		duplicateOffset, false);
1776 	if (duplicate == NULL)
1777 		return B_IO_ERROR;
1778 
1779 	// if it's a duplicate fragment, remove the entry from there
1780 	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1781 		duplicate_array* array = duplicate->FragmentAt(
1782 			bplustree_node::FragmentIndex(oldValue));
1783 		int32 arrayCount = array->Count();
1784 
1785 		if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
1786 			FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
1787 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1788 				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
1789 				arrayCount, fStream->ID()));
1790 			return B_BAD_DATA;
1791 		}
1792 		if (!array->Remove(value)) {
1793 			FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
1794 				"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
1795 				duplicateOffset, fStream->ID()));
1796 			return B_ENTRY_NOT_FOUND;
1797 		}
1798 
1799 		// remove the array from the fragment node if it is empty
1800 		if (--arrayCount == 1) {
1801 			// set the link to the remaining value
1802 			if (cached.MakeWritable(transaction) == NULL)
1803 				return B_IO_ERROR;
1804 
1805 			values[index] = array->values[0];
1806 
1807 			// Remove the whole fragment node, if this was the only array,
1808 			// otherwise free just the array
1809 			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1810 				status_t status = cachedDuplicate.Free(transaction,
1811 					duplicateOffset);
1812 				if (status != B_OK)
1813 					return status;
1814 			} else
1815 				array->count = 0;
1816 		}
1817 		return B_OK;
1818 	}
1819 
1820 	// Remove value from a duplicate node!
1821 
1822 	duplicate_array* array = NULL;
1823 	int32 arrayCount = 0;
1824 
1825 	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1826 		FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
1827 			", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
1828 		return B_BAD_DATA;
1829 	}
1830 
1831 	// Search the duplicate nodes until the entry could be found (and removed)
1832 	while (duplicate != NULL) {
1833 		array = duplicate->DuplicateArray();
1834 		arrayCount = array->Count();
1835 
1836 		if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1837 			FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
1838 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1839 				duplicateOffset, arrayCount, fStream->ID()));
1840 			return B_BAD_DATA;
1841 		}
1842 
1843 		if (array->Remove(value)) {
1844 			arrayCount--;
1845 			break;
1846 		}
1847 
1848 		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1849 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1850 
1851 		cachedDuplicate.UnsetUnchanged(transaction);
1852 		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
1853 			false);
1854 	}
1855 	if (duplicate == NULL)
1856 		RETURN_ERROR(B_IO_ERROR);
1857 
1858 	// The entry got removed from the duplicate node, but we might want to free
1859 	// it now in case it's empty
1860 
1861 	while (true) {
1862 		off_t left = duplicate->LeftLink();
1863 		off_t right = duplicate->RightLink();
1864 		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1865 
1866 		if ((isLast && arrayCount == 1) || arrayCount == 0) {
1867 			// Free empty duplicate page, link their siblings together, and
1868 			// update the duplicate link if needed (ie. when we either remove
1869 			// the last duplicate node or have a new first one)
1870 
1871 			if (left == BPLUSTREE_NULL) {
1872 				// the duplicate link points to us
1873 				if (cached.MakeWritable(transaction) == NULL)
1874 					return B_IO_ERROR;
1875 
1876 				if (arrayCount == 1) {
1877 					// This is the last node, and there is only one value left;
1878 					// replace the duplicate link with that value, it's no
1879 					// duplicate anymore
1880 					values[index] = array->values[0];
1881 				} else {
1882 					// Move the duplicate link to the next node
1883 					values[index] = HOST_ENDIAN_TO_BFS_INT64(
1884 						bplustree_node::MakeLink(
1885 							BPLUSTREE_DUPLICATE_NODE, right));
1886 				}
1887 			}
1888 
1889 			status_t status = cachedDuplicate.Free(transaction,
1890 				duplicateOffset);
1891 			if (status != B_OK)
1892 				return status;
1893 
1894 			if (left != BPLUSTREE_NULL
1895 				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
1896 						false)) != NULL) {
1897 				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1898 
1899 				// If the next node is the last node, we need to free that node
1900 				// and convert the duplicate entry back into a normal entry
1901 				array = duplicate->DuplicateArray();
1902 				arrayCount = array->Count();
1903 				if (right == BPLUSTREE_NULL
1904 					&& duplicate->LeftLink() == BPLUSTREE_NULL
1905 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1906 					duplicateOffset = left;
1907 					continue;
1908 				}
1909 			}
1910 			if (right != BPLUSTREE_NULL
1911 				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
1912 						right, false)) != NULL) {
1913 				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1914 
1915 				// Again, we may need to turn the duplicate entry back into a
1916 				// normal entry
1917 				array = duplicate->DuplicateArray();
1918 				arrayCount = array->Count();
1919 				if (left == BPLUSTREE_NULL
1920 					&& duplicate->RightLink() == BPLUSTREE_NULL
1921 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1922 					duplicateOffset = right;
1923 					continue;
1924 				}
1925 			}
1926 			return B_OK;
1927 		}
1928 		if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
1929 			// If the number of entries fits in a duplicate fragment, then
1930 			// either find a free fragment node, or convert this node to a
1931 			// fragment node.
1932 			CachedNode cachedOther(this);
1933 
1934 			bplustree_node* fragment = NULL;
1935 			uint32 fragmentIndex = 0;
1936 			off_t offset;
1937 			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
1938 					&offset, &fragment, &fragmentIndex) == B_OK) {
1939 				// move to other node
1940 				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
1941 				memcpy(target, array,
1942 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1943 
1944 				cachedDuplicate.Free(transaction, duplicateOffset);
1945 				duplicateOffset = offset;
1946 			} else {
1947 				// convert node
1948 				memmove(duplicate, array,
1949 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1950 				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
1951 					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1952 			}
1953 
1954 			if (cached.MakeWritable(transaction) == NULL)
1955 				return B_IO_ERROR;
1956 
1957 			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1958 				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
1959 		}
1960 		return B_OK;
1961 	}
1962 }
1963 
1964 
1965 /*!	Removes the key with the given index from the specified node.
1966 	Since it has to get the key from the node anyway (to obtain it's
1967 	pointer), it's not needed to pass the key & its length, although
1968 	the calling method (BPlusTree::Remove()) have this data.
1969 */
1970 void
1971 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
1972 {
1973 	// should never happen, but who knows?
1974 	if (index > node->NumKeys() && node->NumKeys() > 0) {
1975 		FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF
1976 			"\n", index, fStream->ID()));
1977 		return;
1978 	}
1979 
1980 	off_t* values = node->Values();
1981 
1982 	// if we would have to drop the overflow link, drop
1983 	// the last key instead and update the overflow link
1984 	// to the value of that one
1985 	if (!node->IsLeaf() && index == node->NumKeys())
1986 		node->overflow_link = values[--index];
1987 
1988 	uint16 length;
1989 	uint8* key = node->KeyAt(index, &length);
1990 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
1991 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1992 		FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length,
1993 			fStream->ID()));
1994 		fStream->GetVolume()->Panic();
1995 		return;
1996 	}
1997 
1998 	uint16* keyLengths = node->KeyLengths();
1999 	uint8* keys = node->Keys();
2000 
2001 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
2002 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
2003 		node->AllKeyLength() - length);
2004 
2005 	off_t* newValues = node->Values();
2006 	uint16* newKeyLengths = node->KeyLengths();
2007 
2008 	// move key data
2009 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
2010 
2011 	// move and update key lengths
2012 	if (index > 0 && newKeyLengths != keyLengths)
2013 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
2014 	for (uint16 i = index; i < node->NumKeys(); i++) {
2015 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
2016 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
2017 	}
2018 
2019 	// move values
2020 	if (index > 0)
2021 		memmove(newValues, values, index * sizeof(off_t));
2022 	if (node->NumKeys() > index) {
2023 		memmove(newValues + index, values + index + 1,
2024 			(node->NumKeys() - index) * sizeof(off_t));
2025 	}
2026 }
2027 
2028 
2029 /*!	Removes the specified key from the tree. The "value" parameter is only used
2030 	for trees which allow duplicates, so you may safely ignore it.
2031 	It's not an optional parameter, so at least you have to think about it.
2032 	You need to have the inode write locked.
2033 */
2034 status_t
2035 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
2036 	off_t value)
2037 {
2038 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2039 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2040 		RETURN_ERROR(B_BAD_VALUE);
2041 
2042 	ASSERT_WRITE_LOCKED_INODE(fStream);
2043 
2044 	Stack<node_and_key> stack;
2045 	if (_SeekDown(stack, key, keyLength) != B_OK)
2046 		RETURN_ERROR(B_ERROR);
2047 
2048 	node_and_key nodeAndKey;
2049 	const bplustree_node* node;
2050 
2051 	CachedNode cached(this);
2052 	while (stack.Pop(&nodeAndKey)
2053 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
2054 #ifdef DEBUG
2055 		NodeChecker checker(node, fNodeSize, "remove");
2056 #endif
2057 		if (node->IsLeaf()) {
2058 			// first round, check for duplicate entries
2059 			status_t status = _FindKey(node, key, keyLength,
2060 				&nodeAndKey.keyIndex);
2061 			if (status != B_OK)
2062 				RETURN_ERROR(status);
2063 
2064 			// Is this a duplicate entry?
2065 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
2066 					node->Values()[nodeAndKey.keyIndex]))) {
2067 				if (fAllowDuplicates) {
2068 					return _RemoveDuplicate(transaction, node, cached,
2069 						nodeAndKey.keyIndex, value);
2070 				}
2071 
2072 				FATAL(("dupliate node found where no duplicates are "
2073 					"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
2074 				RETURN_ERROR(B_ERROR);
2075 			} else {
2076 				if (node->Values()[nodeAndKey.keyIndex] != value)
2077 					return B_ENTRY_NOT_FOUND;
2078 
2079 				// If we will remove the last key, the iterator will be set
2080 				// to the next node after the current - if there aren't any
2081 				// more nodes, we need a way to prevent the TreeIterators to
2082 				// touch the old node again, we use BPLUSTREE_FREE for this
2083 				off_t next = node->RightLink() == BPLUSTREE_NULL
2084 					? BPLUSTREE_FREE : node->RightLink();
2085 				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
2086 					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
2087 			}
2088 		}
2089 
2090 		bplustree_node* writableNode = cached.MakeWritable(transaction);
2091 		if (writableNode == NULL)
2092 			return B_IO_ERROR;
2093 
2094 		// if it's an empty root node, we have to convert it
2095 		// to a leaf node by dropping the overflow link, or,
2096 		// if it's already a leaf node, just empty it
2097 		if (nodeAndKey.nodeOffset == fHeader.RootNode()
2098 			&& (node->NumKeys() == 0
2099 				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
2100 			writableNode->overflow_link
2101 				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2102 			writableNode->all_key_count = 0;
2103 			writableNode->all_key_length = 0;
2104 
2105 			// if we've made a leaf node out of the root node, we need
2106 			// to reset the maximum number of levels in the header
2107 			if (fHeader.MaxNumberOfLevels() != 1) {
2108 				CachedNode cached(this);
2109 				bplustree_header* header
2110 					= cached.SetToWritableHeader(transaction);
2111 				if (header == NULL)
2112 					return B_IO_ERROR;
2113 
2114 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
2115 			}
2116 			return B_OK;
2117 		}
2118 
2119 		// if there is only one key left, we don't have to remove
2120 		// it, we can just dump the node (index nodes still have
2121 		// the overflow link, so we have to drop the last key)
2122 		if (writableNode->NumKeys() > 1
2123 			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
2124 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
2125 			return B_OK;
2126 		}
2127 
2128 		// when we are here, we can just free the node, but
2129 		// we have to update the right/left link of the
2130 		// siblings first
2131 		CachedNode otherCached(this);
2132 		bplustree_node* other = otherCached.SetToWritable(transaction,
2133 			writableNode->LeftLink());
2134 		if (other != NULL)
2135 			other->right_link = writableNode->right_link;
2136 
2137 		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
2138 				!= NULL) {
2139 			other->left_link = writableNode->left_link;
2140 		}
2141 
2142 		cached.Free(transaction, nodeAndKey.nodeOffset);
2143 	}
2144 	RETURN_ERROR(B_ERROR);
2145 }
2146 
2147 
2148 /*!	Replaces the value for the key in the tree.
2149 	Returns B_OK if the key could be found and its value replaced,
2150 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
2151 	to indicate that something went terribly wrong.
2152 	Note that this doesn't work with duplicates - it will just
2153 	return B_BAD_TYPE if you call this function on a tree where
2154 	duplicates are allowed.
2155 	You need to have the inode write locked.
2156 */
2157 status_t
2158 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
2159 	off_t value)
2160 {
2161 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2162 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2163 		|| key == NULL)
2164 		RETURN_ERROR(B_BAD_VALUE);
2165 
2166 	if (fAllowDuplicates)
2167 		RETURN_ERROR(B_BAD_TYPE);
2168 
2169 	ASSERT_WRITE_LOCKED_INODE(fStream);
2170 
2171 	off_t nodeOffset = fHeader.RootNode();
2172 	CachedNode cached(this);
2173 	const bplustree_node* node;
2174 
2175 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2176 		uint16 keyIndex = 0;
2177 		off_t nextOffset;
2178 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2179 			&nextOffset);
2180 
2181 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2182 			if (status == B_OK) {
2183 				bplustree_node* writableNode = cached.MakeWritable(transaction);
2184 				if (writableNode != NULL) {
2185 					writableNode->Values()[keyIndex]
2186 						= HOST_ENDIAN_TO_BFS_INT64(value);
2187 				} else
2188 					status = B_IO_ERROR;
2189 			}
2190 
2191 			return status;
2192 		} else if (nextOffset == nodeOffset)
2193 			RETURN_ERROR(B_ERROR);
2194 
2195 		nodeOffset = nextOffset;
2196 	}
2197 	RETURN_ERROR(B_ERROR);
2198 }
2199 
2200 
2201 /*!	Searches the key in the tree, and stores the offset found in _value,
2202 	if successful.
2203 	It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
2204 	while it descends the tree.
2205 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
2206 	It can also return other errors to indicate that something went wrong.
2207 	Note that this doesn't work with duplicates - it will just return
2208 	B_BAD_TYPE if you call this function on a tree where duplicates are
2209 	allowed.
2210 	You need to have the inode read or write locked.
2211 */
2212 status_t
2213 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
2214 {
2215 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2216 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2217 		|| key == NULL)
2218 		RETURN_ERROR(B_BAD_VALUE);
2219 
2220 	if (fAllowDuplicates)
2221 		RETURN_ERROR(B_BAD_TYPE);
2222 
2223 	ASSERT_READ_LOCKED_INODE(fStream);
2224 
2225 	off_t nodeOffset = fHeader.RootNode();
2226 	CachedNode cached(this);
2227 	const bplustree_node* node;
2228 
2229 #ifdef DEBUG
2230 	int32 levels = 0;
2231 #endif
2232 
2233 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2234 		uint16 keyIndex = 0;
2235 		off_t nextOffset;
2236 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2237 			&nextOffset);
2238 
2239 #ifdef DEBUG
2240 		levels++;
2241 #endif
2242 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2243 			if (status == B_OK && _value != NULL)
2244 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
2245 
2246 #ifdef DEBUG
2247 			if (levels != (int32)fHeader.MaxNumberOfLevels())
2248 				DEBUGGER(("levels don't match"));
2249 #endif
2250 			return status;
2251 		} else if (nextOffset == nodeOffset)
2252 			RETURN_ERROR(B_ERROR);
2253 
2254 		nodeOffset = nextOffset;
2255 	}
2256 	FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
2257 		B_PRIdOFF "\n", nodeOffset, fStream->ID()));
2258 	RETURN_ERROR(B_ERROR);
2259 }
2260 
2261 
2262 status_t
2263 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
2264 	const uint8* largestKey, uint16 largestKeyLength,
2265 	const bplustree_node* parent)
2266 {
2267 	if (parent->CheckIntegrity(fNodeSize) != B_OK) {
2268 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
2269 			"failed!\n", fStream->ID(), offset);
2270 		check.FoundError();
2271 		return B_OK;
2272 	}
2273 	if (level >= fHeader.MaxNumberOfLevels()) {
2274 		dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
2275 			"!\n", fStream->ID(), offset);
2276 		check.FoundError();
2277 		return B_OK;
2278 	}
2279 
2280 	check.SetLevel(level);
2281 
2282 	if (check.Visited(offset)) {
2283 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
2284 			fStream->ID(), offset);
2285 		check.FoundError();
2286 		return B_OK;
2287 	}
2288 
2289 	check.SetVisited(offset);
2290 
2291 	uint32 count = parent->NumKeys();
2292 	off_t* values = parent->Values();
2293 	off_t lastOffset = check.PreviousOffset(level);
2294 	CachedNode cached(this);
2295 
2296 	for (uint32 i = 0; i < count; i++) {
2297 		uint16 keyLength;
2298 		uint8* key = parent->KeyAt(i, &keyLength);
2299 		if (largestKey != NULL) {
2300 			int result = compareKeys(fHeader.DataType(), key, keyLength,
2301 				largestKey, largestKeyLength);
2302 			if (result >= 0) {
2303 				dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
2304 					B_PRIu32 " larger than it should!\n",
2305 					fStream->ID(), offset, i);
2306 				check.FoundError();
2307 			}
2308 		}
2309 
2310 		off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
2311 		if (bplustree_node::IsDuplicate(childOffset)) {
2312 			// Walk the duplicate nodes
2313 			off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
2314 			off_t lastDuplicateOffset = BPLUSTREE_NULL;
2315 
2316 			while (duplicateOffset != BPLUSTREE_NULL) {
2317 				const bplustree_node* node;
2318 				status_t status = cached.SetTo(duplicateOffset, &node, false);
2319 				if (status != B_OK) {
2320 					if (status == B_IO_ERROR)
2321 						return B_IO_ERROR;
2322 
2323 					dprintf("inode %" B_PRIdOFF ": duplicate node at %"
2324 						B_PRIdOFF " could not be read: %s\n", fStream->ID(),
2325 						duplicateOffset, strerror(status));
2326 					check.FoundError();
2327 					break;
2328 				}
2329 
2330 				bool isFragmentNode = bplustree_node::LinkType(childOffset)
2331 					== BPLUSTREE_DUPLICATE_FRAGMENT;
2332 				bool isKnownFragment = isFragmentNode
2333 					&& check.VisitedFragment(duplicateOffset);
2334 
2335 				if (!isKnownFragment && check.Visited(duplicateOffset)) {
2336 					dprintf("inode %" B_PRIdOFF ": %s node at %"
2337 						B_PRIdOFF " already visited, referenced from %"
2338 						B_PRIdOFF "!\n", fStream->ID(),
2339 						isFragmentNode ? "fragment" : "duplicate",
2340 						duplicateOffset, offset);
2341 					check.FoundError();
2342 					break;
2343 				}
2344 
2345 				// Fragment nodes may be visited more than once from different
2346 				// places
2347 				if (!check.Visited(duplicateOffset))
2348 					check.SetVisited(duplicateOffset);
2349 				if (!isKnownFragment && isFragmentNode)
2350 					check.SetVisitedFragment(duplicateOffset);
2351 
2352 				duplicate_array* array;
2353 				int32 minSize;
2354 				int32 maxSize;
2355 				if (isFragmentNode) {
2356 					array = node->FragmentAt(
2357 						bplustree_node::FragmentIndex(childOffset));
2358 					minSize = 2;
2359 					maxSize = NUM_FRAGMENT_VALUES;
2360 				} else {
2361 					array = node->DuplicateArray();
2362 					minSize = 1;
2363 					maxSize = NUM_DUPLICATE_VALUES;
2364 				}
2365 				int32 arrayCount = array->Count();
2366 
2367 				if (arrayCount < minSize || arrayCount > maxSize) {
2368 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2369 						" has invalid array size %" B_PRId32 "!\n",
2370 						fStream->ID(), duplicateOffset, arrayCount);
2371 					check.FoundError();
2372 				} else {
2373 					// Simple check if the values in the array may be valid
2374 					for (int32 j = 0; j < arrayCount; j++) {
2375 						if (!fStream->GetVolume()->IsValidInodeBlock(
2376 								array->ValueAt(j))) {
2377 							dprintf("inode %" B_PRIdOFF ": duplicate at %"
2378 								B_PRIdOFF " contains invalid block %" B_PRIdOFF
2379 								" at %" B_PRId32 "!\n", fStream->ID(),
2380 								duplicateOffset, array->ValueAt(j), j);
2381 							check.FoundError();
2382 							break;
2383 						}
2384 					}
2385 				}
2386 
2387 				// A fragment node is not linked (and does not have valid links)
2388 				if (isFragmentNode)
2389 					break;
2390 
2391 				if (node->LeftLink() != lastDuplicateOffset) {
2392 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2393 						" has wrong left link %" B_PRIdOFF ", expected %"
2394 						B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
2395 						node->LeftLink(), lastDuplicateOffset);
2396 					check.FoundError();
2397 				}
2398 
2399 				lastDuplicateOffset = duplicateOffset;
2400 				duplicateOffset = node->RightLink();
2401 			}
2402 		} else if (!parent->IsLeaf()) {
2403 			// Test a regular child node recursively
2404 			off_t nextOffset = parent->OverflowLink();
2405 			if (i < count - 1)
2406 				nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
2407 
2408 			if (i == 0 && lastOffset != BPLUSTREE_NULL) {
2409 				// Test right link of the previous node
2410 				const bplustree_node* previous = cached.SetTo(lastOffset, true);
2411 				if (previous == NULL)
2412 					return B_IO_ERROR;
2413 
2414 				if (previous->RightLink() != childOffset) {
2415 					dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2416 						"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2417 						"!\n", fStream->ID(), lastOffset, previous->RightLink(),
2418 						childOffset);
2419 					check.FoundError();
2420 				}
2421 			}
2422 
2423 			status_t status = _ValidateChild(check, cached, level, childOffset,
2424 				lastOffset, nextOffset, key, keyLength);
2425 			if (status != B_OK)
2426 				return status;
2427 		} else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
2428 			dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
2429 				"invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
2430 				fStream->ID(), offset, childOffset, i);
2431 			check.FoundError();
2432 		}
2433 
2434 		lastOffset = childOffset;
2435 	}
2436 
2437 	if (parent->OverflowLink() != BPLUSTREE_NULL) {
2438 		off_t childOffset = parent->OverflowLink();
2439 		status_t status = _ValidateChild(check, cached, level, childOffset,
2440 			lastOffset, 0, NULL, 0);
2441 		if (status != B_OK)
2442 			return status;
2443 
2444 		lastOffset = childOffset;
2445 	}
2446 
2447 	check.SetPreviousOffset(level, lastOffset);
2448 	return B_OK;
2449 }
2450 
2451 
2452 status_t
2453 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
2454 	off_t offset, off_t lastOffset, off_t nextOffset,
2455 	const uint8* key, uint16 keyLength)
2456 {
2457 	const bplustree_node* node;
2458 	status_t status = cached.SetTo(offset, &node, true);
2459 	if (status != B_OK) {
2460 		if (status == B_IO_ERROR)
2461 			return B_IO_ERROR;
2462 
2463 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
2464 			"read: %s\n", fStream->ID(), offset, strerror(status));
2465 		check.FoundError();
2466 		return B_OK;
2467 	}
2468 
2469 	if (node->LeftLink() != lastOffset) {
2470 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2471 			"wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
2472 			"!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
2473 		check.FoundError();
2474 	}
2475 
2476 	if (nextOffset != 0 && node->RightLink() != nextOffset) {
2477 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2478 			"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2479 			"!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
2480 		check.FoundError();
2481 	}
2482 
2483 	return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
2484 }
2485 
2486 
2487 //	#pragma mark -
2488 
2489 
2490 TreeIterator::TreeIterator(BPlusTree* tree)
2491 	:
2492 	fTree(tree),
2493 	fCurrentNodeOffset(BPLUSTREE_NULL)
2494 {
2495 	tree->_AddIterator(this);
2496 }
2497 
2498 
2499 TreeIterator::~TreeIterator()
2500 {
2501 	if (fTree)
2502 		fTree->_RemoveIterator(this);
2503 }
2504 
2505 
2506 status_t
2507 TreeIterator::Goto(int8 to)
2508 {
2509 	if (fTree == NULL || fTree->fStream == NULL)
2510 		RETURN_ERROR(B_BAD_VALUE);
2511 
2512 	// lock access to stream
2513 	InodeReadLocker locker(fTree->fStream);
2514 
2515 	off_t nodeOffset = fTree->fHeader.RootNode();
2516 	CachedNode cached(fTree);
2517 	const bplustree_node* node;
2518 
2519 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2520 		// is the node a leaf node?
2521 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2522 			fCurrentNodeOffset = nodeOffset;
2523 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
2524 			fDuplicateNode = BPLUSTREE_NULL;
2525 
2526 			return B_OK;
2527 		}
2528 
2529 		// get the next node offset depending on the direction (and if there
2530 		// are any keys in that node at all)
2531 		off_t nextOffset;
2532 		if (to == BPLUSTREE_END || node->all_key_count == 0)
2533 			nextOffset = node->OverflowLink();
2534 		else {
2535 			if (node->AllKeyLength() > fTree->fNodeSize
2536 				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
2537 					- 8 * node->NumKeys())
2538 				RETURN_ERROR(B_ERROR);
2539 
2540 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
2541 		}
2542 		if (nextOffset == nodeOffset)
2543 			break;
2544 
2545 		nodeOffset = nextOffset;
2546 	}
2547 	FATAL(("%s fails\n", __FUNCTION__));
2548 
2549 	RETURN_ERROR(B_ERROR);
2550 }
2551 
2552 
2553 /*!	Iterates through the tree in the specified direction.
2554 	When it iterates through duplicates, the "key" is only updated for the
2555 	first entry - if you need to know when this happens, use the "duplicate"
2556 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
2557 	the other duplicates.
2558 	That's not too nice, but saves the 256 bytes that would be needed to
2559 	store the last key - if this will ever become an issue, it will be
2560 	easy to change.
2561 	The other advantage of this is, that the queries can skip all duplicates
2562 	at once when they are not relevant to them.
2563 */
2564 status_t
2565 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
2566 	uint16 maxLength, off_t* value, uint16* duplicate)
2567 {
2568 	if (fTree == NULL)
2569 		return B_INTERRUPTED;
2570 
2571 	bool forward = direction == BPLUSTREE_FORWARD;
2572 
2573 	if (fCurrentNodeOffset == BPLUSTREE_NULL
2574 		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
2575 		RETURN_ERROR(B_ERROR);
2576 
2577 	// if the tree was emptied since the last call
2578 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
2579 		return B_ENTRY_NOT_FOUND;
2580 
2581 	// lock access to stream
2582 	InodeReadLocker locker(fTree->fStream);
2583 
2584 	CachedNode cached(fTree);
2585 	const bplustree_node* node;
2586 
2587 	if (fDuplicateNode != BPLUSTREE_NULL) {
2588 		// regardless of traverse direction the duplicates are always presented
2589 		// in the same order; since they are all considered as equal, this
2590 		// shouldn't cause any problems
2591 
2592 		if (!fIsFragment || fDuplicate < fNumDuplicates) {
2593 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2594 				false);
2595 		} else
2596 			node = NULL;
2597 
2598 		if (node != NULL) {
2599 			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
2600 				// If the node is out of duplicates, we go directly to the next
2601 				// one
2602 				fDuplicateNode = node->RightLink();
2603 				if (fDuplicateNode != BPLUSTREE_NULL
2604 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
2605 					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
2606 						false);
2607 					fDuplicate = 0;
2608 				}
2609 			}
2610 			if (fDuplicate < fNumDuplicates) {
2611 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
2612 					fDuplicate++);
2613 				if (duplicate)
2614 					*duplicate = 2;
2615 				return B_OK;
2616 			}
2617 		}
2618 		fDuplicateNode = BPLUSTREE_NULL;
2619 	}
2620 
2621 	off_t savedNodeOffset = fCurrentNodeOffset;
2622 	int32 savedKey = fCurrentKey;
2623 
2624 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
2625 		RETURN_ERROR(B_ERROR);
2626 
2627 	if (duplicate)
2628 		*duplicate = 0;
2629 
2630 	fCurrentKey += direction;
2631 
2632 	// is the current key in the current node?
2633 	while ((forward && fCurrentKey >= node->NumKeys())
2634 			|| (!forward && fCurrentKey < 0)) {
2635 		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2636 
2637 		// are there any more nodes?
2638 		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2639 			node = cached.SetTo(fCurrentNodeOffset);
2640 			if (!node)
2641 				RETURN_ERROR(B_ERROR);
2642 
2643 			// reset current key
2644 			fCurrentKey = forward ? 0 : node->NumKeys();
2645 		} else {
2646 			// there are no nodes left, so turn back to the last key
2647 			fCurrentNodeOffset = savedNodeOffset;
2648 			fCurrentKey = savedKey;
2649 
2650 			return B_ENTRY_NOT_FOUND;
2651 		}
2652 	}
2653 
2654 	if (node->all_key_count == 0)
2655 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
2656 
2657 	uint16 length;
2658 	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
2659 	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
2660 			> (uint8*)node + fTree->fNodeSize
2661 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2662 		fTree->fStream->GetVolume()->Panic();
2663 		RETURN_ERROR(B_BAD_DATA);
2664 	}
2665 
2666 	length = min_c(length, maxLength);
2667 	memcpy(key, keyStart, length);
2668 
2669 	if (fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE)	{
2670 		// terminate string type
2671 		if (length == maxLength)
2672 			length--;
2673 		((char*)key)[length] = '\0';
2674 	}
2675 	*keyLength = length;
2676 
2677 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2678 
2679 	// duplicate fragments?
2680 	uint8 type = bplustree_node::LinkType(offset);
2681 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
2682 		|| type == BPLUSTREE_DUPLICATE_NODE) {
2683 		fDuplicateNode = offset;
2684 
2685 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2686 			false);
2687 		if (node == NULL)
2688 			RETURN_ERROR(B_ERROR);
2689 
2690 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2691 
2692 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2693 		if (fNumDuplicates) {
2694 			offset = node->DuplicateAt(offset, fIsFragment, 0);
2695 			fDuplicate = 1;
2696 			if (duplicate)
2697 				*duplicate = 1;
2698 		} else {
2699 			// Shouldn't happen, but we're dealing here with potentially
2700 			// corrupt disks...
2701 			fDuplicateNode = BPLUSTREE_NULL;
2702 			offset = 0;
2703 		}
2704 	}
2705 	*value = offset;
2706 
2707 	return B_OK;
2708 }
2709 
2710 
2711 /*!	This is more or less a copy of BPlusTree::Find() - but it just
2712 	sets the current position in the iterator, regardless of if the
2713 	key could be found or not.
2714 */
2715 status_t
2716 TreeIterator::Find(const uint8* key, uint16 keyLength)
2717 {
2718 	if (fTree == NULL)
2719 		return B_INTERRUPTED;
2720 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2721 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2722 		|| key == NULL)
2723 		RETURN_ERROR(B_BAD_VALUE);
2724 
2725 	// lock access to stream
2726 	InodeReadLocker locker(fTree->fStream);
2727 
2728 	off_t nodeOffset = fTree->fHeader.RootNode();
2729 
2730 	CachedNode cached(fTree);
2731 	const bplustree_node* node;
2732 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2733 		uint16 keyIndex = 0;
2734 		off_t nextOffset;
2735 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2736 			&nextOffset);
2737 
2738 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2739 			fCurrentNodeOffset = nodeOffset;
2740 			fCurrentKey = keyIndex - 1;
2741 			fDuplicateNode = BPLUSTREE_NULL;
2742 
2743 			return status;
2744 		} else if (nextOffset == nodeOffset)
2745 			RETURN_ERROR(B_ERROR);
2746 
2747 		nodeOffset = nextOffset;
2748 	}
2749 	RETURN_ERROR(B_ERROR);
2750 }
2751 
2752 
2753 void
2754 TreeIterator::SkipDuplicates()
2755 {
2756 	fDuplicateNode = BPLUSTREE_NULL;
2757 }
2758 
2759 
2760 void
2761 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
2762 	uint16 splitAt, int8 change)
2763 {
2764 	if (offset != fCurrentNodeOffset)
2765 		return;
2766 
2767 	if (nextOffset != BPLUSTREE_NULL) {
2768 		fCurrentNodeOffset = nextOffset;
2769 		if (splitAt <= fCurrentKey) {
2770 			fCurrentKey -= splitAt;
2771 			keyIndex -= splitAt;
2772 		}
2773 	}
2774 
2775 	// Adjust fCurrentKey to point to the same key as before.
2776 	// Note, that if a key is inserted at the current position
2777 	// it won't be included in this tree transition.
2778 	if (keyIndex <= fCurrentKey)
2779 		fCurrentKey += change;
2780 
2781 	// TODO: duplicate handling!
2782 }
2783 
2784 
2785 void
2786 TreeIterator::Stop()
2787 {
2788 	fTree = NULL;
2789 }
2790 
2791 
2792 #ifdef DEBUG
2793 void
2794 TreeIterator::Dump()
2795 {
2796 	__out("TreeIterator at %p:\n", this);
2797 	__out("\tfTree = %p\n", fTree);
2798 	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
2799 	__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
2800 	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n",
2801 		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
2802 		fDuplicateNode);
2803 	__out("\tfDuplicate = %u\n", fDuplicate);
2804 	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
2805 	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
2806 }
2807 #endif
2808 
2809 
2810 // #pragma mark -
2811 
2812 
2813 bool
2814 bplustree_header::IsValid() const
2815 {
2816 	return Magic() == BPLUSTREE_MAGIC
2817 		&& (RootNode() % NodeSize()) == 0
2818 		&& IsValidLink(RootNode())
2819 		&& IsValidLink(FreeNode());
2820 }
2821 
2822 
2823 // #pragma mark -
2824 
2825 
2826 void
2827 bplustree_node::Initialize()
2828 {
2829 	left_link = right_link = overflow_link
2830 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2831 	all_key_count = 0;
2832 	all_key_length = 0;
2833 }
2834 
2835 
2836 uint8*
2837 bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2838 {
2839 	if (index < 0 || index > NumKeys())
2840 		return NULL;
2841 
2842 	uint8* keyStart = Keys();
2843 	uint16* keyLengths = KeyLengths();
2844 
2845 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2846 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2847 	if (index > 0)
2848 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2849 
2850 	return keyStart;
2851 }
2852 
2853 
2854 uint8
2855 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2856 {
2857 	// the duplicate fragment handling is currently hard-coded to a node size
2858 	// of 1024 bytes - with future versions of BFS, this may be a problem
2859 
2860 	if (isFragment) {
2861 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2862 
2863 		return ((off_t*)this)[fragment];
2864 	}
2865 	return OverflowLink();
2866 }
2867 
2868 
2869 off_t
2870 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2871 {
2872 	uint32 start;
2873 	if (isFragment)
2874 		start = 8 * ((uint64)offset & 0x3ff);
2875 	else
2876 		start = 2;
2877 
2878 	return ((off_t*)this)[start + 1 + index];
2879 }
2880 
2881 
2882 /*!	Although the name suggests it, this function doesn't return the real
2883 	used fragment count; at least, it can only count to two: it returns
2884 	0, if there is no fragment used, 1 if there is only one fragment
2885 	used, and 2 if there are at least 2 fragments used.
2886 */
2887 uint32
2888 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2889 {
2890 	uint32 used = 0;
2891 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2892 		duplicate_array* array = FragmentAt(i);
2893 		if (array->Count() > 0 && ++used > 1)
2894 			return used;
2895 	}
2896 	return used;
2897 }
2898 
2899 
2900 status_t
2901 bplustree_node::CheckIntegrity(uint32 nodeSize) const
2902 {
2903 	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2904 		DEBUGGER(("invalid node: key/length count"));
2905 
2906 	for (int32 i = 0; i < NumKeys(); i++) {
2907 		uint16 length;
2908 		uint8* key = KeyAt(i, &length);
2909 		if (key + length + sizeof(off_t) + sizeof(uint16)
2910 				> (uint8*)this + nodeSize
2911 			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2912 			dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
2913 			return B_BAD_DATA;
2914 		}
2915 		if (Values()[i] == -1) {
2916 			dprintf("invalid node %p, value %d: %lld: values corrupted\n",
2917 				this, (int)i, Values()[i]);
2918 			return B_BAD_DATA;
2919 		}
2920 	}
2921 	return B_OK;
2922 }
2923 
2924 
2925 // #pragma mark -
2926 
2927 
2928 BitmapArray::BitmapArray(size_t numBits)
2929 {
2930 	fSize = (numBits + 7) / 8;
2931 	fBitmap = (uint8*)calloc(fSize, 1);
2932 	fCountSet = 0;
2933 }
2934 
2935 
2936 BitmapArray::~BitmapArray()
2937 {
2938 	free(fBitmap);
2939 }
2940 
2941 
2942 status_t
2943 BitmapArray::InitCheck() const
2944 {
2945 	return fBitmap != NULL ? B_OK : B_NO_MEMORY;
2946 }
2947 
2948 
2949 bool
2950 BitmapArray::IsSet(size_t index) const
2951 {
2952 	uint32 byteIndex = index / 8;
2953 	if (byteIndex >= fSize)
2954 		return false;
2955 
2956 	return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
2957 }
2958 
2959 
2960 void
2961 BitmapArray::Set(size_t index, bool set)
2962 {
2963 	uint32 byteIndex = index / 8;
2964 	if (byteIndex >= fSize)
2965 		return;
2966 
2967 	if (set) {
2968 		fBitmap[byteIndex] |= 1UL << (index & 0x7);
2969 		fCountSet++;
2970 	} else {
2971 		fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
2972 		fCountSet--;
2973 	}
2974 }
2975 
2976 
2977 // #pragma mark -
2978 
2979 
2980 bool
2981 duplicate_array::_FindInternal(off_t value, int32& index) const
2982 {
2983 	int32 min = 0, max = Count() - 1;
2984 	off_t cmp;
2985 	while (min <= max) {
2986 		index = (min + max) / 2;
2987 
2988 		cmp = ValueAt(index) - value;
2989 		if (cmp < 0)
2990 			min = index + 1;
2991 		else if (cmp > 0)
2992 			max = index - 1;
2993 		else
2994 			return true;
2995 	}
2996 	return false;
2997 }
2998 
2999 
3000 void
3001 duplicate_array::Insert(off_t value)
3002 {
3003 	// if there are more than 8 values in this array, use a
3004 	// binary search, if not, just iterate linearly to find
3005 	// the insertion point
3006 	int32 size = Count();
3007 	int32 i;
3008 	if (size > 8 ) {
3009 		if (!_FindInternal(value, i) && ValueAt(i) <= value)
3010 			i++;
3011 	} else {
3012 		for (i = 0; i < size; i++) {
3013 			if (ValueAt(i) > value)
3014 				break;
3015 		}
3016 	}
3017 
3018 	memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
3019 	values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
3020 	count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
3021 }
3022 
3023 
3024 bool
3025 duplicate_array::Remove(off_t value)
3026 {
3027 	int32 index = Find(value);
3028 	if (index == -1)
3029 		return false;
3030 
3031 	int32 newSize = Count() - 1;
3032 	memmove(&values[index], &values[index + 1],
3033 		(newSize - index) * sizeof(off_t));
3034 	count = HOST_ENDIAN_TO_BFS_INT64(newSize);
3035 
3036 	return true;
3037 }
3038 
3039 
3040 // #pragma mark -
3041 
3042 
3043 int32
3044 compareKeys(type_code type, const void* key1, int keyLength1,
3045 	const void* key2, int keyLength2)
3046 {
3047 	// if one of the keys is NULL, bail out gracefully
3048 	if (key1 == NULL || key2 == NULL) {
3049 		// even if it's most probably a bug in the calling code, we try to
3050 		// give a meaningful result
3051 		if (key1 == NULL && key2 != NULL)
3052 			return 1;
3053 		if (key2 == NULL && key1 != NULL)
3054 			return -1;
3055 
3056 		return 0;
3057 	}
3058 
3059 	switch (type) {
3060 	    case B_STRING_TYPE:
3061     	{
3062 			int result = memcmp(key1, key2, min_c(keyLength1, keyLength2));
3063 			if (result == 0) {
3064 				// ignore trailing null bytes
3065 				if ((keyLength1 == keyLength2 + 1
3066 						&& ((uint8*)key1)[keyLength2] == '\0')
3067 					|| (keyLength2 == keyLength1 + 1
3068 						&& ((uint8*)key2)[keyLength1] == '\0'))
3069 					return 0;
3070 
3071 				result = keyLength1 - keyLength2;
3072 			}
3073 
3074 			return result;
3075 		}
3076 
3077 		case B_SSIZE_T_TYPE:
3078 		case B_INT32_TYPE:
3079 			return *(int32*)key1 - *(int32*)key2;
3080 
3081 		case B_SIZE_T_TYPE:
3082 		case B_UINT32_TYPE:
3083 			if (*(uint32*)key1 == *(uint32*)key2)
3084 				return 0;
3085 			if (*(uint32*)key1 > *(uint32*)key2)
3086 				return 1;
3087 
3088 			return -1;
3089 
3090 		case B_OFF_T_TYPE:
3091 		case B_INT64_TYPE:
3092 			if (*(int64*)key1 == *(int64*)key2)
3093 				return 0;
3094 			if (*(int64*)key1 > *(int64*)key2)
3095 				return 1;
3096 
3097 			return -1;
3098 
3099 		case B_UINT64_TYPE:
3100 			if (*(uint64*)key1 == *(uint64*)key2)
3101 				return 0;
3102 			if (*(uint64*)key1 > *(uint64*)key2)
3103 				return 1;
3104 
3105 			return -1;
3106 
3107 		case B_FLOAT_TYPE:
3108 		{
3109 			float result = *(float*)key1 - *(float*)key2;
3110 			if (result == 0.0f)
3111 				return 0;
3112 
3113 			return (result < 0.0f) ? -1 : 1;
3114 		}
3115 
3116 		case B_DOUBLE_TYPE:
3117 		{
3118 			double result = *(double*)key1 - *(double*)key2;
3119 			if (result == 0.0)
3120 				return 0;
3121 
3122 			return (result < 0.0) ? -1 : 1;
3123 		}
3124 	}
3125 
3126 	// if the type is unknown, the entries don't match...
3127 	return -1;
3128 }
3129 
3130