xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 0754c319592cd8a523959d85fb06ab23c64a98a6)
1 /*
2  * Copyright 2001-2014, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 
10 //! B+Tree implementation
11 
12 
13 #include "BPlusTree.h"
14 
15 #include <file_systems/QueryParserUtils.h>
16 
17 #include "Debug.h"
18 #include "Utility.h"
19 
20 #if !_BOOT_MODE
21 #	include "Inode.h"
22 #else
23 #	include "Stream.h"
24 
25 // BFS::Stream from the bootloader has the same API as Inode.
26 #	define Inode BFS::Stream
27 
28 #	define strerror(x)		"error message unavailable"
29 
30 namespace BFS {
31 #endif
32 
33 
34 /*!	Simple array used for the duplicate handling in the B+Tree. This is an
35 	on disk structure.
36 */
37 struct duplicate_array {
38 	off_t	count;
39 	off_t	values[0];
40 
41 	inline bool IsEmpty() const
42 	{
43 		return count == 0;
44 	}
45 
46 	inline int32 Count() const
47 	{
48 		return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
49 	}
50 
51 	inline off_t ValueAt(uint32 index) const
52 	{
53 		return BFS_ENDIAN_TO_HOST_INT64(values[index]);
54 	}
55 
56 	inline void SetValueAt(uint32 index, off_t value)
57 	{
58 		values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
59 	}
60 
61 	inline int32 Find(off_t value) const
62 	{
63 		int32 i;
64 		return _FindInternal(value, i) ? i : -1;
65 	}
66 
67 	void Insert(off_t value);
68 	bool Remove(off_t value);
69 
70 private:
71 	bool _FindInternal(off_t value, int32& index) const;
72 } _PACKED;
73 
74 
75 #ifdef DEBUG
76 class NodeChecker {
77 public:
78 	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
79 		:
80 		fNode(node),
81 		fSize(nodeSize),
82 		fText(text)
83 	{
84 		Check("integrity check failed on construction.");
85 	}
86 
87 	~NodeChecker()
88 	{
89 		Check("integrity check failed on destruction.");
90 	}
91 
92 	void
93 	Check(const char* message)
94 	{
95 		if (fNode->CheckIntegrity(fSize) != B_OK) {
96 			dprintf("%s: %s\n", fText, message);
97 			DEBUGGER(("NodeChecker integrity check failed!"));
98 		}
99 	}
100 
101 private:
102 	const bplustree_node*	fNode;
103 	int32					fSize;
104 	const char*				fText;
105 };
106 #endif // DEBUG
107 
108 
109 #if !_BOOT_MODE
110 class BitmapArray {
111 public:
112 								BitmapArray(size_t numBits);
113 								~BitmapArray();
114 
115 			status_t			InitCheck() const;
116 
117 			bool				IsSet(size_t index) const;
118 			void				Set(size_t index, bool set);
119 
120 			size_t				CountSet() const { return fCountSet; }
121 
122 private:
123 			uint8*				fBitmap;
124 			size_t				fSize;
125 			size_t				fCountSet;
126 };
127 
128 
129 struct TreeCheck {
130 	TreeCheck(BPlusTree* tree)
131 		:
132 		fLevelCount(0),
133 		fFreeCount(0),
134 		fNodeSize(tree->NodeSize()),
135 		fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
136 		fFoundErrors(0),
137 		fVisited(tree->Stream()->Size() / tree->NodeSize()),
138 		fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
139 	{
140 		fPreviousOffsets = (off_t*)malloc(
141 			sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
142 		if (fPreviousOffsets != NULL) {
143 			for (size_t i = 0; i < fMaxLevels; i++)
144 				fPreviousOffsets[i] = BPLUSTREE_NULL;
145 		}
146 	}
147 
148 	~TreeCheck()
149 	{
150 		free(fPreviousOffsets);
151 	}
152 
153 	status_t InitCheck() const
154 	{
155 		if (fPreviousOffsets == NULL)
156 			return B_NO_MEMORY;
157 
158 		status_t status = fVisited.InitCheck();
159 		if (status != B_OK)
160 			return status;
161 
162 		return fVisitedFragment.InitCheck();
163 	}
164 
165 	bool Visited(off_t offset) const
166 	{
167 		return fVisited.IsSet(offset / fNodeSize);
168 	}
169 
170 	void SetVisited(off_t offset)
171 	{
172 		fVisited.Set(offset / fNodeSize, true);
173 	}
174 
175 	size_t VisitedCount() const
176 	{
177 		return fVisited.CountSet();
178 	}
179 
180 	bool VisitedFragment(off_t offset) const
181 	{
182 		return fVisitedFragment.IsSet(offset / fNodeSize);
183 	}
184 
185 	void SetVisitedFragment(off_t offset)
186 	{
187 		fVisitedFragment.Set(offset / fNodeSize, true);
188 	}
189 
190 	uint32 MaxLevels() const
191 	{
192 		return fLevelCount;
193 	}
194 
195 	void SetLevel(uint32 level)
196 	{
197 		if (fLevelCount < level)
198 			fLevelCount = level;
199 	}
200 
201 	off_t PreviousOffset(uint32 level)
202 	{
203 		return fPreviousOffsets[level];
204 	}
205 
206 	void SetPreviousOffset(uint32 level, off_t offset)
207 	{
208 		fPreviousOffsets[level] = offset;
209 	}
210 
211 	void FoundError()
212 	{
213 		fFoundErrors++;
214 	}
215 
216 	bool ErrorsFound()
217 	{
218 		return fFoundErrors != 0;
219 	}
220 
221 private:
222 			uint32				fLevelCount;
223 			uint32				fFreeCount;
224 			uint32				fNodeSize;
225 			uint32				fMaxLevels;
226 			uint32				fFoundErrors;
227 			BitmapArray			fVisited;
228 			BitmapArray			fVisitedFragment;
229 			off_t*				fPreviousOffsets;
230 };
231 
232 
233 // #pragma mark -
234 
235 
236 // Node Caching for the BPlusTree class
237 //
238 // With write support, there is the need for a function that allocates new
239 // nodes by either returning empty nodes, or by growing the file's data stream
240 //
241 // !! The CachedNode class assumes that you have properly locked the stream
242 // !! before asking for nodes.
243 //
244 // Note: This code will fail if the block size is smaller than the node size!
245 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
246 // is hard-coded to 1024 bytes, that's not an issue now.
247 
248 void
249 CachedNode::UnsetUnchanged(Transaction& transaction)
250 {
251 	if (fTree == NULL || fTree->fStream == NULL)
252 		return;
253 
254 	if (fNode != NULL) {
255 		void* cache = fTree->fStream->GetVolume()->BlockCache();
256 
257 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
258 		block_cache_put(cache, fBlockNumber);
259 		fNode = NULL;
260 	}
261 }
262 #endif // !_BOOT_MODE
263 
264 
265 void
266 CachedNode::Unset()
267 {
268 	if (fTree == NULL || fTree->fStream == NULL)
269 		return;
270 
271 	if (fNode != NULL) {
272 #if !_BOOT_MODE
273 		if (fWritable && fOffset == 0) {
274 			// The B+tree header has been updated - we need to update the
275 			// BPlusTrees copy of it, as well.
276 			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
277 		}
278 
279 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
280 			fBlockNumber);
281 #endif // !_BOOT_MODE
282 
283 		fNode = NULL;
284 	}
285 }
286 
287 
288 const bplustree_node*
289 CachedNode::SetTo(off_t offset, bool check)
290 {
291 	const bplustree_node* node;
292 	if (SetTo(offset, &node, check) == B_OK)
293 		return node;
294 
295 	return NULL;
296 }
297 
298 
299 status_t
300 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
301 {
302 	if (fTree == NULL || fTree->fStream == NULL)
303 		RETURN_ERROR(B_BAD_VALUE);
304 
305 	Unset();
306 
307 	// You can only ask for nodes at valid positions - you can't
308 	// even access the b+tree header with this method (use SetToHeader()
309 	// instead)
310 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
311 		|| offset <= 0
312 		|| (offset % fTree->fNodeSize) != 0) {
313 		RETURN_ERROR(B_BAD_VALUE);
314 	}
315 
316 	if (InternalSetTo(NULL, offset) != NULL && check) {
317 		// sanity checks (links, all_key_count)
318 		if (!fTree->fHeader.CheckNode(fNode)) {
319 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
320 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
321 				fBlockNumber, fTree->fStream->ID()));
322 			return B_BAD_DATA;
323 		}
324 	}
325 
326 	*_node = fNode;
327 	return B_OK;
328 }
329 
330 
331 #if !_BOOT_MODE
332 bplustree_node*
333 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
334 {
335 	if (fTree == NULL || fTree->fStream == NULL) {
336 		REPORT_ERROR(B_BAD_VALUE);
337 		return NULL;
338 	}
339 
340 	Unset();
341 
342 	// You can only ask for nodes at valid positions - you can't
343 	// even access the b+tree header with this method (use SetToHeader()
344 	// instead)
345 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
346 		|| offset <= 0
347 		|| (offset % fTree->fNodeSize) != 0)
348 		return NULL;
349 
350 	if (InternalSetTo(&transaction, offset) != NULL && check) {
351 		// sanity checks (links, all_key_count)
352 		if (!fTree->fHeader.CheckNode(fNode)) {
353 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
354 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
355 				fBlockNumber, fTree->fStream->ID()));
356 			return NULL;
357 		}
358 	}
359 	return fNode;
360 }
361 
362 
363 bplustree_node*
364 CachedNode::MakeWritable(Transaction& transaction)
365 {
366 	if (fNode == NULL)
367 		return NULL;
368 
369 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
370 			fBlockNumber, transaction.ID()) == B_OK) {
371 		return fNode;
372 	}
373 
374 	return NULL;
375 }
376 #endif // !_BOOT_MODE
377 
378 
379 const bplustree_header*
380 CachedNode::SetToHeader()
381 {
382 	if (fTree == NULL || fTree->fStream == NULL) {
383 		REPORT_ERROR(B_BAD_VALUE);
384 		return NULL;
385 	}
386 
387 	Unset();
388 
389 	InternalSetTo(NULL, 0LL);
390 	return (bplustree_header*)fNode;
391 }
392 
393 
394 #if !_BOOT_MODE
395 bplustree_header*
396 CachedNode::SetToWritableHeader(Transaction& transaction)
397 {
398 	if (fTree == NULL || fTree->fStream == NULL) {
399 		REPORT_ERROR(B_BAD_VALUE);
400 		return NULL;
401 	}
402 
403 	Unset();
404 
405 	InternalSetTo(&transaction, 0LL);
406 
407 	if (fNode != NULL && !fTree->fInTransaction) {
408 		transaction.AddListener(fTree);
409 		fTree->fInTransaction = true;
410 
411 		if (!transaction.GetVolume()->IsInitializing()) {
412 			acquire_vnode(transaction.GetVolume()->FSVolume(),
413 				fTree->fStream->ID());
414 		}
415 	}
416 
417 	return (bplustree_header*)fNode;
418 }
419 #endif // !_BOOT_MODE
420 
421 
422 bplustree_node*
423 CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
424 {
425 	fNode = NULL;
426 	fOffset = offset;
427 
428 	off_t fileOffset;
429 	block_run run;
430 	if (offset < fTree->fStream->Size()
431 		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
432 
433 #if !_BOOT_MODE
434 		Volume* volume = fTree->fStream->GetVolume();
435 #else
436 		Volume* volume = &fTree->fStream->GetVolume();
437 #endif
438 
439 		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
440 		fBlockNumber = volume->ToBlock(run) + blockOffset;
441 		uint8* block = NULL;
442 
443 #if !_BOOT_MODE
444 		if (transaction != NULL) {
445 			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
446 				fBlockNumber, transaction->ID());
447 			fWritable = true;
448 		} else {
449 			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
450 			fWritable = false;
451 		}
452 #else // !_BOOT_MODE
453 		if (fBlock == NULL) {
454 			fBlock = (uint8*)malloc(volume->BlockSize());
455 			if (fBlock == NULL)
456 				return NULL;
457 		}
458 
459 		if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(),
460 				fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
461 			block = fBlock;
462 		}
463 
464 		fWritable = false;
465 #endif // _BOOT_MODE
466 
467 		if (block != NULL) {
468 			// The node is somewhere in that block...
469 			// (confusing offset calculation)
470 			fNode = (bplustree_node*)(block + offset
471 				- (fileOffset + (blockOffset << volume->BlockShift())));
472 		} else
473 			REPORT_ERROR(B_IO_ERROR);
474 	}
475 	return fNode;
476 }
477 
478 
479 #if !_BOOT_MODE
480 status_t
481 CachedNode::Free(Transaction& transaction, off_t offset)
482 {
483 	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
484 		RETURN_ERROR(B_BAD_VALUE);
485 
486 	// TODO: scan the free nodes list and remove all nodes at the end
487 	// of the tree - perhaps that shouldn't be done everytime that
488 	// function is called, perhaps it should be done when the directory
489 	// inode is closed or based on some calculation or whatever...
490 
491 	CachedNode cached(fTree);
492 	bplustree_header* header = cached.SetToWritableHeader(transaction);
493 	if (header == NULL)
494 		return B_IO_ERROR;
495 
496 #if 0
497 	// TODO: temporarily disabled because CheckNode() doesn't like this...
498 	// 		Also, it's such an edge case that it's almost useless, anyway.
499 	// if the node is the last one in the tree, we shrink
500 	// the tree and file size by one node
501 	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
502 	if (offset == lastOffset) {
503 		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
504 		if (status != B_OK)
505 			return status;
506 
507 		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
508 		return B_OK;
509 	}
510 #endif
511 
512 	// add the node to the free nodes list
513 	fNode->left_link = header->free_node_pointer;
514 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
515 
516 	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
517 	return B_OK;
518 }
519 
520 
521 status_t
522 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
523 	off_t* _offset)
524 {
525 	if (fTree == NULL || fTree->fStream == NULL)
526 		RETURN_ERROR(B_BAD_VALUE);
527 
528 	Unset();
529 
530 	bplustree_header* header;
531 	status_t status;
532 
533 	// if there are any free nodes, recycle them
534 	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
535 		CachedNode cached(fTree);
536 		header = cached.SetToWritableHeader(transaction);
537 		if (header == NULL)
538 			return B_IO_ERROR;
539 
540 		*_offset = header->FreeNode();
541 		*_node = fNode;
542 
543 		// set new free node pointer
544 		header->free_node_pointer = fNode->left_link;
545 		fNode->Initialize();
546 		return B_OK;
547 	}
548 
549 	// allocate space for a new node
550 	Inode* stream = fTree->fStream;
551 	if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
552 		return status;
553 
554 	CachedNode cached(fTree);
555 	header = cached.SetToWritableHeader(transaction);
556 	if (header == NULL)
557 		return B_IO_ERROR;
558 
559 	// the maximum_size has to be changed before the call to SetTo() - or
560 	// else it will fail because the requested node is out of bounds
561 	off_t offset = header->MaximumSize();
562 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
563 		+ fTree->fNodeSize);
564 
565 	cached.Unset();
566 		// SetToWritable() below needs the new values in the tree's header
567 
568 	if (SetToWritable(transaction, offset, false) == NULL)
569 		RETURN_ERROR(B_ERROR);
570 
571 	fNode->Initialize();
572 
573 	*_offset = offset;
574 	*_node = fNode;
575 	return B_OK;
576 }
577 
578 
579 //	#pragma mark -
580 
581 
582 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
583 	:
584 	fStream(NULL),
585 	fInTransaction(false)
586 {
587 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
588 	SetTo(transaction, stream);
589 }
590 #endif // !_BOOT_MODE
591 
592 
593 BPlusTree::BPlusTree(Inode* stream)
594 	:
595 	fStream(NULL),
596 	fInTransaction(false)
597 {
598 #if !_BOOT_MODE
599 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
600 #endif
601 
602 	SetTo(stream);
603 }
604 
605 
606 BPlusTree::BPlusTree()
607 	:
608 	fStream(NULL),
609 	fNodeSize(BPLUSTREE_NODE_SIZE),
610 	fAllowDuplicates(true),
611 	fInTransaction(false),
612 	fStatus(B_NO_INIT)
613 {
614 #if !_BOOT_MODE
615 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
616 #endif
617 }
618 
619 
620 BPlusTree::~BPlusTree()
621 {
622 #if !_BOOT_MODE
623 	// if there are any TreeIterators left, we need to stop them
624 	// (can happen when the tree's inode gets deleted while
625 	// traversing the tree - a TreeIterator doesn't lock the inode)
626 	mutex_lock(&fIteratorLock);
627 
628 	SinglyLinkedList<TreeIterator>::Iterator iterator
629 		= fIterators.GetIterator();
630 	while (iterator.HasNext())
631 		iterator.Next()->Stop();
632 
633 	mutex_destroy(&fIteratorLock);
634 
635 	ASSERT(!fInTransaction);
636 #endif // !_BOOT_MODE
637 }
638 
639 
640 #if !_BOOT_MODE
641 /*! Create a new B+Tree on the specified stream */
642 status_t
643 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
644 {
645 	// initializes in-memory B+Tree
646 
647 	fStream = stream;
648 
649 	CachedNode cached(this);
650 	bplustree_header* header = cached.SetToWritableHeader(transaction);
651 	if (header == NULL) {
652 		// allocate space for new header + node!
653 		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
654 		if (fStatus != B_OK)
655 			RETURN_ERROR(fStatus);
656 
657 		header = cached.SetToWritableHeader(transaction);
658 		if (header == NULL)
659 			RETURN_ERROR(fStatus = B_ERROR);
660 	}
661 
662 	fAllowDuplicates = stream->IsIndex()
663 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
664 
665 	fNodeSize = nodeSize;
666 
667 	// initialize b+tree header
668  	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
669  	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
670  	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
671  	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
672  	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
673  	header->free_node_pointer
674  		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
675  	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
676 
677 	cached.Unset();
678 
679 	// initialize b+tree root node
680 	cached.SetToWritable(transaction, fHeader.RootNode(), false);
681 	if (cached.Node() == NULL)
682 		RETURN_ERROR(B_IO_ERROR);
683 
684 	cached.Node()->Initialize();
685 
686 	return fStatus = B_OK;
687 }
688 #endif // !_BOOT_MODE
689 
690 
691 status_t
692 BPlusTree::SetTo(Inode* stream)
693 {
694 	if (stream == NULL)
695 		RETURN_ERROR(fStatus = B_BAD_VALUE);
696 
697 	fStream = stream;
698 
699 	// get on-disk B+Tree header
700 
701 	CachedNode cached(this);
702 	const bplustree_header* header = cached.SetToHeader();
703 	if (header != NULL)
704 		memcpy(&fHeader, header, sizeof(bplustree_header));
705 	else
706 		RETURN_ERROR(fStatus = B_IO_ERROR);
707 
708 	// is header valid?
709 
710 	if (fHeader.MaximumSize() != stream->Size()) {
711 		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
712 			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
713 		// we can't change the header since we don't have a transaction
714 		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
715 	}
716 	if (!fHeader.IsValid()) {
717 #ifdef DEBUG
718 		dump_bplustree_header(&fHeader);
719 		dump_block((const char*)&fHeader, 128);
720 #endif
721 		RETURN_ERROR(fStatus = B_BAD_DATA);
722 	}
723 
724 	fNodeSize = fHeader.NodeSize();
725 
726 	// validity check
727 	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
728 		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
729 		S_DOUBLE_INDEX};
730 	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
731 		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
732 		| S_FLOAT_INDEX | S_DOUBLE_INDEX);
733 
734 	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
735 		|| ((stream->Mode() & S_INDEX_DIR) != 0
736 			&& kToMode[fHeader.DataType()] != mode)
737 		|| !stream->IsContainer()) {
738 		D(	dump_bplustree_header(&fHeader);
739 			dump_inode(&stream->Node());
740 		);
741 		RETURN_ERROR(fStatus = B_BAD_TYPE);
742 	}
743 
744 	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
745 	// in the original BFS code - we will honour it nevertheless
746 	fAllowDuplicates = stream->IsIndex()
747 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
748 
749 	cached.SetTo(fHeader.RootNode());
750 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
751 }
752 
753 
754 status_t
755 BPlusTree::InitCheck()
756 {
757 	return fStatus;
758 }
759 
760 
761 #if !_BOOT_MODE
762 status_t
763 BPlusTree::Validate(bool repair, bool& _errorsFound)
764 {
765 	TreeCheck check(this);
766 	if (check.InitCheck() != B_OK)
767 		return B_NO_MEMORY;
768 
769 	check.SetVisited(0);
770 
771 	// Walk the free nodes
772 
773 	CachedNode cached(this);
774 	off_t freeOffset = fHeader.FreeNode();
775 	while (freeOffset > 0) {
776 		const bplustree_node* node;
777 		status_t status = cached.SetTo(freeOffset, &node, false);
778 		if (status != B_OK) {
779 			if (status == B_IO_ERROR)
780 				return B_IO_ERROR;
781 
782 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
783 				"not be read: %s\n", fStream->ID(), freeOffset,
784 				strerror(status));
785 			check.FoundError();
786 			break;
787 		}
788 
789 		if (check.Visited(freeOffset)) {
790 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
791 				" circular!\n", fStream->ID(), freeOffset);
792 			// TODO: if 'repair' is true, we could collect all unvisited nodes
793 			// at the end, and put the into the free list
794 			check.FoundError();
795 			break;
796 		}
797 
798 		check.SetVisited(freeOffset);
799 
800 		if (node->OverflowLink() != BPLUSTREE_FREE) {
801 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
802 				" misses free mark!\n", fStream->ID(), freeOffset);
803 		}
804 		freeOffset = node->LeftLink();
805 	}
806 
807 	// Iterate over the complete tree recursively
808 
809 	const bplustree_node* root;
810 	status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
811 	if (status != B_OK)
812 		return status;
813 
814 	status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
815 
816 	if (check.ErrorsFound())
817 		_errorsFound = true;
818 
819 	if (status != B_OK)
820 		return status;
821 
822 	if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
823 		dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
824 			"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
825 			fHeader.MaxNumberOfLevels());
826 	}
827 
828 	if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
829 		dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
830 			" nodes.\n", fStream->ID(), check.VisitedCount(),
831 			fHeader.MaximumSize() / fNodeSize);
832 	}
833 
834 	return B_OK;
835 }
836 
837 
838 status_t
839 BPlusTree::MakeEmpty()
840 {
841 	// Put all nodes into the free list in order
842 	Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
843 
844 	// Reset the header, and root node
845 	CachedNode cached(this);
846 	bplustree_header* header = cached.SetToWritableHeader(transaction);
847 	if (header == NULL)
848 		return B_IO_ERROR;
849 
850 	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
851 	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
852 	if (fStream->Size() > (off_t)NodeSize() * 2)
853 		header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
854 	else {
855 		header->free_node_pointer
856 			= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
857 	}
858 
859 	bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
860 	if (node == NULL)
861 		return B_IO_ERROR;
862 
863 	node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
864 	node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
865 	node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
866 	node->all_key_count = 0;
867 	node->all_key_length = 0;
868 
869 	for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
870 			offset += NodeSize()) {
871 		bplustree_node* node = cached.SetToWritable(transaction, offset, false);
872 		if (node == NULL) {
873 			dprintf("--> could not open %" B_PRIdOFF "\n", offset);
874 			return B_IO_ERROR;
875 		}
876 		if (offset < fStream->Size() - (off_t)NodeSize())
877 			node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
878 		else
879 			node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
880 
881 		node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
882 
883 		// It's not important to write it out in a single transaction; just
884 		// make sure it doesn't get too large
885 		if (offset % (1024 * 1024) == 0) {
886 			transaction.Done();
887 			transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
888 		}
889 	}
890 
891 	return transaction.Done();
892 }
893 
894 
895 int32
896 BPlusTree::TypeCodeToKeyType(type_code code)
897 {
898 	switch (code) {
899 		case B_STRING_TYPE:
900 			return BPLUSTREE_STRING_TYPE;
901 		case B_SSIZE_T_TYPE:
902 		case B_INT32_TYPE:
903 			return BPLUSTREE_INT32_TYPE;
904 		case B_SIZE_T_TYPE:
905 		case B_UINT32_TYPE:
906 			return BPLUSTREE_UINT32_TYPE;
907 		case B_OFF_T_TYPE:
908 		case B_INT64_TYPE:
909 			return BPLUSTREE_INT64_TYPE;
910 		case B_UINT64_TYPE:
911 			return BPLUSTREE_UINT64_TYPE;
912 		case B_FLOAT_TYPE:
913 			return BPLUSTREE_FLOAT_TYPE;
914 		case B_DOUBLE_TYPE:
915 			return BPLUSTREE_DOUBLE_TYPE;
916 	}
917 	return -1;
918 }
919 
920 
921 int32
922 BPlusTree::ModeToKeyType(mode_t mode)
923 {
924 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
925 			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
926 		case S_INT_INDEX:
927 			return BPLUSTREE_INT32_TYPE;
928 		case S_UINT_INDEX:
929 			return BPLUSTREE_UINT32_TYPE;
930 		case S_LONG_LONG_INDEX:
931 			return BPLUSTREE_INT64_TYPE;
932 		case S_ULONG_LONG_INDEX:
933 			return BPLUSTREE_UINT64_TYPE;
934 		case S_FLOAT_INDEX:
935 			return BPLUSTREE_FLOAT_TYPE;
936 		case S_DOUBLE_INDEX:
937 			return BPLUSTREE_DOUBLE_TYPE;
938 		case S_STR_INDEX:
939 		default:
940 			// default is for standard directories
941 			return BPLUSTREE_STRING_TYPE;
942 	}
943 }
944 #endif // !_BOOT_MODE
945 
946 
947 //	#pragma mark - TransactionListener implementation
948 
949 
950 #if !_BOOT_MODE
951 void
952 BPlusTree::TransactionDone(bool success)
953 {
954 	if (!success) {
955 		// update header from disk
956 		CachedNode cached(this);
957 		const bplustree_header* header = cached.SetToHeader();
958 		if (header != NULL)
959 			memcpy(&fHeader, header, sizeof(bplustree_header));
960 	}
961 }
962 
963 
964 void
965 BPlusTree::RemovedFromTransaction()
966 {
967 	fInTransaction = false;
968 
969 	if (!fStream->GetVolume()->IsInitializing())
970 		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
971 }
972 
973 
974 //	#pragma mark -
975 
976 
977 void
978 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
979 	uint16 splitAt, int8 change)
980 {
981 	// Although every iterator which is affected by this update currently
982 	// waits on a semaphore, other iterators could be added/removed at
983 	// any time, so we need to protect this loop
984 	MutexLocker _(fIteratorLock);
985 
986 	SinglyLinkedList<TreeIterator>::Iterator iterator
987 		= fIterators.GetIterator();
988 	while (iterator.HasNext())
989 		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
990 }
991 
992 
993 void
994 BPlusTree::_AddIterator(TreeIterator* iterator)
995 {
996 	MutexLocker _(fIteratorLock);
997 	fIterators.Add(iterator);
998 }
999 
1000 
1001 void
1002 BPlusTree::_RemoveIterator(TreeIterator* iterator)
1003 {
1004 	MutexLocker _(fIteratorLock);
1005 	fIterators.Remove(iterator);
1006 }
1007 #endif // !_BOOT_MODE
1008 
1009 
1010 int32
1011 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
1012 	int keyLength2)
1013 {
1014 	type_code type = 0;
1015 	switch (fHeader.DataType()) {
1016 	    case BPLUSTREE_STRING_TYPE:
1017 	    	type = B_STRING_TYPE;
1018 	    	break;
1019 		case BPLUSTREE_INT32_TYPE:
1020 	    	type = B_INT32_TYPE;
1021 	    	break;
1022 		case BPLUSTREE_UINT32_TYPE:
1023 	    	type = B_UINT32_TYPE;
1024 	    	break;
1025 		case BPLUSTREE_INT64_TYPE:
1026 	    	type = B_INT64_TYPE;
1027 	    	break;
1028 		case BPLUSTREE_UINT64_TYPE:
1029 	    	type = B_UINT64_TYPE;
1030 	    	break;
1031 		case BPLUSTREE_FLOAT_TYPE:
1032 	    	type = B_FLOAT_TYPE;
1033 	    	break;
1034 		case BPLUSTREE_DOUBLE_TYPE:
1035 	    	type = B_DOUBLE_TYPE;
1036 	    	break;
1037 	}
1038 	return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2);
1039 }
1040 
1041 
1042 status_t
1043 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
1044 	uint16 keyLength, uint16* _index, off_t* _next)
1045 {
1046 #ifdef DEBUG
1047 	NodeChecker checker(node, fNodeSize, "find");
1048 #endif
1049 
1050 	if (node->all_key_count == 0) {
1051 		if (_index)
1052 			*_index = 0;
1053 		if (_next)
1054 			*_next = node->OverflowLink();
1055 		return B_ENTRY_NOT_FOUND;
1056 	}
1057 
1058 	off_t* values = node->Values();
1059 	int16 saveIndex = -1;
1060 
1061 	// binary search in the key array
1062 	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
1063 		uint16 i = (first + last) >> 1;
1064 
1065 		uint16 searchLength;
1066 		uint8* searchKey = node->KeyAt(i, &searchLength);
1067 		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
1068 				> (uint8*)node + fNodeSize
1069 			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
1070 #if !_BOOT_MODE
1071 			fStream->GetVolume()->Panic();
1072 #endif
1073 			RETURN_ERROR(B_BAD_DATA);
1074 		}
1075 
1076 		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
1077 		if (cmp < 0) {
1078 			last = i - 1;
1079 			saveIndex = i;
1080 		} else if (cmp > 0) {
1081 			saveIndex = first = i + 1;
1082 		} else {
1083 			if (_index)
1084 				*_index = i;
1085 			if (_next)
1086 				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1087 			return B_OK;
1088 		}
1089 	}
1090 
1091 	if (_index)
1092 		*_index = saveIndex;
1093 	if (_next) {
1094 		if (saveIndex == node->NumKeys())
1095 			*_next = node->OverflowLink();
1096 		else
1097 			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
1098 	}
1099 	return B_ENTRY_NOT_FOUND;
1100 }
1101 
1102 
1103 #if !_BOOT_MODE
1104 /*!	Prepares the stack to contain all nodes that were passed while
1105 	following the key, from the root node to the leaf node that could
1106 	or should contain that key.
1107 */
1108 status_t
1109 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
1110 	uint16 keyLength)
1111 {
1112 	// set the root node to begin with
1113 	node_and_key nodeAndKey;
1114 	nodeAndKey.nodeOffset = fHeader.RootNode();
1115 
1116 	CachedNode cached(this);
1117 	const bplustree_node* node;
1118 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1119 		// if we are already on leaf level, we're done
1120 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1121 			// node that the keyIndex is not properly set here (but it's not
1122 			// needed in the calling functions anyway)!
1123 			nodeAndKey.keyIndex = 0;
1124 			stack.Push(nodeAndKey);
1125 			return B_OK;
1126 		}
1127 
1128 		off_t nextOffset;
1129 		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
1130 			&nextOffset);
1131 
1132 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
1133 			RETURN_ERROR(B_ERROR);
1134 
1135 		if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
1136 			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
1137 			break;
1138 		}
1139 
1140 		// put the node offset & the correct keyIndex on the stack
1141 		stack.Push(nodeAndKey);
1142 
1143 		nodeAndKey.nodeOffset = nextOffset;
1144 	}
1145 
1146 	FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
1147 		B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
1148 	return B_ERROR;
1149 }
1150 
1151 
1152 /*!	This will find a free duplicate fragment in the given bplustree_node.
1153 	The CachedNode will be set to the writable fragment on success.
1154 */
1155 status_t
1156 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
1157 	const bplustree_node* node, CachedNode& cached,
1158 	off_t* _offset, bplustree_node** _fragment, uint32* _index)
1159 {
1160 	off_t* values = node->Values();
1161 	for (int32 i = 0; i < node->NumKeys(); i++) {
1162 		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1163 
1164 		// does the value link to a duplicate fragment?
1165 		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
1166 			continue;
1167 
1168 		const bplustree_node* fragment = cached.SetTo(
1169 			bplustree_node::FragmentOffset(value), false);
1170 		if (fragment == NULL) {
1171 			FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
1172 				B_PRIdOFF "\n", value, fStream->ID()));
1173 			continue;
1174 		}
1175 
1176 		// see if there is some space left for us
1177 		uint32 num = bplustree_node::MaxFragments(fNodeSize);
1178 		for (uint32 j = 0; j < num; j++) {
1179 			duplicate_array* array = fragment->FragmentAt(j);
1180 
1181 			if (array->IsEmpty()) {
1182 				// found an unused fragment
1183 				*_fragment = cached.MakeWritable(transaction);
1184 				if (*_fragment == NULL)
1185 					return B_IO_ERROR;
1186 
1187 				*_offset = bplustree_node::FragmentOffset(value);
1188 				*_index = j;
1189 				return B_OK;
1190 			}
1191 		}
1192 	}
1193 	return B_ENTRY_NOT_FOUND;
1194 }
1195 
1196 
1197 status_t
1198 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
1199 	const bplustree_node* node, uint16 index, off_t value)
1200 {
1201 	CachedNode cachedDuplicate(this);
1202 	off_t* values = node->Values();
1203 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1204 	status_t status;
1205 	off_t offset;
1206 
1207 	if (bplustree_node::IsDuplicate(oldValue)) {
1208 		// If it's a duplicate fragment, try to insert it into that, or if it
1209 		// doesn't fit anymore, create a new duplicate node
1210 
1211 		if (bplustree_node::LinkType(oldValue)
1212 				== BPLUSTREE_DUPLICATE_FRAGMENT) {
1213 			bplustree_node* duplicate = cachedDuplicate.SetToWritable(
1214 				transaction, bplustree_node::FragmentOffset(oldValue), false);
1215 			if (duplicate == NULL)
1216 				return B_IO_ERROR;
1217 
1218 			duplicate_array* array = duplicate->FragmentAt(
1219 				bplustree_node::FragmentIndex(oldValue));
1220 			int32 arrayCount = array->Count();
1221 			if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
1222 				FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
1223 					"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1224 					(int)bplustree_node::FragmentIndex(oldValue),
1225 					bplustree_node::FragmentOffset(oldValue), arrayCount,
1226 					fStream->ID()));
1227 				return B_BAD_DATA;
1228 			}
1229 
1230 			if (arrayCount < NUM_FRAGMENT_VALUES) {
1231 				array->Insert(value);
1232 			} else {
1233 				// Test if the fragment will be empty if we remove this key's
1234 				// values
1235 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
1236 					// The node will be empty without our values, so let us
1237 					// reuse it as a duplicate node
1238 					offset = bplustree_node::FragmentOffset(oldValue);
1239 
1240 					memmove(duplicate->DuplicateArray(), array,
1241 						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1242 					duplicate->left_link = duplicate->right_link
1243 						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1244 
1245 					array = duplicate->DuplicateArray();
1246 					array->Insert(value);
1247 				} else {
1248 					// Create a new duplicate node
1249 
1250 					cachedDuplicate.UnsetUnchanged(transaction);
1251 						// The old duplicate has not been touched, so we can
1252 						// reuse it
1253 
1254 					bplustree_node* newDuplicate;
1255 					status = cachedDuplicate.Allocate(transaction,
1256 						&newDuplicate, &offset);
1257 					if (status != B_OK)
1258 						RETURN_ERROR(status);
1259 
1260 					// Copy the array from the fragment node to the duplicate
1261 					// node and free the old entry (by zero'ing all values)
1262 					newDuplicate->overflow_link = array->count;
1263 					memcpy(&newDuplicate->all_key_count, &array->values[0],
1264 						array->Count() * sizeof(off_t));
1265 					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1266 
1267 					array = newDuplicate->DuplicateArray();
1268 					array->Insert(value);
1269 				}
1270 
1271 				// Update the main pointer to link to a duplicate node
1272 				if (cached.MakeWritable(transaction) == NULL)
1273 					return B_IO_ERROR;
1274 
1275 				values[index]
1276 					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1277 						BPLUSTREE_DUPLICATE_NODE, offset));
1278 			}
1279 
1280 			return B_OK;
1281 		}
1282 
1283 		// Put the value into a dedicated duplicate node
1284 
1285 		// search for free space in the duplicate nodes of that key
1286 		duplicate_array* array;
1287 		int32 arrayCount;
1288 		const bplustree_node* duplicate;
1289 		off_t duplicateOffset;
1290 		do {
1291 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1292 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1293 			if (duplicate == NULL)
1294 				return B_IO_ERROR;
1295 
1296 			array = duplicate->DuplicateArray();
1297 			arrayCount =array->Count();
1298 			if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1299 				FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
1300 					B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1301 					duplicateOffset, arrayCount, fStream->ID()));
1302 				return B_BAD_DATA;
1303 			}
1304 		} while (arrayCount >= NUM_DUPLICATE_VALUES
1305 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
1306 
1307 		bplustree_node* writableDuplicate
1308 			= cachedDuplicate.MakeWritable(transaction);
1309 		if (writableDuplicate == NULL)
1310 			return B_IO_ERROR;
1311 
1312 		if (arrayCount < NUM_DUPLICATE_VALUES) {
1313 			array = writableDuplicate->DuplicateArray();
1314 			array->Insert(value);
1315 		} else {
1316 			// no space left - add a new duplicate node
1317 
1318 			bplustree_node* newDuplicate;
1319 			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
1320 				&offset);
1321 			if (status != B_OK)
1322 				RETURN_ERROR(status);
1323 
1324 			// link the two nodes together
1325 			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
1326 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
1327 
1328 			array = newDuplicate->DuplicateArray();
1329 			array->count = 0;
1330 			array->Insert(value);
1331 		}
1332 		return B_OK;
1333 	}
1334 
1335 	// Search for a free duplicate fragment or create a new one
1336 	// to insert the duplicate value into
1337 
1338 	uint32 fragmentIndex = 0;
1339 	bplustree_node* fragment;
1340 	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
1341 			&offset, &fragment, &fragmentIndex) != B_OK) {
1342 		// allocate a new duplicate fragment node
1343 		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
1344 		if (status != B_OK)
1345 			RETURN_ERROR(status);
1346 
1347 		memset(fragment, 0, fNodeSize);
1348 	}
1349 
1350 	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
1351 	array->Insert(oldValue);
1352 	array->Insert(value);
1353 
1354 	if (cached.MakeWritable(transaction) == NULL)
1355 		return B_IO_ERROR;
1356 
1357 	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1358 		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
1359 
1360 	return B_OK;
1361 }
1362 
1363 
1364 void
1365 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
1366 	uint16 keyLength, off_t value)
1367 {
1368 	// should never happen, but who knows?
1369 	if (index > node->NumKeys())
1370 		return;
1371 
1372 	off_t* values = node->Values();
1373 	uint16* keyLengths = node->KeyLengths();
1374 	uint8* keys = node->Keys();
1375 
1376 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
1377 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
1378 		+ keyLength);
1379 
1380 	off_t* newValues = node->Values();
1381 	uint16* newKeyLengths = node->KeyLengths();
1382 
1383 	// move values and copy new value into them
1384 	memmove(newValues + index + 1, values + index,
1385 		sizeof(off_t) * (node->NumKeys() - 1 - index));
1386 	memmove(newValues, values, sizeof(off_t) * index);
1387 
1388 	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
1389 
1390 	// move and update key length index
1391 	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
1392 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1393 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
1394 	}
1395 	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
1396 
1397 	int32 keyStart;
1398 	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
1399 		+ (keyStart = index > 0
1400 			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
1401 
1402 	// move keys and copy new key into them
1403 	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
1404 	int32 size = node->AllKeyLength() - length;
1405 	if (size > 0)
1406 		memmove(keys + length, keys + length - keyLength, size);
1407 
1408 	memcpy(keys + keyStart, key, keyLength);
1409 }
1410 
1411 
1412 /*!	Splits the \a node into two halves - the other half will be put into
1413 	\a other. It also takes care to create a new overflow link if the node
1414 	to split is an index node.
1415 */
1416 status_t
1417 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
1418 	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
1419 	uint16* _keyLength, off_t* _value)
1420 {
1421 	if (*_keyIndex > node->NumKeys() + 1)
1422 		return B_BAD_VALUE;
1423 
1424 	uint16* inKeyLengths = node->KeyLengths();
1425 	off_t* inKeyValues = node->Values();
1426 	uint8* inKeys = node->Keys();
1427 	uint8* outKeys = other->Keys();
1428 	int32 keyIndex = *_keyIndex;	// can become less than zero!
1429 
1430 	if (keyIndex > node->NumKeys()) {
1431 		FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
1432 			"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
1433 		return B_BAD_VALUE;
1434 	}
1435 
1436 	// how many keys will fit in one (half) page?
1437 	// that loop will find the answer to this question and
1438 	// change the key lengths indices for their new home
1439 
1440 	// "bytes" is the number of bytes written for the new key,
1441 	// "bytesBefore" are the bytes before that key
1442 	// "bytesAfter" are the bytes after the new key, if any
1443 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1444 
1445 	size_t size = fNodeSize >> 1;
1446 	int32 out, in;
1447 	for (in = out = 0; in < node->NumKeys() + 1;) {
1448 		if (!bytes) {
1449 			bytesBefore = in > 0
1450 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1451 		}
1452 
1453 		if (in == keyIndex && !bytes) {
1454 			bytes = *_keyLength;
1455 		} else {
1456 			if (keyIndex < out) {
1457 				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1458 					- bytesBefore;
1459 			}
1460 
1461 			in++;
1462 		}
1463 		out++;
1464 
1465 		if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
1466 				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1467 			// we have found the number of keys in the new node!
1468 			break;
1469 		}
1470 	}
1471 
1472 	// if the new key was not inserted, set the length of the keys
1473 	// that can be copied directly
1474 	if (keyIndex >= out && in > 0)
1475 		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1476 
1477 	if (bytesBefore < 0 || bytesAfter < 0)
1478 		return B_BAD_DATA;
1479 
1480 	other->left_link = node->left_link;
1481 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1482 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1483 		+ bytesAfter);
1484 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1485 
1486 	uint16* outKeyLengths = other->KeyLengths();
1487 	off_t* outKeyValues = other->Values();
1488 	int32 keys = out > keyIndex ? keyIndex : out;
1489 
1490 	if (bytesBefore) {
1491 		// copy the keys
1492 		memcpy(outKeys, inKeys, bytesBefore);
1493 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1494 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1495 	}
1496 	if (bytes) {
1497 		// copy the newly inserted key
1498 		memcpy(outKeys + bytesBefore, key, bytes);
1499 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1500 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1501 
1502 		if (bytesAfter) {
1503 			// copy the keys after the new key
1504 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1505 				bytesAfter);
1506 			keys = out - keyIndex - 1;
1507 			for (int32 i = 0;i < keys;i++) {
1508 				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1509 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
1510 						+ bytes);
1511 			}
1512 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1513 				keys * sizeof(off_t));
1514 		}
1515 	}
1516 
1517 	// if the new key was already inserted, we shouldn't use it again
1518 	if (in != out)
1519 		keyIndex--;
1520 
1521 	int32 total = bytesBefore + bytesAfter;
1522 
1523 	// these variables are for the key that will be returned
1524 	// to the parent node
1525 	uint8* newKey = NULL;
1526 	uint16 newLength;
1527 	bool newAllocated = false;
1528 
1529 	// If we have split an index node, we have to drop the first key
1530 	// of the next node (which can also be the new key to insert).
1531 	// The dropped key is also the one which has to be inserted in
1532 	// the parent node, so we will set the "newKey" already here.
1533 	if (node->OverflowLink() != BPLUSTREE_NULL) {
1534 		if (in == keyIndex) {
1535 			newKey = key;
1536 			newLength = *_keyLength;
1537 
1538 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1539 			keyIndex--;
1540 		} else {
1541 			// If a key is dropped (is not the new key), we have to copy
1542 			// it, because it would be lost if not.
1543 			uint8* droppedKey = node->KeyAt(in, &newLength);
1544 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1545 					> (uint8*)node + fNodeSize
1546 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1547 				fStream->GetVolume()->Panic();
1548 				RETURN_ERROR(B_BAD_DATA);
1549 			}
1550 			newKey = (uint8*)malloc(newLength);
1551 			if (newKey == NULL)
1552 				return B_NO_MEMORY;
1553 
1554 			newAllocated = true;
1555 			memcpy(newKey, droppedKey, newLength);
1556 
1557 			other->overflow_link = inKeyValues[in];
1558 			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1559 		}
1560 	}
1561 
1562 	// and now the same game for the other page and the rest of the keys
1563 	// (but with memmove() instead of memcpy(), because they may overlap)
1564 
1565 	bytesBefore = bytesAfter = bytes = 0;
1566 	out = 0;
1567 	int32 skip = in;
1568 	while (in < node->NumKeys() + 1) {
1569 		if (in == keyIndex && !bytes) {
1570 			// it's enough to set bytesBefore once here, because we do
1571 			// not need to know the exact length of all keys in this
1572 			// loop
1573 			bytesBefore = in > skip
1574 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1575 			bytes = *_keyLength;
1576 			out++;
1577 		} else {
1578 			if (in < node->NumKeys()) {
1579 				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1580 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1581 
1582 				if (bytes) {
1583 					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1584 						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1585 
1586 					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1587 						- bytesBefore - bytes;
1588 				}
1589 				out++;
1590 			}
1591 			in++;
1592 		}
1593 	}
1594 
1595 	// adjust the byte counts (since we were a bit lazy in the loop)
1596 	if (keyIndex < skip)
1597 		bytesBefore = node->AllKeyLength() - total;
1598 
1599 	if (bytesBefore < 0 || bytesAfter < 0) {
1600 		if (newAllocated)
1601 			free(newKey);
1602 		return B_BAD_DATA;
1603 	}
1604 
1605 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1606 		// right link, and overflow link can stay the same
1607 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1608 		+ bytesAfter);
1609 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1610 
1611 	// array positions have changed
1612 	outKeyLengths = node->KeyLengths();
1613 	outKeyValues = node->Values();
1614 
1615 	// move the keys in the old node: the order is important here,
1616 	// because we don't want to overwrite any contents
1617 
1618 	keys = keyIndex <= skip ? out : keyIndex - skip;
1619 	keyIndex -= skip;
1620 	in = out - keyIndex - 1;
1621 		// Note: keyIndex and in will contain invalid values when the new key
1622 		// went to the other node. But in this case bytes and bytesAfter are
1623 		// 0 and subsequently we never use keyIndex and in.
1624 
1625 	if (bytesBefore)
1626 		memmove(inKeys, inKeys + total, bytesBefore);
1627 	if (bytesAfter) {
1628 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1629 			bytesAfter);
1630 	}
1631 
1632 	if (bytesBefore)
1633 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1634 	if (bytesAfter) {
1635 		// if byteAfter is > 0, keyIndex is larger than skip
1636 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1637 			in * sizeof(uint16));
1638 	}
1639 
1640 	if (bytesBefore)
1641 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1642 	if (bytesAfter) {
1643 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1644 			in * sizeof(off_t));
1645 	}
1646 
1647 	if (bytes) {
1648 		// finally, copy the newly inserted key (don't overwrite anything)
1649 		memcpy(inKeys + bytesBefore, key, bytes);
1650 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1651 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1652 	}
1653 
1654 	// Prepare the key that will be inserted in the parent node which
1655 	// is either the dropped key or the last of the other node.
1656 	// If it's the dropped key, "newKey" was already set earlier.
1657 
1658 	if (newKey == NULL)
1659 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1660 
1661 	memcpy(key, newKey, newLength);
1662 	*_keyLength = newLength;
1663 	*_value = otherOffset;
1664 
1665 	if (newAllocated)
1666 		free(newKey);
1667 
1668 	return B_OK;
1669 }
1670 
1671 
1672 /*!	This inserts a key into the tree. The changes made to the tree will
1673 	all be part of the \a transaction.
1674 	You need to have the inode write locked.
1675 */
1676 status_t
1677 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
1678 	off_t value)
1679 {
1680 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1681 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1682 		RETURN_ERROR(B_BAD_VALUE);
1683 #ifdef DEBUG
1684 	if (value < 0)
1685 		panic("tried to insert invalid value %" B_PRId64 "!\n", value);
1686 #endif
1687 
1688 	ASSERT_WRITE_LOCKED_INODE(fStream);
1689 
1690 	Stack<node_and_key> stack;
1691 	if (_SeekDown(stack, key, keyLength) != B_OK)
1692 		RETURN_ERROR(B_ERROR);
1693 
1694 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
1695 
1696 	memcpy(keyBuffer, key, keyLength);
1697 	keyBuffer[keyLength] = 0;
1698 
1699 	node_and_key nodeAndKey;
1700 	const bplustree_node* node;
1701 
1702 	CachedNode cached(this);
1703 	while (stack.Pop(&nodeAndKey)
1704 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1705 #ifdef DEBUG
1706 		NodeChecker checker(node, fNodeSize, "insert");
1707 #endif
1708 		if (node->IsLeaf()) {
1709 			// first round, check for duplicate entries
1710 			status_t status = _FindKey(node, key, keyLength,
1711 				&nodeAndKey.keyIndex);
1712 
1713 			// is this a duplicate entry?
1714 			if (status == B_OK) {
1715 				if (fAllowDuplicates) {
1716 					status = _InsertDuplicate(transaction, cached, node,
1717 						nodeAndKey.keyIndex, value);
1718 					if (status != B_OK)
1719 						RETURN_ERROR(status);
1720 					return B_OK;
1721 				}
1722 
1723 				return B_NAME_IN_USE;
1724 			}
1725 		}
1726 
1727 		bplustree_node* writableNode = cached.MakeWritable(transaction);
1728 		if (writableNode == NULL)
1729 			return B_IO_ERROR;
1730 
1731 		// is the node big enough to hold the pair?
1732 		if (int32(key_align(sizeof(bplustree_node)
1733 				+ writableNode->AllKeyLength() + keyLength)
1734 				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
1735 				+ sizeof(off_t))) < fNodeSize) {
1736 			_InsertKey(writableNode, nodeAndKey.keyIndex,
1737 				keyBuffer, keyLength, value);
1738 			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1739 				nodeAndKey.keyIndex, 0, 1);
1740 
1741 			return B_OK;
1742 		} else {
1743 			CachedNode cachedNewRoot(this);
1744 			CachedNode cachedOther(this);
1745 
1746 			// do we need to allocate a new root node? if so, then do
1747 			// it now
1748 			off_t newRoot = BPLUSTREE_NULL;
1749 			if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
1750 				bplustree_node* root;
1751 				status_t status = cachedNewRoot.Allocate(transaction, &root,
1752 					&newRoot);
1753 				if (status != B_OK) {
1754 					// The tree is most likely corrupted!
1755 					// But it's still sane at leaf level - we could set
1756 					// a flag in the header that forces the tree to be
1757 					// rebuild next time...
1758 					// But since we will have journaling, that's not a big
1759 					// problem anyway.
1760 					RETURN_ERROR(status);
1761 				}
1762 			}
1763 
1764 			// reserve space for the other node
1765 			bplustree_node* other;
1766 			off_t otherOffset;
1767 			status_t status = cachedOther.Allocate(transaction, &other,
1768 				&otherOffset);
1769 			if (status != B_OK) {
1770 				cachedNewRoot.Free(transaction, newRoot);
1771 				RETURN_ERROR(status);
1772 			}
1773 
1774 			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1775 					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1776 					&value) != B_OK) {
1777 				// free root node & other node here
1778 				cachedOther.Free(transaction, otherOffset);
1779 				cachedNewRoot.Free(transaction, newRoot);
1780 
1781 				RETURN_ERROR(B_ERROR);
1782 			}
1783 #ifdef DEBUG
1784 			checker.Check("insert split");
1785 			NodeChecker otherChecker(other, fNodeSize, "insert split other");
1786 #endif
1787 
1788 			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1789 				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1790 
1791 			// update the right link of the node in the left of the new node
1792 			if ((other = cachedOther.SetToWritable(transaction,
1793 					other->LeftLink())) != NULL) {
1794 				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1795 			}
1796 
1797 			// create a new root if necessary
1798 			if (newRoot != BPLUSTREE_NULL) {
1799 				bplustree_node* root = cachedNewRoot.Node();
1800 
1801 				_InsertKey(root, 0, keyBuffer, keyLength,
1802 					writableNode->LeftLink());
1803 				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1804 					nodeAndKey.nodeOffset);
1805 
1806 				CachedNode cached(this);
1807 				bplustree_header* header
1808 					= cached.SetToWritableHeader(transaction);
1809 				if (header == NULL)
1810 					return B_IO_ERROR;
1811 
1812 				// finally, update header to point to the new root
1813 				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1814 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1815 					header->MaxNumberOfLevels() + 1);
1816 
1817 				return B_OK;
1818 			}
1819 		}
1820 	}
1821 	RETURN_ERROR(B_ERROR);
1822 }
1823 
1824 
1825 /*!	Removes the duplicate index/value pair from the tree.
1826 	It's part of the private tree interface.
1827 */
1828 status_t
1829 BPlusTree::_RemoveDuplicate(Transaction& transaction,
1830 	const bplustree_node* node, CachedNode& cached, uint16 index,
1831 	off_t value)
1832 {
1833 	off_t* values = node->Values();
1834 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1835 
1836 	CachedNode cachedDuplicate(this);
1837 	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1838 	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
1839 		duplicateOffset, false);
1840 	if (duplicate == NULL)
1841 		return B_IO_ERROR;
1842 
1843 	// if it's a duplicate fragment, remove the entry from there
1844 	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1845 		duplicate_array* array = duplicate->FragmentAt(
1846 			bplustree_node::FragmentIndex(oldValue));
1847 		int32 arrayCount = array->Count();
1848 
1849 		if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
1850 			FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
1851 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1852 				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
1853 				arrayCount, fStream->ID()));
1854 			return B_BAD_DATA;
1855 		}
1856 		if (!array->Remove(value)) {
1857 			FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
1858 				"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
1859 				duplicateOffset, fStream->ID()));
1860 			return B_ENTRY_NOT_FOUND;
1861 		}
1862 
1863 		// remove the array from the fragment node if it is empty
1864 		if (--arrayCount == 1) {
1865 			// set the link to the remaining value
1866 			if (cached.MakeWritable(transaction) == NULL)
1867 				return B_IO_ERROR;
1868 
1869 			values[index] = array->values[0];
1870 
1871 			// Remove the whole fragment node, if this was the only array,
1872 			// otherwise free just the array
1873 			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1874 				status_t status = cachedDuplicate.Free(transaction,
1875 					duplicateOffset);
1876 				if (status != B_OK)
1877 					return status;
1878 			} else
1879 				array->count = 0;
1880 		}
1881 		return B_OK;
1882 	}
1883 
1884 	// Remove value from a duplicate node!
1885 
1886 	duplicate_array* array = NULL;
1887 	int32 arrayCount = 0;
1888 
1889 	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1890 		FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
1891 			", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
1892 		return B_BAD_DATA;
1893 	}
1894 
1895 	// Search the duplicate nodes until the entry could be found (and removed)
1896 	while (duplicate != NULL) {
1897 		array = duplicate->DuplicateArray();
1898 		arrayCount = array->Count();
1899 
1900 		if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1901 			FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
1902 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1903 				duplicateOffset, arrayCount, fStream->ID()));
1904 			return B_BAD_DATA;
1905 		}
1906 
1907 		if (array->Remove(value)) {
1908 			arrayCount--;
1909 			break;
1910 		}
1911 
1912 		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1913 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1914 
1915 		cachedDuplicate.UnsetUnchanged(transaction);
1916 		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
1917 			false);
1918 	}
1919 	if (duplicate == NULL)
1920 		RETURN_ERROR(B_IO_ERROR);
1921 
1922 	// The entry got removed from the duplicate node, but we might want to free
1923 	// it now in case it's empty
1924 
1925 	while (true) {
1926 		off_t left = duplicate->LeftLink();
1927 		off_t right = duplicate->RightLink();
1928 		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1929 
1930 		if ((isLast && arrayCount == 1) || arrayCount == 0) {
1931 			// Free empty duplicate page, link their siblings together, and
1932 			// update the duplicate link if needed (ie. when we either remove
1933 			// the last duplicate node or have a new first one)
1934 
1935 			if (left == BPLUSTREE_NULL) {
1936 				// the duplicate link points to us
1937 				if (cached.MakeWritable(transaction) == NULL)
1938 					return B_IO_ERROR;
1939 
1940 				if (arrayCount == 1) {
1941 					// This is the last node, and there is only one value left;
1942 					// replace the duplicate link with that value, it's no
1943 					// duplicate anymore
1944 					values[index] = array->values[0];
1945 				} else {
1946 					// Move the duplicate link to the next node
1947 					values[index] = HOST_ENDIAN_TO_BFS_INT64(
1948 						bplustree_node::MakeLink(
1949 							BPLUSTREE_DUPLICATE_NODE, right));
1950 				}
1951 			}
1952 
1953 			status_t status = cachedDuplicate.Free(transaction,
1954 				duplicateOffset);
1955 			if (status != B_OK)
1956 				return status;
1957 
1958 			if (left != BPLUSTREE_NULL
1959 				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
1960 						false)) != NULL) {
1961 				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1962 
1963 				// If the next node is the last node, we need to free that node
1964 				// and convert the duplicate entry back into a normal entry
1965 				array = duplicate->DuplicateArray();
1966 				arrayCount = array->Count();
1967 				if (right == BPLUSTREE_NULL
1968 					&& duplicate->LeftLink() == BPLUSTREE_NULL
1969 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1970 					duplicateOffset = left;
1971 					continue;
1972 				}
1973 			}
1974 			if (right != BPLUSTREE_NULL
1975 				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
1976 						right, false)) != NULL) {
1977 				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1978 
1979 				// Again, we may need to turn the duplicate entry back into a
1980 				// normal entry
1981 				array = duplicate->DuplicateArray();
1982 				arrayCount = array->Count();
1983 				if (left == BPLUSTREE_NULL
1984 					&& duplicate->RightLink() == BPLUSTREE_NULL
1985 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1986 					duplicateOffset = right;
1987 					continue;
1988 				}
1989 			}
1990 			return B_OK;
1991 		}
1992 		if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
1993 			// If the number of entries fits in a duplicate fragment, then
1994 			// either find a free fragment node, or convert this node to a
1995 			// fragment node.
1996 			CachedNode cachedOther(this);
1997 
1998 			bplustree_node* fragment = NULL;
1999 			uint32 fragmentIndex = 0;
2000 			off_t offset;
2001 			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
2002 					&offset, &fragment, &fragmentIndex) == B_OK) {
2003 				// move to other node
2004 				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
2005 				memcpy(target, array,
2006 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2007 
2008 				cachedDuplicate.Free(transaction, duplicateOffset);
2009 				duplicateOffset = offset;
2010 			} else {
2011 				// convert node
2012 				memmove(duplicate, array,
2013 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2014 				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
2015 					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2016 			}
2017 
2018 			if (cached.MakeWritable(transaction) == NULL)
2019 				return B_IO_ERROR;
2020 
2021 			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
2022 				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
2023 		}
2024 		return B_OK;
2025 	}
2026 }
2027 
2028 
2029 /*!	Removes the key with the given index from the specified node.
2030 	Since it has to get the key from the node anyway (to obtain it's
2031 	pointer), it's not needed to pass the key & its length, although
2032 	the calling method (BPlusTree::Remove()) have this data.
2033 */
2034 void
2035 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
2036 {
2037 	// should never happen, but who knows?
2038 	if (index > node->NumKeys() && node->NumKeys() > 0) {
2039 		FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF
2040 			"\n", index, fStream->ID()));
2041 		return;
2042 	}
2043 
2044 	off_t* values = node->Values();
2045 
2046 	// if we would have to drop the overflow link, drop
2047 	// the last key instead and update the overflow link
2048 	// to the value of that one
2049 	if (!node->IsLeaf() && index == node->NumKeys())
2050 		node->overflow_link = values[--index];
2051 
2052 	uint16 length;
2053 	uint8* key = node->KeyAt(index, &length);
2054 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
2055 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2056 		FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length,
2057 			fStream->ID()));
2058 		fStream->GetVolume()->Panic();
2059 		return;
2060 	}
2061 
2062 	uint16* keyLengths = node->KeyLengths();
2063 	uint8* keys = node->Keys();
2064 
2065 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
2066 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
2067 		node->AllKeyLength() - length);
2068 
2069 	off_t* newValues = node->Values();
2070 	uint16* newKeyLengths = node->KeyLengths();
2071 
2072 	// move key data
2073 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
2074 
2075 	// move and update key lengths
2076 	if (index > 0 && newKeyLengths != keyLengths)
2077 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
2078 	for (uint16 i = index; i < node->NumKeys(); i++) {
2079 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
2080 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
2081 	}
2082 
2083 	// move values
2084 	if (index > 0)
2085 		memmove(newValues, values, index * sizeof(off_t));
2086 	if (node->NumKeys() > index) {
2087 		memmove(newValues + index, values + index + 1,
2088 			(node->NumKeys() - index) * sizeof(off_t));
2089 	}
2090 }
2091 
2092 
2093 /*!	Removes the specified key from the tree. The "value" parameter is only used
2094 	for trees which allow duplicates, so you may safely ignore it.
2095 	It's not an optional parameter, so at least you have to think about it.
2096 	You need to have the inode write locked.
2097 */
2098 status_t
2099 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
2100 	off_t value)
2101 {
2102 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2103 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2104 		RETURN_ERROR(B_BAD_VALUE);
2105 
2106 	ASSERT_WRITE_LOCKED_INODE(fStream);
2107 
2108 	Stack<node_and_key> stack;
2109 	if (_SeekDown(stack, key, keyLength) != B_OK)
2110 		RETURN_ERROR(B_ERROR);
2111 
2112 	node_and_key nodeAndKey;
2113 	const bplustree_node* node;
2114 
2115 	CachedNode cached(this);
2116 	while (stack.Pop(&nodeAndKey)
2117 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
2118 #ifdef DEBUG
2119 		NodeChecker checker(node, fNodeSize, "remove");
2120 #endif
2121 		if (node->IsLeaf()) {
2122 			// first round, check for duplicate entries
2123 			status_t status = _FindKey(node, key, keyLength,
2124 				&nodeAndKey.keyIndex);
2125 			if (status != B_OK)
2126 				RETURN_ERROR(status);
2127 
2128 			// Is this a duplicate entry?
2129 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
2130 					node->Values()[nodeAndKey.keyIndex]))) {
2131 				if (fAllowDuplicates) {
2132 					return _RemoveDuplicate(transaction, node, cached,
2133 						nodeAndKey.keyIndex, value);
2134 				}
2135 
2136 				FATAL(("dupliate node found where no duplicates are "
2137 					"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
2138 				RETURN_ERROR(B_ERROR);
2139 			} else {
2140 				if (node->Values()[nodeAndKey.keyIndex] != value)
2141 					return B_ENTRY_NOT_FOUND;
2142 
2143 				// If we will remove the last key, the iterator will be set
2144 				// to the next node after the current - if there aren't any
2145 				// more nodes, we need a way to prevent the TreeIterators to
2146 				// touch the old node again, we use BPLUSTREE_FREE for this
2147 				off_t next = node->RightLink() == BPLUSTREE_NULL
2148 					? BPLUSTREE_FREE : node->RightLink();
2149 				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
2150 					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
2151 			}
2152 		}
2153 
2154 		bplustree_node* writableNode = cached.MakeWritable(transaction);
2155 		if (writableNode == NULL)
2156 			return B_IO_ERROR;
2157 
2158 		// if it's an empty root node, we have to convert it
2159 		// to a leaf node by dropping the overflow link, or,
2160 		// if it's already a leaf node, just empty it
2161 		if (nodeAndKey.nodeOffset == fHeader.RootNode()
2162 			&& (node->NumKeys() == 0
2163 				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
2164 			writableNode->overflow_link
2165 				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2166 			writableNode->all_key_count = 0;
2167 			writableNode->all_key_length = 0;
2168 
2169 			// if we've made a leaf node out of the root node, we need
2170 			// to reset the maximum number of levels in the header
2171 			if (fHeader.MaxNumberOfLevels() != 1) {
2172 				CachedNode cached(this);
2173 				bplustree_header* header
2174 					= cached.SetToWritableHeader(transaction);
2175 				if (header == NULL)
2176 					return B_IO_ERROR;
2177 
2178 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
2179 			}
2180 			return B_OK;
2181 		}
2182 
2183 		// if there is only one key left, we don't have to remove
2184 		// it, we can just dump the node (index nodes still have
2185 		// the overflow link, so we have to drop the last key)
2186 		if (writableNode->NumKeys() > 1
2187 			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
2188 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
2189 			return B_OK;
2190 		}
2191 
2192 		// when we are here, we can just free the node, but
2193 		// we have to update the right/left link of the
2194 		// siblings first
2195 		CachedNode otherCached(this);
2196 		bplustree_node* other = otherCached.SetToWritable(transaction,
2197 			writableNode->LeftLink());
2198 		if (other != NULL)
2199 			other->right_link = writableNode->right_link;
2200 
2201 		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
2202 				!= NULL) {
2203 			other->left_link = writableNode->left_link;
2204 		}
2205 
2206 		cached.Free(transaction, nodeAndKey.nodeOffset);
2207 	}
2208 	RETURN_ERROR(B_ERROR);
2209 }
2210 
2211 
2212 /*!	Replaces the value for the key in the tree.
2213 	Returns B_OK if the key could be found and its value replaced,
2214 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
2215 	to indicate that something went terribly wrong.
2216 	Note that this doesn't work with duplicates - it will just
2217 	return B_BAD_TYPE if you call this function on a tree where
2218 	duplicates are allowed.
2219 	You need to have the inode write locked.
2220 */
2221 status_t
2222 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
2223 	off_t value)
2224 {
2225 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2226 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2227 		|| key == NULL)
2228 		RETURN_ERROR(B_BAD_VALUE);
2229 
2230 	if (fAllowDuplicates)
2231 		RETURN_ERROR(B_BAD_TYPE);
2232 
2233 	ASSERT_WRITE_LOCKED_INODE(fStream);
2234 
2235 	off_t nodeOffset = fHeader.RootNode();
2236 	CachedNode cached(this);
2237 	const bplustree_node* node;
2238 
2239 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2240 		uint16 keyIndex = 0;
2241 		off_t nextOffset;
2242 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2243 			&nextOffset);
2244 
2245 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2246 			if (status == B_OK) {
2247 				bplustree_node* writableNode = cached.MakeWritable(transaction);
2248 				if (writableNode != NULL) {
2249 					writableNode->Values()[keyIndex]
2250 						= HOST_ENDIAN_TO_BFS_INT64(value);
2251 				} else
2252 					status = B_IO_ERROR;
2253 			}
2254 
2255 			return status;
2256 		} else if (nextOffset == nodeOffset)
2257 			RETURN_ERROR(B_ERROR);
2258 
2259 		nodeOffset = nextOffset;
2260 	}
2261 	RETURN_ERROR(B_ERROR);
2262 }
2263 #endif // !_BOOT_MODE
2264 
2265 
2266 /*!	Searches the key in the tree, and stores the offset found in _value,
2267 	if successful.
2268 	It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
2269 	while it descends the tree.
2270 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
2271 	It can also return other errors to indicate that something went wrong.
2272 	Note that this doesn't work with duplicates - it will just return
2273 	B_BAD_TYPE if you call this function on a tree where duplicates are
2274 	allowed.
2275 	You need to have the inode read or write locked.
2276 */
2277 status_t
2278 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
2279 {
2280 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2281 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2282 		|| key == NULL)
2283 		RETURN_ERROR(B_BAD_VALUE);
2284 
2285 	if (fAllowDuplicates)
2286 		RETURN_ERROR(B_BAD_TYPE);
2287 
2288 #if !_BOOT_MODE
2289 	ASSERT_READ_LOCKED_INODE(fStream);
2290 #endif
2291 
2292 	off_t nodeOffset = fHeader.RootNode();
2293 	CachedNode cached(this);
2294 	const bplustree_node* node;
2295 
2296 #ifdef DEBUG
2297 	int32 levels = 0;
2298 #endif
2299 
2300 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2301 		uint16 keyIndex = 0;
2302 		off_t nextOffset;
2303 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2304 			&nextOffset);
2305 
2306 #ifdef DEBUG
2307 		levels++;
2308 #endif
2309 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2310 			if (status == B_OK && _value != NULL)
2311 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
2312 
2313 #ifdef DEBUG
2314 			if (levels != (int32)fHeader.MaxNumberOfLevels())
2315 				DEBUGGER(("levels don't match"));
2316 #endif
2317 			return status;
2318 		} else if (nextOffset == nodeOffset)
2319 			RETURN_ERROR(B_ERROR);
2320 
2321 		nodeOffset = nextOffset;
2322 	}
2323 	FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
2324 		B_PRIdOFF "\n", nodeOffset, fStream->ID()));
2325 	RETURN_ERROR(B_ERROR);
2326 }
2327 
2328 
2329 #if !_BOOT_MODE
2330 status_t
2331 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
2332 	const uint8* largestKey, uint16 largestKeyLength,
2333 	const bplustree_node* parent)
2334 {
2335 	if (parent->CheckIntegrity(fNodeSize) != B_OK) {
2336 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
2337 			"failed!\n", fStream->ID(), offset);
2338 		check.FoundError();
2339 		return B_OK;
2340 	}
2341 	if (level >= fHeader.MaxNumberOfLevels()) {
2342 		dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
2343 			"!\n", fStream->ID(), offset);
2344 		check.FoundError();
2345 		return B_OK;
2346 	}
2347 
2348 	check.SetLevel(level);
2349 
2350 	if (check.Visited(offset)) {
2351 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
2352 			fStream->ID(), offset);
2353 		check.FoundError();
2354 		return B_OK;
2355 	}
2356 
2357 	check.SetVisited(offset);
2358 
2359 	uint32 count = parent->NumKeys();
2360 	off_t* values = parent->Values();
2361 	off_t lastOffset = check.PreviousOffset(level);
2362 	CachedNode cached(this);
2363 
2364 	for (uint32 i = 0; i < count; i++) {
2365 		uint16 keyLength;
2366 		uint8* key = parent->KeyAt(i, &keyLength);
2367 		if (largestKey != NULL) {
2368 			int result = _CompareKeys(key, keyLength, largestKey,
2369 				largestKeyLength);
2370 			if (result >= 0) {
2371 				dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
2372 					B_PRIu32 " larger than it should!\n",
2373 					fStream->ID(), offset, i);
2374 				check.FoundError();
2375 			}
2376 		}
2377 
2378 		off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
2379 		if (bplustree_node::IsDuplicate(childOffset)) {
2380 			// Walk the duplicate nodes
2381 			off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
2382 			off_t lastDuplicateOffset = BPLUSTREE_NULL;
2383 
2384 			while (duplicateOffset != BPLUSTREE_NULL) {
2385 				const bplustree_node* node;
2386 				status_t status = cached.SetTo(duplicateOffset, &node, false);
2387 				if (status != B_OK) {
2388 					if (status == B_IO_ERROR)
2389 						return B_IO_ERROR;
2390 
2391 					dprintf("inode %" B_PRIdOFF ": duplicate node at %"
2392 						B_PRIdOFF " could not be read: %s\n", fStream->ID(),
2393 						duplicateOffset, strerror(status));
2394 					check.FoundError();
2395 					break;
2396 				}
2397 
2398 				bool isFragmentNode = bplustree_node::LinkType(childOffset)
2399 					== BPLUSTREE_DUPLICATE_FRAGMENT;
2400 				bool isKnownFragment = isFragmentNode
2401 					&& check.VisitedFragment(duplicateOffset);
2402 
2403 				if (!isKnownFragment && check.Visited(duplicateOffset)) {
2404 					dprintf("inode %" B_PRIdOFF ": %s node at %"
2405 						B_PRIdOFF " already visited, referenced from %"
2406 						B_PRIdOFF "!\n", fStream->ID(),
2407 						isFragmentNode ? "fragment" : "duplicate",
2408 						duplicateOffset, offset);
2409 					check.FoundError();
2410 					break;
2411 				}
2412 
2413 				// Fragment nodes may be visited more than once from different
2414 				// places
2415 				if (!check.Visited(duplicateOffset))
2416 					check.SetVisited(duplicateOffset);
2417 				if (!isKnownFragment && isFragmentNode)
2418 					check.SetVisitedFragment(duplicateOffset);
2419 
2420 				duplicate_array* array;
2421 				int32 minSize;
2422 				int32 maxSize;
2423 				if (isFragmentNode) {
2424 					array = node->FragmentAt(
2425 						bplustree_node::FragmentIndex(childOffset));
2426 					minSize = 2;
2427 					maxSize = NUM_FRAGMENT_VALUES;
2428 				} else {
2429 					array = node->DuplicateArray();
2430 					minSize = 1;
2431 					maxSize = NUM_DUPLICATE_VALUES;
2432 				}
2433 				int32 arrayCount = array->Count();
2434 
2435 				if (arrayCount < minSize || arrayCount > maxSize) {
2436 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2437 						" has invalid array size %" B_PRId32 "!\n",
2438 						fStream->ID(), duplicateOffset, arrayCount);
2439 					check.FoundError();
2440 				} else {
2441 					// Simple check if the values in the array may be valid
2442 					for (int32 j = 0; j < arrayCount; j++) {
2443 						if (!fStream->GetVolume()->IsValidInodeBlock(
2444 								array->ValueAt(j))) {
2445 							dprintf("inode %" B_PRIdOFF ": duplicate at %"
2446 								B_PRIdOFF " contains invalid block %" B_PRIdOFF
2447 								" at %" B_PRId32 "!\n", fStream->ID(),
2448 								duplicateOffset, array->ValueAt(j), j);
2449 							check.FoundError();
2450 							break;
2451 						}
2452 					}
2453 				}
2454 
2455 				// A fragment node is not linked (and does not have valid links)
2456 				if (isFragmentNode)
2457 					break;
2458 
2459 				if (node->LeftLink() != lastDuplicateOffset) {
2460 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2461 						" has wrong left link %" B_PRIdOFF ", expected %"
2462 						B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
2463 						node->LeftLink(), lastDuplicateOffset);
2464 					check.FoundError();
2465 				}
2466 
2467 				lastDuplicateOffset = duplicateOffset;
2468 				duplicateOffset = node->RightLink();
2469 			}
2470 		} else if (!parent->IsLeaf()) {
2471 			// Test a regular child node recursively
2472 			off_t nextOffset = parent->OverflowLink();
2473 			if (i < count - 1)
2474 				nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
2475 
2476 			if (i == 0 && lastOffset != BPLUSTREE_NULL) {
2477 				// Test right link of the previous node
2478 				const bplustree_node* previous = cached.SetTo(lastOffset, true);
2479 				if (previous == NULL)
2480 					return B_IO_ERROR;
2481 
2482 				if (previous->RightLink() != childOffset) {
2483 					dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2484 						"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2485 						"!\n", fStream->ID(), lastOffset, previous->RightLink(),
2486 						childOffset);
2487 					check.FoundError();
2488 				}
2489 			}
2490 
2491 			status_t status = _ValidateChild(check, cached, level, childOffset,
2492 				lastOffset, nextOffset, key, keyLength);
2493 			if (status != B_OK)
2494 				return status;
2495 		} else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
2496 			dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
2497 				"invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
2498 				fStream->ID(), offset, childOffset, i);
2499 			check.FoundError();
2500 		}
2501 
2502 		lastOffset = childOffset;
2503 	}
2504 
2505 	if (parent->OverflowLink() != BPLUSTREE_NULL) {
2506 		off_t childOffset = parent->OverflowLink();
2507 		status_t status = _ValidateChild(check, cached, level, childOffset,
2508 			lastOffset, 0, NULL, 0);
2509 		if (status != B_OK)
2510 			return status;
2511 
2512 		lastOffset = childOffset;
2513 	}
2514 
2515 	check.SetPreviousOffset(level, lastOffset);
2516 	return B_OK;
2517 }
2518 
2519 
2520 status_t
2521 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
2522 	off_t offset, off_t lastOffset, off_t nextOffset,
2523 	const uint8* key, uint16 keyLength)
2524 {
2525 	const bplustree_node* node;
2526 	status_t status = cached.SetTo(offset, &node, true);
2527 	if (status != B_OK) {
2528 		if (status == B_IO_ERROR)
2529 			return B_IO_ERROR;
2530 
2531 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
2532 			"read: %s\n", fStream->ID(), offset, strerror(status));
2533 		check.FoundError();
2534 		return B_OK;
2535 	}
2536 
2537 	if (node->LeftLink() != lastOffset) {
2538 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2539 			"wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
2540 			"!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
2541 		check.FoundError();
2542 	}
2543 
2544 	if (nextOffset != 0 && node->RightLink() != nextOffset) {
2545 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2546 			"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2547 			"!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
2548 		check.FoundError();
2549 	}
2550 
2551 	return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
2552 }
2553 #endif // !_BOOT_MODE
2554 
2555 
2556 //	#pragma mark -
2557 
2558 
2559 TreeIterator::TreeIterator(BPlusTree* tree)
2560 	:
2561 	fTree(tree),
2562 	fCurrentNodeOffset(BPLUSTREE_NULL)
2563 {
2564 #if !_BOOT_MODE
2565 	tree->_AddIterator(this);
2566 #endif
2567 }
2568 
2569 
2570 TreeIterator::~TreeIterator()
2571 {
2572 #if !_BOOT_MODE
2573 	if (fTree)
2574 		fTree->_RemoveIterator(this);
2575 #endif
2576 }
2577 
2578 
2579 status_t
2580 TreeIterator::Goto(int8 to)
2581 {
2582 	if (fTree == NULL || fTree->fStream == NULL)
2583 		RETURN_ERROR(B_BAD_VALUE);
2584 
2585 #if !_BOOT_MODE
2586 	// lock access to stream
2587 	InodeReadLocker locker(fTree->fStream);
2588 #endif
2589 
2590 	off_t nodeOffset = fTree->fHeader.RootNode();
2591 	CachedNode cached(fTree);
2592 	const bplustree_node* node;
2593 
2594 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2595 		// is the node a leaf node?
2596 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2597 			fCurrentNodeOffset = nodeOffset;
2598 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
2599 			fDuplicateNode = BPLUSTREE_NULL;
2600 
2601 			return B_OK;
2602 		}
2603 
2604 		// get the next node offset depending on the direction (and if there
2605 		// are any keys in that node at all)
2606 		off_t nextOffset;
2607 		if (to == BPLUSTREE_END || node->all_key_count == 0)
2608 			nextOffset = node->OverflowLink();
2609 		else {
2610 			if (node->AllKeyLength() > fTree->fNodeSize
2611 				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
2612 					- 8 * node->NumKeys())
2613 				RETURN_ERROR(B_ERROR);
2614 
2615 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
2616 		}
2617 		if (nextOffset == nodeOffset)
2618 			break;
2619 
2620 		nodeOffset = nextOffset;
2621 	}
2622 	FATAL(("%s fails\n", __FUNCTION__));
2623 
2624 	RETURN_ERROR(B_ERROR);
2625 }
2626 
2627 
2628 /*!	Iterates through the tree in the specified direction.
2629 	When it iterates through duplicates, the "key" is only updated for the
2630 	first entry - if you need to know when this happens, use the "duplicate"
2631 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
2632 	the other duplicates.
2633 	That's not too nice, but saves the 256 bytes that would be needed to
2634 	store the last key - if this will ever become an issue, it will be
2635 	easy to change.
2636 	The other advantage of this is, that the queries can skip all duplicates
2637 	at once when they are not relevant to them.
2638 */
2639 status_t
2640 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
2641 	uint16 maxLength, off_t* value, uint16* duplicate)
2642 {
2643 	if (fTree == NULL)
2644 		return B_INTERRUPTED;
2645 
2646 	bool forward = direction == BPLUSTREE_FORWARD;
2647 
2648 	if (fCurrentNodeOffset == BPLUSTREE_NULL
2649 		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
2650 		RETURN_ERROR(B_ERROR);
2651 
2652 	// if the tree was emptied since the last call
2653 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
2654 		return B_ENTRY_NOT_FOUND;
2655 
2656 #if !_BOOT_MODE
2657 	// lock access to stream
2658 	InodeReadLocker locker(fTree->fStream);
2659 #endif
2660 
2661 	CachedNode cached(fTree);
2662 	const bplustree_node* node;
2663 
2664 	if (fDuplicateNode != BPLUSTREE_NULL) {
2665 		// regardless of traverse direction the duplicates are always presented
2666 		// in the same order; since they are all considered as equal, this
2667 		// shouldn't cause any problems
2668 
2669 		if (!fIsFragment || fDuplicate < fNumDuplicates) {
2670 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2671 				false);
2672 		} else
2673 			node = NULL;
2674 
2675 		if (node != NULL) {
2676 			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
2677 				// If the node is out of duplicates, we go directly to the next
2678 				// one
2679 				fDuplicateNode = node->RightLink();
2680 				if (fDuplicateNode != BPLUSTREE_NULL
2681 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
2682 					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
2683 						false);
2684 					fDuplicate = 0;
2685 				}
2686 			}
2687 			if (fDuplicate < fNumDuplicates) {
2688 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
2689 					fDuplicate++);
2690 				if (duplicate)
2691 					*duplicate = 2;
2692 				return B_OK;
2693 			}
2694 		}
2695 		fDuplicateNode = BPLUSTREE_NULL;
2696 	}
2697 
2698 	off_t savedNodeOffset = fCurrentNodeOffset;
2699 	int32 savedKey = fCurrentKey;
2700 
2701 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
2702 		RETURN_ERROR(B_ERROR);
2703 
2704 	if (duplicate)
2705 		*duplicate = 0;
2706 
2707 	fCurrentKey += direction;
2708 
2709 	// is the current key in the current node?
2710 	while ((forward && fCurrentKey >= node->NumKeys())
2711 			|| (!forward && fCurrentKey < 0)) {
2712 		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2713 
2714 		// are there any more nodes?
2715 		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2716 			node = cached.SetTo(fCurrentNodeOffset);
2717 			if (!node)
2718 				RETURN_ERROR(B_ERROR);
2719 
2720 			// reset current key
2721 			fCurrentKey = forward ? 0 : node->NumKeys() - 1;
2722 		} else {
2723 			// there are no nodes left, so turn back to the last key
2724 			fCurrentNodeOffset = savedNodeOffset;
2725 			fCurrentKey = savedKey;
2726 
2727 			return B_ENTRY_NOT_FOUND;
2728 		}
2729 	}
2730 
2731 	if (node->all_key_count == 0)
2732 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
2733 
2734 	uint16 length;
2735 	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
2736 	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
2737 			> (uint8*)node + fTree->fNodeSize
2738 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2739 #if !_BOOT_MODE
2740 		fTree->fStream->GetVolume()->Panic();
2741 #endif
2742 		RETURN_ERROR(B_BAD_DATA);
2743 	}
2744 
2745 	// include the termination for string types
2746 	bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE;
2747 	if (length + (needsTermination ? 1 : 0) > maxLength) {
2748 		// the buffer is too small, restore the last key and return an error
2749 		fCurrentNodeOffset = savedNodeOffset;
2750 		fCurrentKey = savedKey;
2751 		return B_BUFFER_OVERFLOW;
2752 	}
2753 
2754 	memcpy(key, keyStart, length);
2755 
2756 	if (needsTermination)
2757 		((char*)key)[length] = '\0';
2758 
2759 	*keyLength = length;
2760 
2761 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2762 
2763 	// duplicate fragments?
2764 	uint8 type = bplustree_node::LinkType(offset);
2765 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
2766 		|| type == BPLUSTREE_DUPLICATE_NODE) {
2767 		fDuplicateNode = offset;
2768 
2769 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2770 			false);
2771 		if (node == NULL)
2772 			RETURN_ERROR(B_ERROR);
2773 
2774 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2775 
2776 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2777 		if (fNumDuplicates) {
2778 			offset = node->DuplicateAt(offset, fIsFragment, 0);
2779 			fDuplicate = 1;
2780 			if (duplicate)
2781 				*duplicate = 1;
2782 		} else {
2783 			// Shouldn't happen, but we're dealing here with potentially
2784 			// corrupt disks...
2785 			fDuplicateNode = BPLUSTREE_NULL;
2786 			offset = 0;
2787 		}
2788 	}
2789 	*value = offset;
2790 
2791 	return B_OK;
2792 }
2793 
2794 
2795 /*!	This is more or less a copy of BPlusTree::Find() - but it just
2796 	sets the current position in the iterator, regardless of if the
2797 	key could be found or not.
2798 */
2799 status_t
2800 TreeIterator::Find(const uint8* key, uint16 keyLength)
2801 {
2802 	if (fTree == NULL)
2803 		return B_INTERRUPTED;
2804 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2805 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2806 		|| key == NULL)
2807 		RETURN_ERROR(B_BAD_VALUE);
2808 
2809 #if !_BOOT_MODE
2810 	// lock access to stream
2811 	InodeReadLocker locker(fTree->fStream);
2812 #endif
2813 
2814 	off_t nodeOffset = fTree->fHeader.RootNode();
2815 
2816 	CachedNode cached(fTree);
2817 	const bplustree_node* node;
2818 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2819 		uint16 keyIndex = 0;
2820 		off_t nextOffset;
2821 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2822 			&nextOffset);
2823 
2824 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2825 			fCurrentNodeOffset = nodeOffset;
2826 			fCurrentKey = keyIndex - 1;
2827 			fDuplicateNode = BPLUSTREE_NULL;
2828 
2829 			return status;
2830 		} else if (nextOffset == nodeOffset)
2831 			RETURN_ERROR(B_ERROR);
2832 
2833 		nodeOffset = nextOffset;
2834 	}
2835 	RETURN_ERROR(B_ERROR);
2836 }
2837 
2838 
2839 void
2840 TreeIterator::SkipDuplicates()
2841 {
2842 	fDuplicateNode = BPLUSTREE_NULL;
2843 }
2844 
2845 
2846 void
2847 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
2848 	uint16 splitAt, int8 change)
2849 {
2850 	if (offset != fCurrentNodeOffset)
2851 		return;
2852 
2853 	if (nextOffset != BPLUSTREE_NULL) {
2854 		fCurrentNodeOffset = nextOffset;
2855 		if (splitAt <= fCurrentKey) {
2856 			fCurrentKey -= splitAt;
2857 			keyIndex -= splitAt;
2858 		}
2859 	}
2860 
2861 	// Adjust fCurrentKey to point to the same key as before.
2862 	// Note, that if a key is inserted at the current position
2863 	// it won't be included in this tree transition.
2864 	if (keyIndex <= fCurrentKey)
2865 		fCurrentKey += change;
2866 
2867 	// TODO: duplicate handling!
2868 }
2869 
2870 
2871 void
2872 TreeIterator::Stop()
2873 {
2874 	fTree = NULL;
2875 }
2876 
2877 
2878 #ifdef DEBUG
2879 void
2880 TreeIterator::Dump()
2881 {
2882 	__out("TreeIterator at %p:\n", this);
2883 	__out("\tfTree = %p\n", fTree);
2884 	__out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset);
2885 	__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
2886 	__out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n",
2887 		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
2888 		fDuplicateNode);
2889 	__out("\tfDuplicate = %u\n", fDuplicate);
2890 	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
2891 	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
2892 }
2893 #endif
2894 
2895 
2896 // #pragma mark -
2897 
2898 
2899 bool
2900 bplustree_header::IsValid() const
2901 {
2902 	return Magic() == BPLUSTREE_MAGIC
2903 		&& (RootNode() % NodeSize()) == 0
2904 		&& IsValidLink(RootNode())
2905 		&& IsValidLink(FreeNode());
2906 }
2907 
2908 
2909 // #pragma mark -
2910 
2911 
2912 void
2913 bplustree_node::Initialize()
2914 {
2915 	left_link = right_link = overflow_link
2916 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2917 	all_key_count = 0;
2918 	all_key_length = 0;
2919 }
2920 
2921 
2922 uint8*
2923 bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2924 {
2925 	if (index < 0 || index > NumKeys())
2926 		return NULL;
2927 
2928 	uint8* keyStart = Keys();
2929 	uint16* keyLengths = KeyLengths();
2930 
2931 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2932 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2933 	if (index > 0)
2934 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2935 
2936 	return keyStart;
2937 }
2938 
2939 
2940 uint8
2941 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2942 {
2943 	// the duplicate fragment handling is currently hard-coded to a node size
2944 	// of 1024 bytes - with future versions of BFS, this may be a problem
2945 
2946 	if (isFragment) {
2947 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2948 
2949 		return ((off_t*)this)[fragment];
2950 	}
2951 	return OverflowLink();
2952 }
2953 
2954 
2955 off_t
2956 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2957 {
2958 	uint32 start;
2959 	if (isFragment)
2960 		start = 8 * ((uint64)offset & 0x3ff);
2961 	else
2962 		start = 2;
2963 
2964 	return ((off_t*)this)[start + 1 + index];
2965 }
2966 
2967 
2968 /*!	Although the name suggests it, this function doesn't return the real
2969 	used fragment count; at least, it can only count to two: it returns
2970 	0, if there is no fragment used, 1 if there is only one fragment
2971 	used, and 2 if there are at least 2 fragments used.
2972 */
2973 uint32
2974 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2975 {
2976 	uint32 used = 0;
2977 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2978 		duplicate_array* array = FragmentAt(i);
2979 		if (array->Count() > 0 && ++used > 1)
2980 			return used;
2981 	}
2982 	return used;
2983 }
2984 
2985 
2986 status_t
2987 bplustree_node::CheckIntegrity(uint32 nodeSize) const
2988 {
2989 	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2990 		DEBUGGER(("invalid node: key/length count"));
2991 
2992 	for (int32 i = 0; i < NumKeys(); i++) {
2993 		uint16 length;
2994 		uint8* key = KeyAt(i, &length);
2995 		if (key + length + sizeof(off_t) + sizeof(uint16)
2996 				> (uint8*)this + nodeSize
2997 			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2998 			dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
2999 			return B_BAD_DATA;
3000 		}
3001 		if (Values()[i] == -1) {
3002 			dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values "
3003 				"corrupted\n", this, (int)i, Values()[i]);
3004 			return B_BAD_DATA;
3005 		}
3006 	}
3007 	return B_OK;
3008 }
3009 
3010 
3011 // #pragma mark -
3012 
3013 
3014 #if !_BOOT_MODE
3015 BitmapArray::BitmapArray(size_t numBits)
3016 {
3017 	fSize = (numBits + 7) / 8;
3018 	fBitmap = (uint8*)calloc(fSize, 1);
3019 	fCountSet = 0;
3020 }
3021 
3022 
3023 BitmapArray::~BitmapArray()
3024 {
3025 	free(fBitmap);
3026 }
3027 
3028 
3029 status_t
3030 BitmapArray::InitCheck() const
3031 {
3032 	return fBitmap != NULL ? B_OK : B_NO_MEMORY;
3033 }
3034 
3035 
3036 bool
3037 BitmapArray::IsSet(size_t index) const
3038 {
3039 	uint32 byteIndex = index / 8;
3040 	if (byteIndex >= fSize)
3041 		return false;
3042 
3043 	return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
3044 }
3045 
3046 
3047 void
3048 BitmapArray::Set(size_t index, bool set)
3049 {
3050 	uint32 byteIndex = index / 8;
3051 	if (byteIndex >= fSize)
3052 		return;
3053 
3054 	if (set) {
3055 		fBitmap[byteIndex] |= 1UL << (index & 0x7);
3056 		fCountSet++;
3057 	} else {
3058 		fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
3059 		fCountSet--;
3060 	}
3061 }
3062 #endif // !_BOOT_MODE
3063 
3064 
3065 // #pragma mark -
3066 
3067 
3068 bool
3069 duplicate_array::_FindInternal(off_t value, int32& index) const
3070 {
3071 	int32 min = 0, max = Count() - 1;
3072 	off_t cmp;
3073 	while (min <= max) {
3074 		index = (min + max) / 2;
3075 
3076 		cmp = ValueAt(index) - value;
3077 		if (cmp < 0)
3078 			min = index + 1;
3079 		else if (cmp > 0)
3080 			max = index - 1;
3081 		else
3082 			return true;
3083 	}
3084 	return false;
3085 }
3086 
3087 
3088 void
3089 duplicate_array::Insert(off_t value)
3090 {
3091 	// if there are more than 8 values in this array, use a
3092 	// binary search, if not, just iterate linearly to find
3093 	// the insertion point
3094 	int32 size = Count();
3095 	int32 i = 0;
3096 	if (size > 8 ) {
3097 		if (!_FindInternal(value, i) && ValueAt(i) <= value)
3098 			i++;
3099 	} else {
3100 		for (i = 0; i < size; i++) {
3101 			if (ValueAt(i) > value)
3102 				break;
3103 		}
3104 	}
3105 
3106 	memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
3107 	values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
3108 	count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
3109 }
3110 
3111 
3112 bool
3113 duplicate_array::Remove(off_t value)
3114 {
3115 	int32 index = Find(value);
3116 	if (index == -1)
3117 		return false;
3118 
3119 	int32 newSize = Count() - 1;
3120 	memmove(&values[index], &values[index + 1],
3121 		(newSize - index) * sizeof(off_t));
3122 	count = HOST_ENDIAN_TO_BFS_INT64(newSize);
3123 
3124 	return true;
3125 }
3126 
3127 
3128 #if _BOOT_MODE
3129 } // namespace BFS
3130 #endif
3131