xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 2897df967633aab846ff4917b53e2af7d1e54eeb)
1 /*
2  * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 
10 //! B+Tree implementation
11 
12 
13 #include "BPlusTree.h"
14 
15 #include <file_systems/QueryParserUtils.h>
16 
17 #include "Debug.h"
18 #include "Utility.h"
19 
20 #if !_BOOT_MODE
21 #	include "Inode.h"
22 #else
23 #	include "Stream.h"
24 
25 // BFS::Stream from the bootloader has the same API as Inode.
26 #	define Inode BFS::Stream
27 
28 #	define strerror(x)		"error message unavailable"
29 
30 namespace BFS {
31 #endif
32 
33 
34 /*!	Simple array used for the duplicate handling in the B+Tree. This is an
35 	on disk structure.
36 */
37 struct duplicate_array {
38 	off_t	count;
39 	off_t	values[0];
40 
41 	inline bool IsEmpty() const
42 	{
43 		return count == 0;
44 	}
45 
46 	inline int32 Count() const
47 	{
48 		return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
49 	}
50 
51 	inline off_t ValueAt(uint32 index) const
52 	{
53 		return BFS_ENDIAN_TO_HOST_INT64(values[index]);
54 	}
55 
56 	inline void SetValueAt(uint32 index, off_t value)
57 	{
58 		values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
59 	}
60 
61 	inline int32 Find(off_t value) const
62 	{
63 		int32 i;
64 		return _FindInternal(value, i) ? i : -1;
65 	}
66 
67 	void Insert(off_t value);
68 	bool Remove(off_t value);
69 
70 private:
71 	bool _FindInternal(off_t value, int32& index) const;
72 } _PACKED;
73 
74 
75 #ifdef DEBUG
76 class NodeChecker {
77 public:
78 	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
79 		:
80 		fNode(node),
81 		fSize(nodeSize),
82 		fText(text)
83 	{
84 		Check("integrity check failed on construction.");
85 	}
86 
87 	~NodeChecker()
88 	{
89 		Check("integrity check failed on destruction.");
90 	}
91 
92 	void
93 	Check(const char* message)
94 	{
95 		if (fNode->CheckIntegrity(fSize) != B_OK) {
96 			dprintf("%s: %s\n", fText, message);
97 			DEBUGGER(("NodeChecker integrity check failed!"));
98 		}
99 	}
100 
101 private:
102 	const bplustree_node*	fNode;
103 	int32					fSize;
104 	const char*				fText;
105 };
106 #endif // DEBUG
107 
108 
109 #if !_BOOT_MODE
110 class BitmapArray {
111 public:
112 								BitmapArray(size_t numBits);
113 								~BitmapArray();
114 
115 			status_t			InitCheck() const;
116 
117 			bool				IsSet(size_t index) const;
118 			void				Set(size_t index, bool set);
119 
120 			size_t				CountSet() const { return fCountSet; }
121 
122 private:
123 			uint8*				fBitmap;
124 			size_t				fSize;
125 			size_t				fCountSet;
126 };
127 
128 
129 struct TreeCheck {
130 	TreeCheck(BPlusTree* tree)
131 		:
132 		fLevelCount(0),
133 		fFreeCount(0),
134 		fNodeSize(tree->NodeSize()),
135 		fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
136 		fFoundErrors(0),
137 		fVisited(tree->Stream()->Size() / tree->NodeSize()),
138 		fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
139 	{
140 		fPreviousOffsets = (off_t*)malloc(
141 			sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
142 		if (fPreviousOffsets != NULL) {
143 			for (size_t i = 0; i < fMaxLevels; i++)
144 				fPreviousOffsets[i] = BPLUSTREE_NULL;
145 		}
146 	}
147 
148 	~TreeCheck()
149 	{
150 		free(fPreviousOffsets);
151 	}
152 
153 	status_t InitCheck() const
154 	{
155 		if (fPreviousOffsets == NULL)
156 			return B_NO_MEMORY;
157 
158 		status_t status = fVisited.InitCheck();
159 		if (status != B_OK)
160 			return status;
161 
162 		return fVisitedFragment.InitCheck();
163 	}
164 
165 	bool Visited(off_t offset) const
166 	{
167 		return fVisited.IsSet(offset / fNodeSize);
168 	}
169 
170 	void SetVisited(off_t offset)
171 	{
172 		fVisited.Set(offset / fNodeSize, true);
173 	}
174 
175 	size_t VisitedCount() const
176 	{
177 		return fVisited.CountSet();
178 	}
179 
180 	bool VisitedFragment(off_t offset) const
181 	{
182 		return fVisitedFragment.IsSet(offset / fNodeSize);
183 	}
184 
185 	void SetVisitedFragment(off_t offset)
186 	{
187 		fVisitedFragment.Set(offset / fNodeSize, true);
188 	}
189 
190 	uint32 MaxLevels() const
191 	{
192 		return fLevelCount;
193 	}
194 
195 	void SetLevel(uint32 level)
196 	{
197 		if (fLevelCount < level)
198 			fLevelCount = level;
199 	}
200 
201 	off_t PreviousOffset(uint32 level)
202 	{
203 		return fPreviousOffsets[level];
204 	}
205 
206 	void SetPreviousOffset(uint32 level, off_t offset)
207 	{
208 		fPreviousOffsets[level] = offset;
209 	}
210 
211 	void FoundError()
212 	{
213 		fFoundErrors++;
214 	}
215 
216 	bool ErrorsFound()
217 	{
218 		return fFoundErrors != 0;
219 	}
220 
221 private:
222 			uint32				fLevelCount;
223 			uint32				fFreeCount;
224 			uint32				fNodeSize;
225 			uint32				fMaxLevels;
226 			uint32				fFoundErrors;
227 			BitmapArray			fVisited;
228 			BitmapArray			fVisitedFragment;
229 			off_t*				fPreviousOffsets;
230 };
231 
232 
233 // #pragma mark -
234 
235 
236 // Node Caching for the BPlusTree class
237 //
238 // With write support, there is the need for a function that allocates new
239 // nodes by either returning empty nodes, or by growing the file's data stream
240 //
241 // !! The CachedNode class assumes that you have properly locked the stream
242 // !! before asking for nodes.
243 //
244 // Note: This code will fail if the block size is smaller than the node size!
245 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
246 // is hard-coded to 1024 bytes, that's not an issue now.
247 
248 void
249 CachedNode::UnsetUnchanged(Transaction& transaction)
250 {
251 	if (fTree == NULL || fTree->fStream == NULL)
252 		return;
253 
254 	if (fNode != NULL) {
255 		void* cache = fTree->fStream->GetVolume()->BlockCache();
256 
257 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
258 		block_cache_put(cache, fBlockNumber);
259 		fNode = NULL;
260 	}
261 }
262 #endif // !_BOOT_MODE
263 
264 
265 void
266 CachedNode::Unset()
267 {
268 	if (fTree == NULL || fTree->fStream == NULL)
269 		return;
270 
271 	if (fNode != NULL) {
272 #if !_BOOT_MODE
273 		if (fWritable && fOffset == 0) {
274 			// The B+tree header has been updated - we need to update the
275 			// BPlusTrees copy of it, as well.
276 			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
277 		}
278 
279 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
280 			fBlockNumber);
281 #endif // !_BOOT_MODE
282 
283 		fNode = NULL;
284 	}
285 }
286 
287 
288 const bplustree_node*
289 CachedNode::SetTo(off_t offset, bool check)
290 {
291 	const bplustree_node* node;
292 	if (SetTo(offset, &node, check) == B_OK)
293 		return node;
294 
295 	return NULL;
296 }
297 
298 
299 status_t
300 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
301 {
302 	if (fTree == NULL || fTree->fStream == NULL)
303 		RETURN_ERROR(B_BAD_VALUE);
304 
305 	Unset();
306 
307 	// You can only ask for nodes at valid positions - you can't
308 	// even access the b+tree header with this method (use SetToHeader()
309 	// instead)
310 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
311 		|| offset <= 0
312 		|| (offset % fTree->fNodeSize) != 0) {
313 		RETURN_ERROR(B_BAD_VALUE);
314 	}
315 
316 	if (InternalSetTo(NULL, offset) != NULL && check) {
317 		// sanity checks (links, all_key_count)
318 		if (!fTree->fHeader.CheckNode(fNode)) {
319 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
320 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
321 				fBlockNumber, fTree->fStream->ID()));
322 			return B_BAD_DATA;
323 		}
324 	}
325 
326 	*_node = fNode;
327 	return B_OK;
328 }
329 
330 
331 #if !_BOOT_MODE
332 bplustree_node*
333 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
334 {
335 	if (fTree == NULL || fTree->fStream == NULL) {
336 		REPORT_ERROR(B_BAD_VALUE);
337 		return NULL;
338 	}
339 
340 	Unset();
341 
342 	// You can only ask for nodes at valid positions - you can't
343 	// even access the b+tree header with this method (use SetToHeader()
344 	// instead)
345 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
346 		|| offset <= 0
347 		|| (offset % fTree->fNodeSize) != 0)
348 		return NULL;
349 
350 	if (InternalSetTo(&transaction, offset) != NULL && check) {
351 		// sanity checks (links, all_key_count)
352 		if (!fTree->fHeader.CheckNode(fNode)) {
353 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
354 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
355 				fBlockNumber, fTree->fStream->ID()));
356 			return NULL;
357 		}
358 	}
359 	return fNode;
360 }
361 
362 
363 bplustree_node*
364 CachedNode::MakeWritable(Transaction& transaction)
365 {
366 	if (fNode == NULL)
367 		return NULL;
368 
369 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
370 			fBlockNumber, transaction.ID()) == B_OK) {
371 		return fNode;
372 	}
373 
374 	return NULL;
375 }
376 #endif // !_BOOT_MODE
377 
378 
379 const bplustree_header*
380 CachedNode::SetToHeader()
381 {
382 	if (fTree == NULL || fTree->fStream == NULL) {
383 		REPORT_ERROR(B_BAD_VALUE);
384 		return NULL;
385 	}
386 
387 	Unset();
388 
389 	InternalSetTo(NULL, 0LL);
390 	return (bplustree_header*)fNode;
391 }
392 
393 
394 #if !_BOOT_MODE
395 bplustree_header*
396 CachedNode::SetToWritableHeader(Transaction& transaction)
397 {
398 	if (fTree == NULL || fTree->fStream == NULL) {
399 		REPORT_ERROR(B_BAD_VALUE);
400 		return NULL;
401 	}
402 
403 	Unset();
404 
405 	InternalSetTo(&transaction, 0LL);
406 
407 	if (fNode != NULL && !fTree->fInTransaction) {
408 		transaction.AddListener(fTree);
409 		fTree->fInTransaction = true;
410 
411 		if (!transaction.GetVolume()->IsInitializing()) {
412 			acquire_vnode(transaction.GetVolume()->FSVolume(),
413 				fTree->fStream->ID());
414 		}
415 	}
416 
417 	return (bplustree_header*)fNode;
418 }
419 #endif // !_BOOT_MODE
420 
421 
422 bplustree_node*
423 CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
424 {
425 	fNode = NULL;
426 	fOffset = offset;
427 
428 	off_t fileOffset;
429 	block_run run;
430 	if (offset < fTree->fStream->Size()
431 		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
432 
433 #if !_BOOT_MODE
434 		Volume* volume = fTree->fStream->GetVolume();
435 #else
436 		Volume* volume = &fTree->fStream->GetVolume();
437 #endif
438 
439 		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
440 		fBlockNumber = volume->ToBlock(run) + blockOffset;
441 		uint8* block = NULL;
442 
443 #if !_BOOT_MODE
444 		if (transaction != NULL) {
445 			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
446 				fBlockNumber, transaction->ID());
447 			fWritable = true;
448 		} else {
449 			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
450 			fWritable = false;
451 		}
452 #else // !_BOOT_MODE
453 		if (fBlock == NULL) {
454 			fBlock = (uint8*)malloc(volume->BlockSize());
455 			if (fBlock == NULL)
456 				return NULL;
457 		}
458 
459 		if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(),
460 				fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
461 			block = fBlock;
462 		}
463 
464 		fWritable = false;
465 #endif // _BOOT_MODE
466 
467 		if (block != NULL) {
468 			// The node is somewhere in that block...
469 			// (confusing offset calculation)
470 			fNode = (bplustree_node*)(block + offset
471 				- (fileOffset + (blockOffset << volume->BlockShift())));
472 		} else
473 			REPORT_ERROR(B_IO_ERROR);
474 	}
475 	return fNode;
476 }
477 
478 
479 #if !_BOOT_MODE
480 status_t
481 CachedNode::Free(Transaction& transaction, off_t offset)
482 {
483 	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
484 		RETURN_ERROR(B_BAD_VALUE);
485 
486 	// TODO: scan the free nodes list and remove all nodes at the end
487 	// of the tree - perhaps that shouldn't be done everytime that
488 	// function is called, perhaps it should be done when the directory
489 	// inode is closed or based on some calculation or whatever...
490 
491 	CachedNode cached(fTree);
492 	bplustree_header* header = cached.SetToWritableHeader(transaction);
493 	if (header == NULL)
494 		return B_IO_ERROR;
495 
496 #if 0
497 	// TODO: temporarily disabled because CheckNode() doesn't like this...
498 	// 		Also, it's such an edge case that it's almost useless, anyway.
499 	// if the node is the last one in the tree, we shrink
500 	// the tree and file size by one node
501 	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
502 	if (offset == lastOffset) {
503 		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
504 		if (status != B_OK)
505 			return status;
506 
507 		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
508 		return B_OK;
509 	}
510 #endif
511 
512 	// add the node to the free nodes list
513 	fNode->left_link = header->free_node_pointer;
514 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
515 
516 	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
517 	return B_OK;
518 }
519 
520 
521 status_t
522 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
523 	off_t* _offset)
524 {
525 	if (fTree == NULL || fTree->fStream == NULL)
526 		RETURN_ERROR(B_BAD_VALUE);
527 
528 	Unset();
529 
530 	bplustree_header* header;
531 	status_t status;
532 
533 	// if there are any free nodes, recycle them
534 	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
535 		CachedNode cached(fTree);
536 		header = cached.SetToWritableHeader(transaction);
537 		if (header == NULL)
538 			return B_IO_ERROR;
539 
540 		*_offset = header->FreeNode();
541 		*_node = fNode;
542 
543 		// set new free node pointer
544 		header->free_node_pointer = fNode->left_link;
545 		fNode->Initialize();
546 		return B_OK;
547 	}
548 
549 	// allocate space for a new node
550 	Inode* stream = fTree->fStream;
551 	if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
552 		return status;
553 
554 	CachedNode cached(fTree);
555 	header = cached.SetToWritableHeader(transaction);
556 	if (header == NULL)
557 		return B_IO_ERROR;
558 
559 	// the maximum_size has to be changed before the call to SetTo() - or
560 	// else it will fail because the requested node is out of bounds
561 	off_t offset = header->MaximumSize();
562 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
563 		+ fTree->fNodeSize);
564 
565 	cached.Unset();
566 		// SetToWritable() below needs the new values in the tree's header
567 
568 	if (SetToWritable(transaction, offset, false) == NULL)
569 		RETURN_ERROR(B_ERROR);
570 
571 	fNode->Initialize();
572 
573 	*_offset = offset;
574 	*_node = fNode;
575 	return B_OK;
576 }
577 
578 
579 //	#pragma mark -
580 
581 
582 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
583 	:
584 	fStream(NULL),
585 	fInTransaction(false)
586 {
587 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
588 	SetTo(transaction, stream);
589 }
590 #endif // !_BOOT_MODE
591 
592 
593 BPlusTree::BPlusTree(Inode* stream)
594 	:
595 	fStream(NULL),
596 	fInTransaction(false)
597 {
598 #if !_BOOT_MODE
599 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
600 #endif
601 
602 	SetTo(stream);
603 }
604 
605 
606 BPlusTree::BPlusTree()
607 	:
608 	fStream(NULL),
609 	fNodeSize(BPLUSTREE_NODE_SIZE),
610 	fAllowDuplicates(true),
611 	fInTransaction(false),
612 	fStatus(B_NO_INIT)
613 {
614 #if !_BOOT_MODE
615 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
616 #endif
617 }
618 
619 
620 BPlusTree::~BPlusTree()
621 {
622 #if !_BOOT_MODE
623 	// if there are any TreeIterators left, we need to stop them
624 	// (can happen when the tree's inode gets deleted while
625 	// traversing the tree - a TreeIterator doesn't lock the inode)
626 	mutex_lock(&fIteratorLock);
627 
628 	SinglyLinkedList<TreeIterator>::Iterator iterator
629 		= fIterators.GetIterator();
630 	while (iterator.HasNext())
631 		iterator.Next()->Stop();
632 
633 	mutex_destroy(&fIteratorLock);
634 
635 	ASSERT(!fInTransaction);
636 #endif // !_BOOT_MODE
637 }
638 
639 
640 #if !_BOOT_MODE
641 /*! Create a new B+Tree on the specified stream */
642 status_t
643 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
644 {
645 	// initializes in-memory B+Tree
646 
647 	fStream = stream;
648 
649 	CachedNode cached(this);
650 	bplustree_header* header = cached.SetToWritableHeader(transaction);
651 	if (header == NULL) {
652 		// allocate space for new header + node!
653 		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
654 		if (fStatus != B_OK)
655 			RETURN_ERROR(fStatus);
656 
657 		header = cached.SetToWritableHeader(transaction);
658 		if (header == NULL)
659 			RETURN_ERROR(fStatus = B_ERROR);
660 	}
661 
662 	fAllowDuplicates = stream->IsIndex()
663 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
664 
665 	fNodeSize = nodeSize;
666 
667 	// initialize b+tree header
668  	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
669  	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
670  	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
671  	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
672  	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
673  	header->free_node_pointer
674  		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
675  	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
676 
677 	cached.Unset();
678 
679 	// initialize b+tree root node
680 	cached.SetToWritable(transaction, fHeader.RootNode(), false);
681 	if (cached.Node() == NULL)
682 		RETURN_ERROR(B_IO_ERROR);
683 
684 	cached.Node()->Initialize();
685 
686 	return fStatus = B_OK;
687 }
688 #endif // !_BOOT_MODE
689 
690 
691 status_t
692 BPlusTree::SetTo(Inode* stream)
693 {
694 	if (stream == NULL)
695 		RETURN_ERROR(fStatus = B_BAD_VALUE);
696 
697 	fStream = stream;
698 
699 	// get on-disk B+Tree header
700 
701 	CachedNode cached(this);
702 	const bplustree_header* header = cached.SetToHeader();
703 	if (header != NULL)
704 		memcpy(&fHeader, header, sizeof(bplustree_header));
705 	else
706 		RETURN_ERROR(fStatus = B_IO_ERROR);
707 
708 	// is header valid?
709 
710 	if (fHeader.MaximumSize() != stream->Size()) {
711 		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
712 			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
713 		// we can't change the header since we don't have a transaction
714 		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
715 	}
716 	if (!fHeader.IsValid()) {
717 #ifdef DEBUG
718 		dump_bplustree_header(&fHeader);
719 		dump_block((const char*)&fHeader, 128);
720 #endif
721 		RETURN_ERROR(fStatus = B_BAD_DATA);
722 	}
723 
724 	fNodeSize = fHeader.NodeSize();
725 
726 	// validity check
727 	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
728 		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
729 		S_DOUBLE_INDEX};
730 	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
731 		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
732 		| S_FLOAT_INDEX | S_DOUBLE_INDEX);
733 
734 	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
735 		|| ((stream->Mode() & S_INDEX_DIR) != 0
736 			&& kToMode[fHeader.DataType()] != mode)
737 		|| !stream->IsContainer()) {
738 		D(	dump_bplustree_header(&fHeader);
739 			dump_inode(&stream->Node());
740 		);
741 		RETURN_ERROR(fStatus = B_BAD_TYPE);
742 	}
743 
744 	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
745 	// in the original BFS code - we will honour it nevertheless
746 	fAllowDuplicates = stream->IsIndex()
747 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
748 
749 	cached.SetTo(fHeader.RootNode());
750 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
751 }
752 
753 
754 status_t
755 BPlusTree::InitCheck()
756 {
757 	return fStatus;
758 }
759 
760 
761 #if !_BOOT_MODE
762 status_t
763 BPlusTree::Validate(bool repair, bool& _errorsFound)
764 {
765 	TreeCheck check(this);
766 	if (check.InitCheck() != B_OK)
767 		return B_NO_MEMORY;
768 
769 	check.SetVisited(0);
770 
771 	// Walk the free nodes
772 
773 	CachedNode cached(this);
774 	off_t freeOffset = fHeader.FreeNode();
775 	while (freeOffset > 0) {
776 		const bplustree_node* node;
777 		status_t status = cached.SetTo(freeOffset, &node, false);
778 		if (status != B_OK) {
779 			if (status == B_IO_ERROR)
780 				return B_IO_ERROR;
781 
782 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
783 				"not be read: %s\n", fStream->ID(), freeOffset,
784 				strerror(status));
785 			check.FoundError();
786 			break;
787 		}
788 
789 		if (check.Visited(freeOffset)) {
790 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
791 				" circular!\n", fStream->ID(), freeOffset);
792 			// TODO: if 'repair' is true, we could collect all unvisited nodes
793 			// at the end, and put the into the free list
794 			check.FoundError();
795 			break;
796 		}
797 
798 		check.SetVisited(freeOffset);
799 
800 		if (node->OverflowLink() != BPLUSTREE_FREE) {
801 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
802 				" misses free mark!\n", fStream->ID(), freeOffset);
803 		}
804 		freeOffset = node->LeftLink();
805 	}
806 
807 	// Iterate over the complete tree recursively
808 
809 	const bplustree_node* root;
810 	status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
811 	if (status != B_OK)
812 		return status;
813 
814 	status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
815 
816 	if (check.ErrorsFound())
817 		_errorsFound = true;
818 
819 	if (status != B_OK)
820 		return status;
821 
822 	if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
823 		dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
824 			"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
825 			fHeader.MaxNumberOfLevels());
826 	}
827 
828 	if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
829 		dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
830 			" nodes.\n", fStream->ID(), check.VisitedCount(),
831 			fHeader.MaximumSize() / fNodeSize);
832 	}
833 
834 	return B_OK;
835 }
836 
837 
838 status_t
839 BPlusTree::MakeEmpty()
840 {
841 	// Put all nodes into the free list in order
842 	Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
843 
844 	// Reset the header, and root node
845 	CachedNode cached(this);
846 	bplustree_header* header = cached.SetToWritableHeader(transaction);
847 	if (header == NULL)
848 		return B_IO_ERROR;
849 
850 	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
851 	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
852 	if (fStream->Size() > (off_t)NodeSize() * 2)
853 		header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
854 	else {
855 		header->free_node_pointer
856 			= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
857 	}
858 
859 	bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
860 	if (node == NULL)
861 		return B_IO_ERROR;
862 
863 	node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
864 	node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
865 	node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
866 	node->all_key_count = 0;
867 	node->all_key_length = 0;
868 
869 	for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
870 			offset += NodeSize()) {
871 		bplustree_node* node = cached.SetToWritable(transaction, offset, false);
872 		if (node == NULL) {
873 			dprintf("--> could not open %" B_PRIdOFF "\n", offset);
874 			return B_IO_ERROR;
875 		}
876 		if (offset < fStream->Size() - (off_t)NodeSize())
877 			node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
878 		else
879 			node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
880 
881 		node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
882 
883 		// It's not important to write it out in a single transaction; just
884 		// make sure it doesn't get too large
885 		if (offset % (1024 * 1024) == 0) {
886 			transaction.Done();
887 			transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
888 		}
889 	}
890 
891 	return transaction.Done();
892 }
893 
894 
895 int32
896 BPlusTree::TypeCodeToKeyType(type_code code)
897 {
898 	switch (code) {
899 		case B_STRING_TYPE:
900 			return BPLUSTREE_STRING_TYPE;
901 		case B_SSIZE_T_TYPE:
902 		case B_INT32_TYPE:
903 			return BPLUSTREE_INT32_TYPE;
904 		case B_SIZE_T_TYPE:
905 		case B_UINT32_TYPE:
906 			return BPLUSTREE_UINT32_TYPE;
907 		case B_OFF_T_TYPE:
908 		case B_INT64_TYPE:
909 			return BPLUSTREE_INT64_TYPE;
910 		case B_UINT64_TYPE:
911 			return BPLUSTREE_UINT64_TYPE;
912 		case B_FLOAT_TYPE:
913 			return BPLUSTREE_FLOAT_TYPE;
914 		case B_DOUBLE_TYPE:
915 			return BPLUSTREE_DOUBLE_TYPE;
916 	}
917 	return -1;
918 }
919 
920 
921 int32
922 BPlusTree::ModeToKeyType(mode_t mode)
923 {
924 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
925 			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
926 		case S_INT_INDEX:
927 			return BPLUSTREE_INT32_TYPE;
928 		case S_UINT_INDEX:
929 			return BPLUSTREE_UINT32_TYPE;
930 		case S_LONG_LONG_INDEX:
931 			return BPLUSTREE_INT64_TYPE;
932 		case S_ULONG_LONG_INDEX:
933 			return BPLUSTREE_UINT64_TYPE;
934 		case S_FLOAT_INDEX:
935 			return BPLUSTREE_FLOAT_TYPE;
936 		case S_DOUBLE_INDEX:
937 			return BPLUSTREE_DOUBLE_TYPE;
938 		case S_STR_INDEX:
939 		default:
940 			// default is for standard directories
941 			return BPLUSTREE_STRING_TYPE;
942 	}
943 }
944 #endif // !_BOOT_MODE
945 
946 
947 //	#pragma mark - TransactionListener implementation
948 
949 
950 #if !_BOOT_MODE
951 void
952 BPlusTree::TransactionDone(bool success)
953 {
954 	if (!success) {
955 		// update header from disk
956 		CachedNode cached(this);
957 		const bplustree_header* header = cached.SetToHeader();
958 		if (header != NULL)
959 			memcpy(&fHeader, header, sizeof(bplustree_header));
960 	}
961 }
962 
963 
964 void
965 BPlusTree::RemovedFromTransaction()
966 {
967 	fInTransaction = false;
968 
969 	if (!fStream->GetVolume()->IsInitializing())
970 		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
971 }
972 
973 
974 //	#pragma mark -
975 
976 
977 void
978 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
979 	uint16 splitAt, int8 change)
980 {
981 	// Although every iterator which is affected by this update currently
982 	// waits on a semaphore, other iterators could be added/removed at
983 	// any time, so we need to protect this loop
984 	MutexLocker _(fIteratorLock);
985 
986 	SinglyLinkedList<TreeIterator>::Iterator iterator
987 		= fIterators.GetIterator();
988 	while (iterator.HasNext())
989 		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
990 }
991 
992 
993 void
994 BPlusTree::_AddIterator(TreeIterator* iterator)
995 {
996 	MutexLocker _(fIteratorLock);
997 	fIterators.Add(iterator);
998 }
999 
1000 
1001 void
1002 BPlusTree::_RemoveIterator(TreeIterator* iterator)
1003 {
1004 	MutexLocker _(fIteratorLock);
1005 	fIterators.Remove(iterator);
1006 }
1007 #endif // !_BOOT_MODE
1008 
1009 
1010 int32
1011 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
1012 	int keyLength2)
1013 {
1014 	type_code type = 0;
1015 	switch (fHeader.DataType()) {
1016 	    case BPLUSTREE_STRING_TYPE:
1017 	    	type = B_STRING_TYPE;
1018 	    	break;
1019 		case BPLUSTREE_INT32_TYPE:
1020 	    	type = B_INT32_TYPE;
1021 	    	break;
1022 		case BPLUSTREE_UINT32_TYPE:
1023 	    	type = B_UINT32_TYPE;
1024 	    	break;
1025 		case BPLUSTREE_INT64_TYPE:
1026 	    	type = B_INT64_TYPE;
1027 	    	break;
1028 		case BPLUSTREE_UINT64_TYPE:
1029 	    	type = B_UINT64_TYPE;
1030 	    	break;
1031 		case BPLUSTREE_FLOAT_TYPE:
1032 	    	type = B_FLOAT_TYPE;
1033 	    	break;
1034 		case BPLUSTREE_DOUBLE_TYPE:
1035 	    	type = B_DOUBLE_TYPE;
1036 	    	break;
1037 	}
1038 	return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2);
1039 }
1040 
1041 
1042 status_t
1043 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
1044 	uint16 keyLength, uint16* _index, off_t* _next)
1045 {
1046 #ifdef DEBUG
1047 	NodeChecker checker(node, fNodeSize, "find");
1048 #endif
1049 
1050 	if (node->all_key_count == 0) {
1051 		if (_index)
1052 			*_index = 0;
1053 		if (_next)
1054 			*_next = node->OverflowLink();
1055 		return B_ENTRY_NOT_FOUND;
1056 	}
1057 
1058 	off_t* values = node->Values();
1059 	int16 saveIndex = -1;
1060 
1061 	// binary search in the key array
1062 	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
1063 		uint16 i = (first + last) >> 1;
1064 
1065 		uint16 searchLength = 0;
1066 		uint8* searchKey = node->KeyAt(i, &searchLength);
1067 		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
1068 				> (uint8*)node + fNodeSize
1069 			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
1070 #if !_BOOT_MODE
1071 			fStream->GetVolume()->Panic();
1072 #endif
1073 			RETURN_ERROR(B_BAD_DATA);
1074 		}
1075 
1076 		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
1077 		if (cmp < 0) {
1078 			last = i - 1;
1079 			saveIndex = i;
1080 		} else if (cmp > 0) {
1081 			saveIndex = first = i + 1;
1082 		} else {
1083 			if (_index)
1084 				*_index = i;
1085 			if (_next)
1086 				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1087 			return B_OK;
1088 		}
1089 	}
1090 
1091 	if (_index)
1092 		*_index = saveIndex;
1093 	if (_next) {
1094 		if (saveIndex == node->NumKeys())
1095 			*_next = node->OverflowLink();
1096 		else
1097 			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
1098 	}
1099 	return B_ENTRY_NOT_FOUND;
1100 }
1101 
1102 
1103 #if !_BOOT_MODE
1104 /*!	Prepares the stack to contain all nodes that were passed while
1105 	following the key, from the root node to the leaf node that could
1106 	or should contain that key.
1107 */
1108 status_t
1109 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
1110 	uint16 keyLength)
1111 {
1112 	// set the root node to begin with
1113 	node_and_key nodeAndKey;
1114 	nodeAndKey.nodeOffset = fHeader.RootNode();
1115 
1116 	CachedNode cached(this);
1117 	const bplustree_node* node;
1118 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1119 		// if we are already on leaf level, we're done
1120 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1121 			// node that the keyIndex is not properly set here (but it's not
1122 			// needed in the calling functions anyway)!
1123 			nodeAndKey.keyIndex = 0;
1124 			stack.Push(nodeAndKey);
1125 			return B_OK;
1126 		}
1127 
1128 		off_t nextOffset;
1129 		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
1130 			&nextOffset);
1131 
1132 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
1133 			RETURN_ERROR(B_ERROR);
1134 
1135 		if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
1136 			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
1137 			break;
1138 		}
1139 
1140 		// put the node offset & the correct keyIndex on the stack
1141 		stack.Push(nodeAndKey);
1142 
1143 		nodeAndKey.nodeOffset = nextOffset;
1144 	}
1145 
1146 	FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
1147 		B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
1148 	return B_ERROR;
1149 }
1150 
1151 
1152 /*!	This will find a free duplicate fragment in the given bplustree_node.
1153 	The CachedNode will be set to the writable fragment on success.
1154 */
1155 status_t
1156 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
1157 	const bplustree_node* node, CachedNode& cached,
1158 	off_t* _offset, bplustree_node** _fragment, uint32* _index)
1159 {
1160 	off_t* values = node->Values();
1161 	for (int32 i = 0; i < node->NumKeys(); i++) {
1162 		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1163 
1164 		// does the value link to a duplicate fragment?
1165 		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
1166 			continue;
1167 
1168 		const bplustree_node* fragment = cached.SetTo(
1169 			bplustree_node::FragmentOffset(value), false);
1170 		if (fragment == NULL) {
1171 			FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
1172 				B_PRIdOFF "\n", value, fStream->ID()));
1173 			continue;
1174 		}
1175 
1176 		// see if there is some space left for us
1177 		uint32 num = bplustree_node::MaxFragments(fNodeSize);
1178 		for (uint32 j = 0; j < num; j++) {
1179 			duplicate_array* array = fragment->FragmentAt(j);
1180 
1181 			if (array->IsEmpty()) {
1182 				// found an unused fragment
1183 				*_fragment = cached.MakeWritable(transaction);
1184 				if (*_fragment == NULL)
1185 					return B_IO_ERROR;
1186 
1187 				*_offset = bplustree_node::FragmentOffset(value);
1188 				*_index = j;
1189 				return B_OK;
1190 			}
1191 		}
1192 	}
1193 	return B_ENTRY_NOT_FOUND;
1194 }
1195 
1196 
1197 status_t
1198 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
1199 	const bplustree_node* node, uint16 index, off_t value)
1200 {
1201 	CachedNode cachedDuplicate(this);
1202 	off_t* values = node->Values();
1203 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1204 	status_t status;
1205 	off_t offset;
1206 
1207 	if (bplustree_node::IsDuplicate(oldValue)) {
1208 		// If it's a duplicate fragment, try to insert it into that, or if it
1209 		// doesn't fit anymore, create a new duplicate node
1210 
1211 		if (bplustree_node::LinkType(oldValue)
1212 				== BPLUSTREE_DUPLICATE_FRAGMENT) {
1213 			bplustree_node* duplicate = cachedDuplicate.SetToWritable(
1214 				transaction, bplustree_node::FragmentOffset(oldValue), false);
1215 			if (duplicate == NULL)
1216 				return B_IO_ERROR;
1217 
1218 			duplicate_array* array = duplicate->FragmentAt(
1219 				bplustree_node::FragmentIndex(oldValue));
1220 			int32 arrayCount = array->Count();
1221 			if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
1222 				FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
1223 					"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1224 					(int)bplustree_node::FragmentIndex(oldValue),
1225 					bplustree_node::FragmentOffset(oldValue), arrayCount,
1226 					fStream->ID()));
1227 				return B_BAD_DATA;
1228 			}
1229 
1230 			if (arrayCount < NUM_FRAGMENT_VALUES) {
1231 				array->Insert(value);
1232 			} else {
1233 				// Test if the fragment will be empty if we remove this key's
1234 				// values
1235 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
1236 					// The node will be empty without our values, so let us
1237 					// reuse it as a duplicate node
1238 					offset = bplustree_node::FragmentOffset(oldValue);
1239 
1240 					memmove(duplicate->DuplicateArray(), array,
1241 						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1242 					duplicate->left_link = duplicate->right_link
1243 						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1244 
1245 					array = duplicate->DuplicateArray();
1246 					array->Insert(value);
1247 				} else {
1248 					// Create a new duplicate node
1249 
1250 					cachedDuplicate.UnsetUnchanged(transaction);
1251 						// The old duplicate has not been touched, so we can
1252 						// reuse it
1253 
1254 					bplustree_node* newDuplicate;
1255 					status = cachedDuplicate.Allocate(transaction,
1256 						&newDuplicate, &offset);
1257 					if (status != B_OK)
1258 						RETURN_ERROR(status);
1259 
1260 					// Copy the array from the fragment node to the duplicate
1261 					// node and free the old entry (by zero'ing all values)
1262 					newDuplicate->overflow_link = array->count;
1263 					memcpy(&newDuplicate->all_key_count, &array->values[0],
1264 						array->Count() * sizeof(off_t));
1265 					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1266 
1267 					array = newDuplicate->DuplicateArray();
1268 					array->Insert(value);
1269 				}
1270 
1271 				// Update the main pointer to link to a duplicate node
1272 				if (cached.MakeWritable(transaction) == NULL)
1273 					return B_IO_ERROR;
1274 
1275 				values[index]
1276 					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1277 						BPLUSTREE_DUPLICATE_NODE, offset));
1278 			}
1279 
1280 			return B_OK;
1281 		}
1282 
1283 		// Put the value into a dedicated duplicate node
1284 
1285 		// search for free space in the duplicate nodes of that key
1286 		duplicate_array* array;
1287 		int32 arrayCount;
1288 		const bplustree_node* duplicate;
1289 		off_t duplicateOffset;
1290 		do {
1291 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1292 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1293 			if (duplicate == NULL)
1294 				return B_IO_ERROR;
1295 
1296 			array = duplicate->DuplicateArray();
1297 			arrayCount =array->Count();
1298 			if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1299 				FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
1300 					B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1301 					duplicateOffset, arrayCount, fStream->ID()));
1302 				return B_BAD_DATA;
1303 			}
1304 		} while (arrayCount >= NUM_DUPLICATE_VALUES
1305 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
1306 
1307 		bplustree_node* writableDuplicate
1308 			= cachedDuplicate.MakeWritable(transaction);
1309 		if (writableDuplicate == NULL)
1310 			return B_IO_ERROR;
1311 
1312 		if (arrayCount < NUM_DUPLICATE_VALUES) {
1313 			array = writableDuplicate->DuplicateArray();
1314 			array->Insert(value);
1315 		} else {
1316 			// no space left - add a new duplicate node
1317 
1318 			bplustree_node* newDuplicate;
1319 			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
1320 				&offset);
1321 			if (status != B_OK)
1322 				RETURN_ERROR(status);
1323 
1324 			// link the two nodes together
1325 			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
1326 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
1327 
1328 			array = newDuplicate->DuplicateArray();
1329 			array->count = 0;
1330 			array->Insert(value);
1331 		}
1332 		return B_OK;
1333 	}
1334 
1335 	// Search for a free duplicate fragment or create a new one
1336 	// to insert the duplicate value into
1337 
1338 	uint32 fragmentIndex = 0;
1339 	bplustree_node* fragment;
1340 	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
1341 			&offset, &fragment, &fragmentIndex) != B_OK) {
1342 		// allocate a new duplicate fragment node
1343 		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
1344 		if (status != B_OK)
1345 			RETURN_ERROR(status);
1346 
1347 		memset(fragment, 0, fNodeSize);
1348 	}
1349 
1350 	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
1351 	array->Insert(oldValue);
1352 	array->Insert(value);
1353 
1354 	if (cached.MakeWritable(transaction) == NULL)
1355 		return B_IO_ERROR;
1356 
1357 	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1358 		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
1359 
1360 	return B_OK;
1361 }
1362 
1363 
1364 void
1365 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
1366 	uint16 keyLength, off_t value)
1367 {
1368 	// should never happen, but who knows?
1369 	if (index > node->NumKeys())
1370 		return;
1371 
1372 	off_t* values = node->Values();
1373 	uint16* keyLengths = node->KeyLengths();
1374 	uint8* keys = node->Keys();
1375 
1376 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
1377 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
1378 		+ keyLength);
1379 
1380 	off_t* newValues = node->Values();
1381 	uint16* newKeyLengths = node->KeyLengths();
1382 
1383 	// move values and copy new value into them
1384 	memmove(newValues + index + 1, values + index,
1385 		sizeof(off_t) * (node->NumKeys() - 1 - index));
1386 	memmove(newValues, values, sizeof(off_t) * index);
1387 
1388 	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
1389 
1390 	// move and update key length index
1391 	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
1392 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1393 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
1394 	}
1395 	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
1396 
1397 	int32 keyStart;
1398 	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
1399 		+ (keyStart = index > 0
1400 			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
1401 
1402 	// move keys and copy new key into them
1403 	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
1404 	int32 size = node->AllKeyLength() - length;
1405 	if (size > 0)
1406 		memmove(keys + length, keys + length - keyLength, size);
1407 
1408 	memcpy(keys + keyStart, key, keyLength);
1409 }
1410 
1411 
1412 /*!	Splits the \a node into two halves - the other half will be put into
1413 	\a other. It also takes care to create a new overflow link if the node
1414 	to split is an index node.
1415 */
1416 status_t
1417 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
1418 	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
1419 	uint16* _keyLength, off_t* _value)
1420 {
1421 	if (*_keyIndex > node->NumKeys() + 1)
1422 		return B_BAD_VALUE;
1423 
1424 	uint16* inKeyLengths = node->KeyLengths();
1425 	off_t* inKeyValues = node->Values();
1426 	uint8* inKeys = node->Keys();
1427 	uint8* outKeys = other->Keys();
1428 	int32 keyIndex = *_keyIndex;	// can become less than zero!
1429 
1430 	if (keyIndex > node->NumKeys()) {
1431 		FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
1432 			"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
1433 		return B_BAD_VALUE;
1434 	}
1435 
1436 	// How many keys will fit in one (half) page?
1437 	// The following loop will find the answer to this question and
1438 	// change the key lengths indices for their new home
1439 
1440 	// "bytes" is the number of bytes written for the new key,
1441 	// "bytesBefore" are the bytes before that key
1442 	// "bytesAfter" are the bytes after the new key, if any
1443 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1444 
1445 	size_t size = fNodeSize >> 1;
1446 	int32 out, in;
1447 	size_t keyLengths = 0;
1448 	for (in = out = 0; in < node->NumKeys() + 1;) {
1449 		keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]);
1450 
1451 		if (in == keyIndex && !bytes) {
1452 			bytes = *_keyLength;
1453 			bytesBefore = in > 0
1454 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1455 		} else {
1456 			if (keyIndex < out)
1457 				bytesAfter = keyLengths - bytesBefore;
1458 
1459 			in++;
1460 		}
1461 		out++;
1462 
1463 		if (key_align(sizeof(bplustree_node) + bytes + keyLengths)
1464 				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1465 			// we have found the number of keys in the new node!
1466 			break;
1467 		}
1468 	}
1469 
1470 	// if the new key was not inserted, set the length of the keys
1471 	// that can be copied directly
1472 
1473 	if (keyIndex >= out && in > 0)
1474 		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1475 	else if (keyIndex + 1 < out)
1476 		bytesAfter = keyLengths - bytesBefore;
1477 
1478 	if (bytesBefore < 0 || bytesAfter < 0)
1479 		return B_BAD_DATA;
1480 
1481 	other->left_link = node->left_link;
1482 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1483 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1484 		+ bytesAfter);
1485 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1486 
1487 	uint16* outKeyLengths = other->KeyLengths();
1488 	off_t* outKeyValues = other->Values();
1489 	int32 keys = out > keyIndex ? keyIndex : out;
1490 
1491 	if (bytesBefore) {
1492 		// copy the keys
1493 		memcpy(outKeys, inKeys, bytesBefore);
1494 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1495 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1496 	}
1497 	if (bytes) {
1498 		// copy the newly inserted key
1499 		memcpy(outKeys + bytesBefore, key, bytes);
1500 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1501 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1502 
1503 		if (bytesAfter) {
1504 			// copy the keys after the new key
1505 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1506 				bytesAfter);
1507 			keys = out - keyIndex - 1;
1508 			for (int32 i = 0;i < keys;i++) {
1509 				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1510 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
1511 						+ bytes);
1512 			}
1513 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1514 				keys * sizeof(off_t));
1515 		}
1516 	}
1517 
1518 	// if the new key was already inserted, we shouldn't use it again
1519 	if (in != out)
1520 		keyIndex--;
1521 
1522 	int32 total = bytesBefore + bytesAfter;
1523 
1524 	// these variables are for the key that will be returned
1525 	// to the parent node
1526 	uint8* newKey = NULL;
1527 	uint16 newLength;
1528 	bool newAllocated = false;
1529 
1530 	// If we have split an index node, we have to drop the first key
1531 	// of the next node (which can also be the new key to insert).
1532 	// The dropped key is also the one which has to be inserted in
1533 	// the parent node, so we will set the "newKey" already here.
1534 	if (node->OverflowLink() != BPLUSTREE_NULL) {
1535 		if (in == keyIndex) {
1536 			newKey = key;
1537 			newLength = *_keyLength;
1538 
1539 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1540 			keyIndex--;
1541 		} else {
1542 			// If a key is dropped (is not the new key), we have to copy
1543 			// it, because it would be lost if not.
1544 			uint8* droppedKey = node->KeyAt(in, &newLength);
1545 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1546 					> (uint8*)node + fNodeSize
1547 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1548 				fStream->GetVolume()->Panic();
1549 				RETURN_ERROR(B_BAD_DATA);
1550 			}
1551 			newKey = (uint8*)malloc(newLength);
1552 			if (newKey == NULL)
1553 				return B_NO_MEMORY;
1554 
1555 			newAllocated = true;
1556 			memcpy(newKey, droppedKey, newLength);
1557 
1558 			other->overflow_link = inKeyValues[in];
1559 			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1560 		}
1561 	}
1562 
1563 	// and now the same game for the other page and the rest of the keys
1564 	// (but with memmove() instead of memcpy(), because they may overlap)
1565 
1566 	bytesBefore = bytesAfter = bytes = 0;
1567 	out = 0;
1568 	int32 skip = in;
1569 	while (in < node->NumKeys() + 1) {
1570 		if (in == keyIndex && !bytes) {
1571 			// it's enough to set bytesBefore once here, because we do
1572 			// not need to know the exact length of all keys in this
1573 			// loop
1574 			bytesBefore = in > skip
1575 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1576 			bytes = *_keyLength;
1577 			out++;
1578 		} else {
1579 			if (in < node->NumKeys()) {
1580 				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1581 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1582 
1583 				if (bytes) {
1584 					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1585 						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1586 
1587 					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1588 						- bytesBefore - bytes;
1589 				}
1590 				out++;
1591 			}
1592 			in++;
1593 		}
1594 	}
1595 
1596 	// adjust the byte counts (since we were a bit lazy in the loop)
1597 	if (keyIndex < skip)
1598 		bytesBefore = node->AllKeyLength() - total;
1599 
1600 	if (bytesBefore < 0 || bytesAfter < 0) {
1601 		if (newAllocated)
1602 			free(newKey);
1603 		return B_BAD_DATA;
1604 	}
1605 
1606 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1607 		// right link, and overflow link can stay the same
1608 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1609 		+ bytesAfter);
1610 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1611 
1612 	// array positions have changed
1613 	outKeyLengths = node->KeyLengths();
1614 	outKeyValues = node->Values();
1615 
1616 	// move the keys in the old node: the order is important here,
1617 	// because we don't want to overwrite any contents
1618 
1619 	keys = keyIndex <= skip ? out : keyIndex - skip;
1620 	keyIndex -= skip;
1621 	in = out - keyIndex - 1;
1622 		// Note: keyIndex and in will contain invalid values when the new key
1623 		// went to the other node. But in this case bytes and bytesAfter are
1624 		// 0 and subsequently we never use keyIndex and in.
1625 
1626 	if (bytesBefore)
1627 		memmove(inKeys, inKeys + total, bytesBefore);
1628 	if (bytesAfter) {
1629 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1630 			bytesAfter);
1631 	}
1632 
1633 	if (bytesBefore)
1634 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1635 	if (bytesAfter) {
1636 		// if byteAfter is > 0, keyIndex is larger than skip
1637 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1638 			in * sizeof(uint16));
1639 	}
1640 
1641 	if (bytesBefore)
1642 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1643 	if (bytesAfter) {
1644 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1645 			in * sizeof(off_t));
1646 	}
1647 
1648 	if (bytes) {
1649 		// finally, copy the newly inserted key (don't overwrite anything)
1650 		memcpy(inKeys + bytesBefore, key, bytes);
1651 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1652 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1653 	}
1654 
1655 	// Prepare the key that will be inserted in the parent node which
1656 	// is either the dropped key or the last of the other node.
1657 	// If it's the dropped key, "newKey" was already set earlier.
1658 
1659 	if (newKey == NULL)
1660 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1661 
1662 	memcpy(key, newKey, newLength);
1663 	*_keyLength = newLength;
1664 	*_value = otherOffset;
1665 
1666 	if (newAllocated)
1667 		free(newKey);
1668 
1669 	return B_OK;
1670 }
1671 
1672 
1673 /*!	This inserts a key into the tree. The changes made to the tree will
1674 	all be part of the \a transaction.
1675 	You need to have the inode write locked.
1676 */
1677 status_t
1678 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
1679 	off_t value)
1680 {
1681 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1682 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1683 		RETURN_ERROR(B_BAD_VALUE);
1684 #ifdef DEBUG
1685 	if (value < 0)
1686 		panic("tried to insert invalid value %" B_PRId64 "!\n", value);
1687 #endif
1688 
1689 	ASSERT_WRITE_LOCKED_INODE(fStream);
1690 
1691 	Stack<node_and_key> stack;
1692 	if (_SeekDown(stack, key, keyLength) != B_OK)
1693 		RETURN_ERROR(B_ERROR);
1694 
1695 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH];
1696 	memcpy(keyBuffer, key, keyLength);
1697 
1698 	node_and_key nodeAndKey;
1699 	const bplustree_node* node;
1700 
1701 	CachedNode cached(this);
1702 	while (stack.Pop(&nodeAndKey)
1703 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1704 #ifdef DEBUG
1705 		NodeChecker checker(node, fNodeSize, "insert");
1706 #endif
1707 		if (node->IsLeaf()) {
1708 			// first round, check for duplicate entries
1709 			status_t status = _FindKey(node, key, keyLength,
1710 				&nodeAndKey.keyIndex);
1711 
1712 			// is this a duplicate entry?
1713 			if (status == B_OK) {
1714 				if (fAllowDuplicates) {
1715 					status = _InsertDuplicate(transaction, cached, node,
1716 						nodeAndKey.keyIndex, value);
1717 					if (status != B_OK)
1718 						RETURN_ERROR(status);
1719 					return B_OK;
1720 				}
1721 
1722 				return B_NAME_IN_USE;
1723 			}
1724 		}
1725 
1726 		bplustree_node* writableNode = cached.MakeWritable(transaction);
1727 		if (writableNode == NULL)
1728 			return B_IO_ERROR;
1729 
1730 		// is the node big enough to hold the pair?
1731 		if (int32(key_align(sizeof(bplustree_node)
1732 				+ writableNode->AllKeyLength() + keyLength)
1733 				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
1734 				+ sizeof(off_t))) < fNodeSize) {
1735 			_InsertKey(writableNode, nodeAndKey.keyIndex,
1736 				keyBuffer, keyLength, value);
1737 			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1738 				nodeAndKey.keyIndex, 0, 1);
1739 
1740 			return B_OK;
1741 		} else {
1742 			CachedNode cachedNewRoot(this);
1743 			CachedNode cachedOther(this);
1744 
1745 			// do we need to allocate a new root node? if so, then do
1746 			// it now
1747 			off_t newRoot = BPLUSTREE_NULL;
1748 			if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
1749 				bplustree_node* root;
1750 				status_t status = cachedNewRoot.Allocate(transaction, &root,
1751 					&newRoot);
1752 				if (status != B_OK) {
1753 					// The tree is most likely corrupted!
1754 					// But it's still sane at leaf level - we could set
1755 					// a flag in the header that forces the tree to be
1756 					// rebuild next time...
1757 					// But since we will have journaling, that's not a big
1758 					// problem anyway.
1759 					RETURN_ERROR(status);
1760 				}
1761 			}
1762 
1763 			// reserve space for the other node
1764 			bplustree_node* other;
1765 			off_t otherOffset;
1766 			status_t status = cachedOther.Allocate(transaction, &other,
1767 				&otherOffset);
1768 			if (status != B_OK) {
1769 				cachedNewRoot.Free(transaction, newRoot);
1770 				RETURN_ERROR(status);
1771 			}
1772 
1773 			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1774 					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1775 					&value) != B_OK) {
1776 				// free root node & other node here
1777 				cachedOther.Free(transaction, otherOffset);
1778 				cachedNewRoot.Free(transaction, newRoot);
1779 
1780 				RETURN_ERROR(B_ERROR);
1781 			}
1782 #ifdef DEBUG
1783 			checker.Check("insert split");
1784 			NodeChecker otherChecker(other, fNodeSize, "insert split other");
1785 #endif
1786 
1787 			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1788 				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1789 
1790 			// update the right link of the node in the left of the new node
1791 			if ((other = cachedOther.SetToWritable(transaction,
1792 					other->LeftLink())) != NULL) {
1793 				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1794 			}
1795 
1796 			// create a new root if necessary
1797 			if (newRoot != BPLUSTREE_NULL) {
1798 				bplustree_node* root = cachedNewRoot.Node();
1799 
1800 				_InsertKey(root, 0, keyBuffer, keyLength,
1801 					writableNode->LeftLink());
1802 				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1803 					nodeAndKey.nodeOffset);
1804 
1805 				CachedNode cached(this);
1806 				bplustree_header* header
1807 					= cached.SetToWritableHeader(transaction);
1808 				if (header == NULL)
1809 					return B_IO_ERROR;
1810 
1811 				// finally, update header to point to the new root
1812 				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1813 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1814 					header->MaxNumberOfLevels() + 1);
1815 
1816 				return B_OK;
1817 			}
1818 		}
1819 	}
1820 	RETURN_ERROR(B_ERROR);
1821 }
1822 
1823 
1824 /*!	Removes the duplicate index/value pair from the tree.
1825 	It's part of the private tree interface.
1826 */
1827 status_t
1828 BPlusTree::_RemoveDuplicate(Transaction& transaction,
1829 	const bplustree_node* node, CachedNode& cached, uint16 index,
1830 	off_t value)
1831 {
1832 	off_t* values = node->Values();
1833 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1834 
1835 	CachedNode cachedDuplicate(this);
1836 	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1837 	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
1838 		duplicateOffset, false);
1839 	if (duplicate == NULL)
1840 		return B_IO_ERROR;
1841 
1842 	// if it's a duplicate fragment, remove the entry from there
1843 	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1844 		duplicate_array* array = duplicate->FragmentAt(
1845 			bplustree_node::FragmentIndex(oldValue));
1846 		int32 arrayCount = array->Count();
1847 
1848 		if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
1849 			FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
1850 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1851 				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
1852 				arrayCount, fStream->ID()));
1853 			return B_BAD_DATA;
1854 		}
1855 		if (!array->Remove(value)) {
1856 			FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
1857 				"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
1858 				duplicateOffset, fStream->ID()));
1859 			return B_ENTRY_NOT_FOUND;
1860 		}
1861 
1862 		// remove the array from the fragment node if it is empty
1863 		if (--arrayCount == 1) {
1864 			// set the link to the remaining value
1865 			if (cached.MakeWritable(transaction) == NULL)
1866 				return B_IO_ERROR;
1867 
1868 			values[index] = array->values[0];
1869 
1870 			// Remove the whole fragment node, if this was the only array,
1871 			// otherwise free just the array
1872 			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1873 				status_t status = cachedDuplicate.Free(transaction,
1874 					duplicateOffset);
1875 				if (status != B_OK)
1876 					return status;
1877 			} else
1878 				array->count = 0;
1879 		}
1880 		return B_OK;
1881 	}
1882 
1883 	// Remove value from a duplicate node!
1884 
1885 	duplicate_array* array = NULL;
1886 	int32 arrayCount = 0;
1887 
1888 	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1889 		FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
1890 			", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
1891 		return B_BAD_DATA;
1892 	}
1893 
1894 	// Search the duplicate nodes until the entry could be found (and removed)
1895 	while (duplicate != NULL) {
1896 		array = duplicate->DuplicateArray();
1897 		arrayCount = array->Count();
1898 
1899 		if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1900 			FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
1901 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1902 				duplicateOffset, arrayCount, fStream->ID()));
1903 			return B_BAD_DATA;
1904 		}
1905 
1906 		if (array->Remove(value)) {
1907 			arrayCount--;
1908 			break;
1909 		}
1910 
1911 		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1912 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1913 
1914 		cachedDuplicate.UnsetUnchanged(transaction);
1915 		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
1916 			false);
1917 	}
1918 	if (duplicate == NULL)
1919 		RETURN_ERROR(B_IO_ERROR);
1920 
1921 	// The entry got removed from the duplicate node, but we might want to free
1922 	// it now in case it's empty
1923 
1924 	while (true) {
1925 		off_t left = duplicate->LeftLink();
1926 		off_t right = duplicate->RightLink();
1927 		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1928 
1929 		if ((isLast && arrayCount == 1) || arrayCount == 0) {
1930 			// Free empty duplicate page, link their siblings together, and
1931 			// update the duplicate link if needed (ie. when we either remove
1932 			// the last duplicate node or have a new first one)
1933 
1934 			if (left == BPLUSTREE_NULL) {
1935 				// the duplicate link points to us
1936 				if (cached.MakeWritable(transaction) == NULL)
1937 					return B_IO_ERROR;
1938 
1939 				if (arrayCount == 1) {
1940 					// This is the last node, and there is only one value left;
1941 					// replace the duplicate link with that value, it's no
1942 					// duplicate anymore
1943 					values[index] = array->values[0];
1944 				} else {
1945 					// Move the duplicate link to the next node
1946 					values[index] = HOST_ENDIAN_TO_BFS_INT64(
1947 						bplustree_node::MakeLink(
1948 							BPLUSTREE_DUPLICATE_NODE, right));
1949 				}
1950 			}
1951 
1952 			status_t status = cachedDuplicate.Free(transaction,
1953 				duplicateOffset);
1954 			if (status != B_OK)
1955 				return status;
1956 
1957 			if (left != BPLUSTREE_NULL
1958 				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
1959 						false)) != NULL) {
1960 				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1961 
1962 				// If the next node is the last node, we need to free that node
1963 				// and convert the duplicate entry back into a normal entry
1964 				array = duplicate->DuplicateArray();
1965 				arrayCount = array->Count();
1966 				if (right == BPLUSTREE_NULL
1967 					&& duplicate->LeftLink() == BPLUSTREE_NULL
1968 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1969 					duplicateOffset = left;
1970 					continue;
1971 				}
1972 			}
1973 			if (right != BPLUSTREE_NULL
1974 				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
1975 						right, false)) != NULL) {
1976 				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1977 
1978 				// Again, we may need to turn the duplicate entry back into a
1979 				// normal entry
1980 				array = duplicate->DuplicateArray();
1981 				arrayCount = array->Count();
1982 				if (left == BPLUSTREE_NULL
1983 					&& duplicate->RightLink() == BPLUSTREE_NULL
1984 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1985 					duplicateOffset = right;
1986 					continue;
1987 				}
1988 			}
1989 			return B_OK;
1990 		}
1991 		if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
1992 			// If the number of entries fits in a duplicate fragment, then
1993 			// either find a free fragment node, or convert this node to a
1994 			// fragment node.
1995 			CachedNode cachedOther(this);
1996 
1997 			bplustree_node* fragment = NULL;
1998 			uint32 fragmentIndex = 0;
1999 			off_t offset;
2000 			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
2001 					&offset, &fragment, &fragmentIndex) == B_OK) {
2002 				// move to other node
2003 				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
2004 				memcpy(target, array,
2005 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2006 
2007 				cachedDuplicate.Free(transaction, duplicateOffset);
2008 				duplicateOffset = offset;
2009 			} else {
2010 				// convert node
2011 				memmove(duplicate, array,
2012 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2013 				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
2014 					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2015 			}
2016 
2017 			if (cached.MakeWritable(transaction) == NULL)
2018 				return B_IO_ERROR;
2019 
2020 			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
2021 				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
2022 		}
2023 		return B_OK;
2024 	}
2025 }
2026 
2027 
2028 /*!	Removes the key with the given index from the specified node.
2029 	Since it has to get the key from the node anyway (to obtain it's
2030 	pointer), it's not needed to pass the key & its length, although
2031 	the calling method (BPlusTree::Remove()) have this data.
2032 */
2033 void
2034 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
2035 {
2036 	// should never happen, but who knows?
2037 	if (index > node->NumKeys() && node->NumKeys() > 0) {
2038 		FATAL(("Asked me to remove key outer limits: %u, inode %" B_PRIdOFF
2039 			"\n", index, fStream->ID()));
2040 		return;
2041 	}
2042 
2043 	off_t* values = node->Values();
2044 
2045 	// if we would have to drop the overflow link, drop
2046 	// the last key instead and update the overflow link
2047 	// to the value of that one
2048 	if (!node->IsLeaf() && index == node->NumKeys())
2049 		node->overflow_link = values[--index];
2050 
2051 	uint16 length;
2052 	uint8* key = node->KeyAt(index, &length);
2053 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
2054 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2055 		FATAL(("Key length to long: %s, %u inode %" B_PRIdOFF "\n", key, length,
2056 			fStream->ID()));
2057 		fStream->GetVolume()->Panic();
2058 		return;
2059 	}
2060 
2061 	uint16* keyLengths = node->KeyLengths();
2062 	uint8* keys = node->Keys();
2063 
2064 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
2065 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
2066 		node->AllKeyLength() - length);
2067 
2068 	off_t* newValues = node->Values();
2069 	uint16* newKeyLengths = node->KeyLengths();
2070 
2071 	// move key data
2072 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
2073 
2074 	// move and update key lengths
2075 	if (index > 0 && newKeyLengths != keyLengths)
2076 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
2077 	for (uint16 i = index; i < node->NumKeys(); i++) {
2078 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
2079 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
2080 	}
2081 
2082 	// move values
2083 	if (index > 0)
2084 		memmove(newValues, values, index * sizeof(off_t));
2085 	if (node->NumKeys() > index) {
2086 		memmove(newValues + index, values + index + 1,
2087 			(node->NumKeys() - index) * sizeof(off_t));
2088 	}
2089 }
2090 
2091 
2092 /*!	Removes the specified key from the tree. The "value" parameter is only used
2093 	for trees which allow duplicates, so you may safely ignore it.
2094 	It's not an optional parameter, so at least you have to think about it.
2095 	You need to have the inode write locked.
2096 */
2097 status_t
2098 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
2099 	off_t value)
2100 {
2101 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2102 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2103 		RETURN_ERROR(B_BAD_VALUE);
2104 
2105 	ASSERT_WRITE_LOCKED_INODE(fStream);
2106 
2107 	Stack<node_and_key> stack;
2108 	if (_SeekDown(stack, key, keyLength) != B_OK)
2109 		RETURN_ERROR(B_ERROR);
2110 
2111 	node_and_key nodeAndKey;
2112 	const bplustree_node* node;
2113 
2114 	CachedNode cached(this);
2115 	while (stack.Pop(&nodeAndKey)
2116 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
2117 #ifdef DEBUG
2118 		NodeChecker checker(node, fNodeSize, "remove");
2119 #endif
2120 		if (node->IsLeaf()) {
2121 			// first round, check for duplicate entries
2122 			status_t status = _FindKey(node, key, keyLength,
2123 				&nodeAndKey.keyIndex);
2124 			if (status != B_OK)
2125 				RETURN_ERROR(status);
2126 
2127 			// Is this a duplicate entry?
2128 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
2129 					node->Values()[nodeAndKey.keyIndex]))) {
2130 				if (fAllowDuplicates) {
2131 					return _RemoveDuplicate(transaction, node, cached,
2132 						nodeAndKey.keyIndex, value);
2133 				}
2134 
2135 				FATAL(("dupliate node found where no duplicates are "
2136 					"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
2137 				RETURN_ERROR(B_ERROR);
2138 			} else {
2139 				if (node->Values()[nodeAndKey.keyIndex] != value)
2140 					return B_ENTRY_NOT_FOUND;
2141 
2142 				// If we will remove the last key, the iterator will be set
2143 				// to the next node after the current - if there aren't any
2144 				// more nodes, we need a way to prevent the TreeIterators to
2145 				// touch the old node again, we use BPLUSTREE_FREE for this
2146 				off_t next = node->RightLink() == BPLUSTREE_NULL
2147 					? BPLUSTREE_FREE : node->RightLink();
2148 				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
2149 					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
2150 			}
2151 		}
2152 
2153 		bplustree_node* writableNode = cached.MakeWritable(transaction);
2154 		if (writableNode == NULL)
2155 			return B_IO_ERROR;
2156 
2157 		// if it's an empty root node, we have to convert it
2158 		// to a leaf node by dropping the overflow link, or,
2159 		// if it's already a leaf node, just empty it
2160 		if (nodeAndKey.nodeOffset == fHeader.RootNode()
2161 			&& (node->NumKeys() == 0
2162 				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
2163 			writableNode->overflow_link
2164 				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2165 			writableNode->all_key_count = 0;
2166 			writableNode->all_key_length = 0;
2167 
2168 			// if we've made a leaf node out of the root node, we need
2169 			// to reset the maximum number of levels in the header
2170 			if (fHeader.MaxNumberOfLevels() != 1) {
2171 				CachedNode cached(this);
2172 				bplustree_header* header
2173 					= cached.SetToWritableHeader(transaction);
2174 				if (header == NULL)
2175 					return B_IO_ERROR;
2176 
2177 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
2178 			}
2179 			return B_OK;
2180 		}
2181 
2182 		// if there is only one key left, we don't have to remove
2183 		// it, we can just dump the node (index nodes still have
2184 		// the overflow link, so we have to drop the last key)
2185 		if (writableNode->NumKeys() > 1
2186 			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
2187 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
2188 			return B_OK;
2189 		}
2190 
2191 		// when we are here, we can just free the node, but
2192 		// we have to update the right/left link of the
2193 		// siblings first
2194 		CachedNode otherCached(this);
2195 		bplustree_node* other = otherCached.SetToWritable(transaction,
2196 			writableNode->LeftLink());
2197 		if (other != NULL)
2198 			other->right_link = writableNode->right_link;
2199 
2200 		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
2201 				!= NULL) {
2202 			other->left_link = writableNode->left_link;
2203 		}
2204 
2205 		cached.Free(transaction, nodeAndKey.nodeOffset);
2206 	}
2207 	RETURN_ERROR(B_ERROR);
2208 }
2209 
2210 
2211 /*!	Replaces the value for the key in the tree.
2212 	Returns B_OK if the key could be found and its value replaced,
2213 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
2214 	to indicate that something went terribly wrong.
2215 	Note that this doesn't work with duplicates - it will just
2216 	return B_BAD_TYPE if you call this function on a tree where
2217 	duplicates are allowed.
2218 	You need to have the inode write locked.
2219 */
2220 status_t
2221 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
2222 	off_t value)
2223 {
2224 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2225 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2226 		|| key == NULL)
2227 		RETURN_ERROR(B_BAD_VALUE);
2228 
2229 	if (fAllowDuplicates)
2230 		RETURN_ERROR(B_BAD_TYPE);
2231 
2232 	ASSERT_WRITE_LOCKED_INODE(fStream);
2233 
2234 	off_t nodeOffset = fHeader.RootNode();
2235 	CachedNode cached(this);
2236 	const bplustree_node* node;
2237 
2238 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2239 		uint16 keyIndex = 0;
2240 		off_t nextOffset;
2241 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2242 			&nextOffset);
2243 
2244 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2245 			if (status == B_OK) {
2246 				bplustree_node* writableNode = cached.MakeWritable(transaction);
2247 				if (writableNode != NULL) {
2248 					writableNode->Values()[keyIndex]
2249 						= HOST_ENDIAN_TO_BFS_INT64(value);
2250 				} else
2251 					status = B_IO_ERROR;
2252 			}
2253 
2254 			return status;
2255 		} else if (nextOffset == nodeOffset)
2256 			RETURN_ERROR(B_ERROR);
2257 
2258 		nodeOffset = nextOffset;
2259 	}
2260 	RETURN_ERROR(B_ERROR);
2261 }
2262 #endif // !_BOOT_MODE
2263 
2264 
2265 /*!	Searches the key in the tree, and stores the offset found in _value,
2266 	if successful.
2267 	It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
2268 	while it descends the tree.
2269 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
2270 	It can also return other errors to indicate that something went wrong.
2271 	Note that this doesn't work with duplicates - it will just return
2272 	B_BAD_TYPE if you call this function on a tree where duplicates are
2273 	allowed.
2274 	You need to have the inode read or write locked.
2275 */
2276 status_t
2277 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
2278 {
2279 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2280 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2281 		|| key == NULL)
2282 		RETURN_ERROR(B_BAD_VALUE);
2283 
2284 	if (fAllowDuplicates)
2285 		RETURN_ERROR(B_BAD_TYPE);
2286 
2287 #if !_BOOT_MODE
2288 	ASSERT_READ_LOCKED_INODE(fStream);
2289 #endif
2290 
2291 	off_t nodeOffset = fHeader.RootNode();
2292 	CachedNode cached(this);
2293 	const bplustree_node* node;
2294 
2295 #ifdef DEBUG
2296 	int32 levels = 0;
2297 #endif
2298 
2299 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2300 		uint16 keyIndex = 0;
2301 		off_t nextOffset;
2302 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2303 			&nextOffset);
2304 
2305 #ifdef DEBUG
2306 		levels++;
2307 #endif
2308 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2309 			if (status == B_OK && _value != NULL)
2310 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
2311 
2312 #ifdef DEBUG
2313 			if (levels != (int32)fHeader.MaxNumberOfLevels())
2314 				DEBUGGER(("levels don't match"));
2315 #endif
2316 			return status;
2317 		} else if (nextOffset == nodeOffset)
2318 			RETURN_ERROR(B_ERROR);
2319 
2320 		nodeOffset = nextOffset;
2321 	}
2322 	FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
2323 		B_PRIdOFF "\n", nodeOffset, fStream->ID()));
2324 	RETURN_ERROR(B_ERROR);
2325 }
2326 
2327 
2328 #if !_BOOT_MODE
2329 status_t
2330 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
2331 	const uint8* largestKey, uint16 largestKeyLength,
2332 	const bplustree_node* parent)
2333 {
2334 	if (parent->CheckIntegrity(fNodeSize) != B_OK) {
2335 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
2336 			"failed!\n", fStream->ID(), offset);
2337 		check.FoundError();
2338 		return B_OK;
2339 	}
2340 	if (level >= fHeader.MaxNumberOfLevels()) {
2341 		dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
2342 			"!\n", fStream->ID(), offset);
2343 		check.FoundError();
2344 		return B_OK;
2345 	}
2346 
2347 	check.SetLevel(level);
2348 
2349 	if (check.Visited(offset)) {
2350 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
2351 			fStream->ID(), offset);
2352 		check.FoundError();
2353 		return B_OK;
2354 	}
2355 
2356 	check.SetVisited(offset);
2357 
2358 	uint32 count = parent->NumKeys();
2359 	off_t* values = parent->Values();
2360 	off_t lastOffset = check.PreviousOffset(level);
2361 	CachedNode cached(this);
2362 
2363 	for (uint32 i = 0; i < count; i++) {
2364 		uint16 keyLength;
2365 		uint8* key = parent->KeyAt(i, &keyLength);
2366 		if (largestKey != NULL) {
2367 			int result = _CompareKeys(key, keyLength, largestKey,
2368 				largestKeyLength);
2369 			if (result > 0 || (result == 0 && i != count - 1)) {
2370 				dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
2371 					B_PRIu32 " larger than it should!\n",
2372 					fStream->ID(), offset, i);
2373 				check.FoundError();
2374 			}
2375 		}
2376 
2377 		off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
2378 		if (bplustree_node::IsDuplicate(childOffset)) {
2379 			// Walk the duplicate nodes
2380 			off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
2381 			off_t lastDuplicateOffset = BPLUSTREE_NULL;
2382 
2383 			while (duplicateOffset != BPLUSTREE_NULL) {
2384 				const bplustree_node* node;
2385 				status_t status = cached.SetTo(duplicateOffset, &node, false);
2386 				if (status != B_OK) {
2387 					if (status == B_IO_ERROR)
2388 						return B_IO_ERROR;
2389 
2390 					dprintf("inode %" B_PRIdOFF ": duplicate node at %"
2391 						B_PRIdOFF " could not be read: %s\n", fStream->ID(),
2392 						duplicateOffset, strerror(status));
2393 					check.FoundError();
2394 					break;
2395 				}
2396 
2397 				bool isFragmentNode = bplustree_node::LinkType(childOffset)
2398 					== BPLUSTREE_DUPLICATE_FRAGMENT;
2399 				bool isKnownFragment = isFragmentNode
2400 					&& check.VisitedFragment(duplicateOffset);
2401 
2402 				if (!isKnownFragment && check.Visited(duplicateOffset)) {
2403 					dprintf("inode %" B_PRIdOFF ": %s node at %"
2404 						B_PRIdOFF " already visited, referenced from %"
2405 						B_PRIdOFF "!\n", fStream->ID(),
2406 						isFragmentNode ? "fragment" : "duplicate",
2407 						duplicateOffset, offset);
2408 					check.FoundError();
2409 					break;
2410 				}
2411 
2412 				// Fragment nodes may be visited more than once from different
2413 				// places
2414 				if (!check.Visited(duplicateOffset))
2415 					check.SetVisited(duplicateOffset);
2416 				if (!isKnownFragment && isFragmentNode)
2417 					check.SetVisitedFragment(duplicateOffset);
2418 
2419 				duplicate_array* array;
2420 				int32 minSize;
2421 				int32 maxSize;
2422 				if (isFragmentNode) {
2423 					array = node->FragmentAt(
2424 						bplustree_node::FragmentIndex(childOffset));
2425 					minSize = 2;
2426 					maxSize = NUM_FRAGMENT_VALUES;
2427 				} else {
2428 					array = node->DuplicateArray();
2429 					minSize = 1;
2430 					maxSize = NUM_DUPLICATE_VALUES;
2431 				}
2432 				int32 arrayCount = array->Count();
2433 
2434 				if (arrayCount < minSize || arrayCount > maxSize) {
2435 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2436 						" has invalid array size %" B_PRId32 "!\n",
2437 						fStream->ID(), duplicateOffset, arrayCount);
2438 					check.FoundError();
2439 				} else {
2440 					// Simple check if the values in the array may be valid
2441 					for (int32 j = 0; j < arrayCount; j++) {
2442 						if (!fStream->GetVolume()->IsValidInodeBlock(
2443 								array->ValueAt(j))) {
2444 							dprintf("inode %" B_PRIdOFF ": duplicate at %"
2445 								B_PRIdOFF " contains invalid block %" B_PRIdOFF
2446 								" at %" B_PRId32 "!\n", fStream->ID(),
2447 								duplicateOffset, array->ValueAt(j), j);
2448 							check.FoundError();
2449 							break;
2450 						}
2451 					}
2452 				}
2453 
2454 				// A fragment node is not linked (and does not have valid links)
2455 				if (isFragmentNode)
2456 					break;
2457 
2458 				if (node->LeftLink() != lastDuplicateOffset) {
2459 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2460 						" has wrong left link %" B_PRIdOFF ", expected %"
2461 						B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
2462 						node->LeftLink(), lastDuplicateOffset);
2463 					check.FoundError();
2464 				}
2465 
2466 				lastDuplicateOffset = duplicateOffset;
2467 				duplicateOffset = node->RightLink();
2468 			}
2469 		} else if (!parent->IsLeaf()) {
2470 			// Test a regular child node recursively
2471 			off_t nextOffset = parent->OverflowLink();
2472 			if (i < count - 1)
2473 				nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
2474 
2475 			if (i == 0 && lastOffset != BPLUSTREE_NULL) {
2476 				// Test right link of the previous node
2477 				const bplustree_node* previous = cached.SetTo(lastOffset, true);
2478 				if (previous == NULL)
2479 					return B_IO_ERROR;
2480 
2481 				if (previous->RightLink() != childOffset) {
2482 					dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2483 						"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2484 						"!\n", fStream->ID(), lastOffset, previous->RightLink(),
2485 						childOffset);
2486 					check.FoundError();
2487 				}
2488 			}
2489 
2490 			status_t status = _ValidateChild(check, cached, level, childOffset,
2491 				lastOffset, nextOffset, key, keyLength);
2492 			if (status != B_OK)
2493 				return status;
2494 		} else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
2495 			dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
2496 				"invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
2497 				fStream->ID(), offset, childOffset, i);
2498 			check.FoundError();
2499 		}
2500 
2501 		lastOffset = childOffset;
2502 	}
2503 
2504 	if (parent->OverflowLink() != BPLUSTREE_NULL) {
2505 		off_t childOffset = parent->OverflowLink();
2506 		status_t status = _ValidateChild(check, cached, level, childOffset,
2507 			lastOffset, 0, NULL, 0);
2508 		if (status != B_OK)
2509 			return status;
2510 
2511 		lastOffset = childOffset;
2512 	}
2513 
2514 	check.SetPreviousOffset(level, lastOffset);
2515 	return B_OK;
2516 }
2517 
2518 
2519 status_t
2520 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
2521 	off_t offset, off_t lastOffset, off_t nextOffset,
2522 	const uint8* key, uint16 keyLength)
2523 {
2524 	const bplustree_node* node;
2525 	status_t status = cached.SetTo(offset, &node, true);
2526 	if (status != B_OK) {
2527 		if (status == B_IO_ERROR)
2528 			return B_IO_ERROR;
2529 
2530 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
2531 			"read: %s\n", fStream->ID(), offset, strerror(status));
2532 		check.FoundError();
2533 		return B_OK;
2534 	}
2535 
2536 	if (node->LeftLink() != lastOffset) {
2537 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2538 			"wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
2539 			"!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
2540 		check.FoundError();
2541 	}
2542 
2543 	if (nextOffset != 0 && node->RightLink() != nextOffset) {
2544 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2545 			"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2546 			"!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
2547 		check.FoundError();
2548 	}
2549 
2550 	return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
2551 }
2552 #endif // !_BOOT_MODE
2553 
2554 
2555 //	#pragma mark -
2556 
2557 
2558 TreeIterator::TreeIterator(BPlusTree* tree)
2559 	:
2560 	fTree(tree),
2561 	fCurrentNodeOffset(BPLUSTREE_NULL)
2562 {
2563 #if !_BOOT_MODE
2564 	tree->_AddIterator(this);
2565 #endif
2566 }
2567 
2568 
2569 TreeIterator::~TreeIterator()
2570 {
2571 #if !_BOOT_MODE
2572 	if (fTree)
2573 		fTree->_RemoveIterator(this);
2574 #endif
2575 }
2576 
2577 
2578 status_t
2579 TreeIterator::Goto(int8 to)
2580 {
2581 	if (fTree == NULL || fTree->fStream == NULL)
2582 		RETURN_ERROR(B_BAD_VALUE);
2583 
2584 #if !_BOOT_MODE
2585 	// lock access to stream
2586 	InodeReadLocker locker(fTree->fStream);
2587 #endif
2588 
2589 	off_t nodeOffset = fTree->fHeader.RootNode();
2590 	CachedNode cached(fTree);
2591 	const bplustree_node* node;
2592 
2593 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2594 		// is the node a leaf node?
2595 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2596 			fCurrentNodeOffset = nodeOffset;
2597 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
2598 			fDuplicateNode = BPLUSTREE_NULL;
2599 
2600 			return B_OK;
2601 		}
2602 
2603 		// get the next node offset depending on the direction (and if there
2604 		// are any keys in that node at all)
2605 		off_t nextOffset;
2606 		if (to == BPLUSTREE_END || node->all_key_count == 0)
2607 			nextOffset = node->OverflowLink();
2608 		else {
2609 			if (node->AllKeyLength() > fTree->fNodeSize
2610 				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
2611 					- 8 * node->NumKeys())
2612 				RETURN_ERROR(B_ERROR);
2613 
2614 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
2615 		}
2616 		if (nextOffset == nodeOffset)
2617 			break;
2618 
2619 		nodeOffset = nextOffset;
2620 	}
2621 	FATAL(("%s fails\n", __FUNCTION__));
2622 
2623 	RETURN_ERROR(B_ERROR);
2624 }
2625 
2626 
2627 /*!	Iterates through the tree in the specified direction.
2628 	When it iterates through duplicates, the "key" is only updated for the
2629 	first entry - if you need to know when this happens, use the "duplicate"
2630 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
2631 	the other duplicates.
2632 	That's not too nice, but saves the 256 bytes that would be needed to
2633 	store the last key - if this will ever become an issue, it will be
2634 	easy to change.
2635 	The other advantage of this is, that the queries can skip all duplicates
2636 	at once when they are not relevant to them.
2637 */
2638 status_t
2639 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
2640 	uint16 maxLength, off_t* value, uint16* duplicate)
2641 {
2642 	if (fTree == NULL)
2643 		return B_INTERRUPTED;
2644 
2645 	bool forward = direction == BPLUSTREE_FORWARD;
2646 
2647 	if (fCurrentNodeOffset == BPLUSTREE_NULL
2648 		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
2649 		RETURN_ERROR(B_ERROR);
2650 
2651 	// if the tree was emptied since the last call
2652 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
2653 		return B_ENTRY_NOT_FOUND;
2654 
2655 #if !_BOOT_MODE
2656 	// lock access to stream
2657 	InodeReadLocker locker(fTree->fStream);
2658 #endif
2659 
2660 	CachedNode cached(fTree);
2661 	const bplustree_node* node;
2662 
2663 	if (fDuplicateNode != BPLUSTREE_NULL) {
2664 		// regardless of traverse direction the duplicates are always presented
2665 		// in the same order; since they are all considered as equal, this
2666 		// shouldn't cause any problems
2667 
2668 		if (!fIsFragment || fDuplicate < fNumDuplicates) {
2669 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2670 				false);
2671 		} else
2672 			node = NULL;
2673 
2674 		if (node != NULL) {
2675 			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
2676 				// If the node is out of duplicates, we go directly to the next
2677 				// one
2678 				fDuplicateNode = node->RightLink();
2679 				if (fDuplicateNode != BPLUSTREE_NULL
2680 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
2681 					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
2682 						false);
2683 					fDuplicate = 0;
2684 				}
2685 			}
2686 			if (fDuplicate < fNumDuplicates) {
2687 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
2688 					fDuplicate++);
2689 				if (duplicate)
2690 					*duplicate = 2;
2691 				return B_OK;
2692 			}
2693 		}
2694 		fDuplicateNode = BPLUSTREE_NULL;
2695 	}
2696 
2697 	off_t savedNodeOffset = fCurrentNodeOffset;
2698 	int32 savedKey = fCurrentKey;
2699 
2700 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
2701 		RETURN_ERROR(B_ERROR);
2702 
2703 	if (duplicate)
2704 		*duplicate = 0;
2705 
2706 	fCurrentKey += direction;
2707 
2708 	// is the current key in the current node?
2709 	while ((forward && fCurrentKey >= node->NumKeys())
2710 			|| (!forward && fCurrentKey < 0)) {
2711 		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2712 
2713 		// are there any more nodes?
2714 		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2715 			node = cached.SetTo(fCurrentNodeOffset);
2716 			if (!node)
2717 				RETURN_ERROR(B_ERROR);
2718 
2719 			// reset current key
2720 			fCurrentKey = forward ? 0 : node->NumKeys() - 1;
2721 		} else {
2722 			// there are no nodes left, so turn back to the last key
2723 			fCurrentNodeOffset = savedNodeOffset;
2724 			fCurrentKey = savedKey;
2725 
2726 			return B_ENTRY_NOT_FOUND;
2727 		}
2728 	}
2729 
2730 	if (node->all_key_count == 0)
2731 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
2732 
2733 	uint16 length = 0;
2734 	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
2735 	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
2736 			> (uint8*)node + fTree->fNodeSize
2737 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2738 #if !_BOOT_MODE
2739 		fTree->fStream->GetVolume()->Panic();
2740 #endif
2741 		RETURN_ERROR(B_BAD_DATA);
2742 	}
2743 
2744 	// include the termination for string types
2745 	bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE;
2746 	if (length + (needsTermination ? 1 : 0) > maxLength) {
2747 		if (!needsTermination || maxLength < INODE_FILE_NAME_LENGTH) {
2748 			// The buffer is too small, restore the last key and return
2749 			// an error
2750 			fCurrentNodeOffset = savedNodeOffset;
2751 			fCurrentKey = savedKey;
2752 			return B_BUFFER_OVERFLOW;
2753 		}
2754 
2755 		// Always cut off strings at the maximum buffer size, and leave
2756 		// room for a terminating null byte.
2757 		// This allows to handle larger key sizes gracefully.
2758 		length = maxLength - 1;
2759 	}
2760 
2761 	memcpy(key, keyStart, length);
2762 
2763 	if (needsTermination)
2764 		((char*)key)[length] = '\0';
2765 
2766 	*keyLength = length;
2767 
2768 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2769 
2770 	// duplicate fragments?
2771 	uint8 type = bplustree_node::LinkType(offset);
2772 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
2773 		|| type == BPLUSTREE_DUPLICATE_NODE) {
2774 		fDuplicateNode = offset;
2775 
2776 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2777 			false);
2778 		if (node == NULL)
2779 			RETURN_ERROR(B_ERROR);
2780 
2781 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2782 
2783 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2784 		if (fNumDuplicates) {
2785 			offset = node->DuplicateAt(offset, fIsFragment, 0);
2786 			fDuplicate = 1;
2787 			if (duplicate)
2788 				*duplicate = 1;
2789 		} else {
2790 			// Shouldn't happen, but we're dealing here with potentially
2791 			// corrupt disks...
2792 			fDuplicateNode = BPLUSTREE_NULL;
2793 			offset = 0;
2794 		}
2795 	}
2796 	*value = offset;
2797 
2798 	return B_OK;
2799 }
2800 
2801 
2802 /*!	This is more or less a copy of BPlusTree::Find() - but it just
2803 	sets the current position in the iterator, regardless of if the
2804 	key could be found or not.
2805 */
2806 status_t
2807 TreeIterator::Find(const uint8* key, uint16 keyLength)
2808 {
2809 	if (fTree == NULL)
2810 		return B_INTERRUPTED;
2811 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2812 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2813 		|| key == NULL)
2814 		RETURN_ERROR(B_BAD_VALUE);
2815 
2816 #if !_BOOT_MODE
2817 	// lock access to stream
2818 	InodeReadLocker locker(fTree->fStream);
2819 #endif
2820 
2821 	off_t nodeOffset = fTree->fHeader.RootNode();
2822 
2823 	CachedNode cached(fTree);
2824 	const bplustree_node* node;
2825 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2826 		uint16 keyIndex = 0;
2827 		off_t nextOffset;
2828 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2829 			&nextOffset);
2830 
2831 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2832 			fCurrentNodeOffset = nodeOffset;
2833 			fCurrentKey = keyIndex - 1;
2834 			fDuplicateNode = BPLUSTREE_NULL;
2835 
2836 			return status;
2837 		} else if (nextOffset == nodeOffset)
2838 			RETURN_ERROR(B_ERROR);
2839 
2840 		nodeOffset = nextOffset;
2841 	}
2842 	RETURN_ERROR(B_ERROR);
2843 }
2844 
2845 
2846 void
2847 TreeIterator::SkipDuplicates()
2848 {
2849 	fDuplicateNode = BPLUSTREE_NULL;
2850 }
2851 
2852 
2853 void
2854 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
2855 	uint16 splitAt, int8 change)
2856 {
2857 	if (offset != fCurrentNodeOffset)
2858 		return;
2859 
2860 	if (nextOffset != BPLUSTREE_NULL) {
2861 		fCurrentNodeOffset = nextOffset;
2862 		if (splitAt <= fCurrentKey) {
2863 			fCurrentKey -= splitAt;
2864 			keyIndex -= splitAt;
2865 		}
2866 	}
2867 
2868 	// Adjust fCurrentKey to point to the same key as before.
2869 	// Note, that if a key is inserted at the current position
2870 	// it won't be included in this tree transition.
2871 	if (keyIndex <= fCurrentKey)
2872 		fCurrentKey += change;
2873 
2874 	// TODO: duplicate handling!
2875 }
2876 
2877 
2878 void
2879 TreeIterator::Stop()
2880 {
2881 	fTree = NULL;
2882 }
2883 
2884 
2885 #ifdef DEBUG
2886 void
2887 TreeIterator::Dump()
2888 {
2889 	__out("TreeIterator at %p:\n", this);
2890 	__out("\tfTree = %p\n", fTree);
2891 	__out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset);
2892 	__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
2893 	__out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n",
2894 		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
2895 		fDuplicateNode);
2896 	__out("\tfDuplicate = %u\n", fDuplicate);
2897 	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
2898 	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
2899 }
2900 #endif
2901 
2902 
2903 // #pragma mark -
2904 
2905 
2906 bool
2907 bplustree_header::IsValid() const
2908 {
2909 	return Magic() == BPLUSTREE_MAGIC
2910 		&& (RootNode() % NodeSize()) == 0
2911 		&& IsValidLink(RootNode())
2912 		&& IsValidLink(FreeNode());
2913 }
2914 
2915 
2916 // #pragma mark -
2917 
2918 
2919 void
2920 bplustree_node::Initialize()
2921 {
2922 	left_link = right_link = overflow_link
2923 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2924 	all_key_count = 0;
2925 	all_key_length = 0;
2926 }
2927 
2928 
2929 uint8*
2930 bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2931 {
2932 	if (index < 0 || index > NumKeys())
2933 		return NULL;
2934 
2935 	uint8* keyStart = Keys();
2936 	uint16* keyLengths = KeyLengths();
2937 
2938 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2939 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2940 	if (index > 0)
2941 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2942 
2943 	return keyStart;
2944 }
2945 
2946 
2947 uint8
2948 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2949 {
2950 	// the duplicate fragment handling is currently hard-coded to a node size
2951 	// of 1024 bytes - with future versions of BFS, this may be a problem
2952 
2953 	if (isFragment) {
2954 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2955 
2956 		return ((off_t*)this)[fragment];
2957 	}
2958 	return OverflowLink();
2959 }
2960 
2961 
2962 off_t
2963 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2964 {
2965 	uint32 start;
2966 	if (isFragment)
2967 		start = 8 * ((uint64)offset & 0x3ff);
2968 	else
2969 		start = 2;
2970 
2971 	return ((off_t*)this)[start + 1 + index];
2972 }
2973 
2974 
2975 /*!	Although the name suggests it, this function doesn't return the real
2976 	used fragment count; at least, it can only count to two: it returns
2977 	0, if there is no fragment used, 1 if there is only one fragment
2978 	used, and 2 if there are at least 2 fragments used.
2979 */
2980 uint32
2981 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2982 {
2983 	uint32 used = 0;
2984 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2985 		duplicate_array* array = FragmentAt(i);
2986 		if (array->Count() > 0 && ++used > 1)
2987 			return used;
2988 	}
2989 	return used;
2990 }
2991 
2992 
2993 status_t
2994 bplustree_node::CheckIntegrity(uint32 nodeSize) const
2995 {
2996 	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2997 		DEBUGGER(("invalid node: key/length count"));
2998 
2999 	for (int32 i = 0; i < NumKeys(); i++) {
3000 		uint16 length = 0;
3001 		uint8* key = KeyAt(i, &length);
3002 		if (key + length + sizeof(off_t) + sizeof(uint16)
3003 				> (uint8*)this + nodeSize
3004 			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
3005 			dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
3006 			return B_BAD_DATA;
3007 		}
3008 		if (Values()[i] == -1) {
3009 			dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values "
3010 				"corrupted\n", this, (int)i, Values()[i]);
3011 			return B_BAD_DATA;
3012 		}
3013 	}
3014 	return B_OK;
3015 }
3016 
3017 
3018 // #pragma mark -
3019 
3020 
3021 #if !_BOOT_MODE
3022 BitmapArray::BitmapArray(size_t numBits)
3023 {
3024 	fSize = (numBits + 7) / 8;
3025 	fBitmap = (uint8*)calloc(fSize, 1);
3026 	fCountSet = 0;
3027 }
3028 
3029 
3030 BitmapArray::~BitmapArray()
3031 {
3032 	free(fBitmap);
3033 }
3034 
3035 
3036 status_t
3037 BitmapArray::InitCheck() const
3038 {
3039 	return fBitmap != NULL ? B_OK : B_NO_MEMORY;
3040 }
3041 
3042 
3043 bool
3044 BitmapArray::IsSet(size_t index) const
3045 {
3046 	uint32 byteIndex = index / 8;
3047 	if (byteIndex >= fSize)
3048 		return false;
3049 
3050 	return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
3051 }
3052 
3053 
3054 void
3055 BitmapArray::Set(size_t index, bool set)
3056 {
3057 	uint32 byteIndex = index / 8;
3058 	if (byteIndex >= fSize)
3059 		return;
3060 
3061 	if (set) {
3062 		fBitmap[byteIndex] |= 1UL << (index & 0x7);
3063 		fCountSet++;
3064 	} else {
3065 		fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
3066 		fCountSet--;
3067 	}
3068 }
3069 #endif // !_BOOT_MODE
3070 
3071 
3072 // #pragma mark -
3073 
3074 
3075 bool
3076 duplicate_array::_FindInternal(off_t value, int32& index) const
3077 {
3078 	int32 min = 0, max = Count() - 1;
3079 	off_t cmp;
3080 	while (min <= max) {
3081 		index = (min + max) / 2;
3082 
3083 		cmp = ValueAt(index) - value;
3084 		if (cmp < 0)
3085 			min = index + 1;
3086 		else if (cmp > 0)
3087 			max = index - 1;
3088 		else
3089 			return true;
3090 	}
3091 	return false;
3092 }
3093 
3094 
3095 void
3096 duplicate_array::Insert(off_t value)
3097 {
3098 	// if there are more than 8 values in this array, use a
3099 	// binary search, if not, just iterate linearly to find
3100 	// the insertion point
3101 	int32 size = Count();
3102 	int32 i = 0;
3103 	if (size > 8 ) {
3104 		if (!_FindInternal(value, i) && ValueAt(i) <= value)
3105 			i++;
3106 	} else {
3107 		for (i = 0; i < size; i++) {
3108 			if (ValueAt(i) > value)
3109 				break;
3110 		}
3111 	}
3112 
3113 	memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
3114 	values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
3115 	count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
3116 }
3117 
3118 
3119 bool
3120 duplicate_array::Remove(off_t value)
3121 {
3122 	int32 index = Find(value);
3123 	if (index == -1)
3124 		return false;
3125 
3126 	int32 newSize = Count() - 1;
3127 	memmove(&values[index], &values[index + 1],
3128 		(newSize - index) * sizeof(off_t));
3129 	count = HOST_ENDIAN_TO_BFS_INT64(newSize);
3130 
3131 	return true;
3132 }
3133 
3134 
3135 #if _BOOT_MODE
3136 } // namespace BFS
3137 #endif
3138