xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 4c07199d8201fcf267e90be0d24b76799d03cea6)
1 /*
2  * Copyright 2001-2017, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 
10 //! B+Tree implementation
11 
12 
13 #include "BPlusTree.h"
14 
15 #include <file_systems/QueryParserUtils.h>
16 
17 #include "Debug.h"
18 #include "Utility.h"
19 
20 #if !_BOOT_MODE
21 #	include "Inode.h"
22 #else
23 #	include "Stream.h"
24 
25 // BFS::Stream from the bootloader has the same API as Inode.
26 #	define Inode BFS::Stream
27 
28 #	define strerror(x)		"error message unavailable"
29 
30 namespace BFS {
31 #endif
32 
33 
34 /*!	Simple array used for the duplicate handling in the B+Tree. This is an
35 	on-disk structure.
36 */
37 struct duplicate_array {
38 	off_t	count;
39 	off_t	values[0];
40 
41 	inline bool IsEmpty() const
42 	{
43 		return count == 0;
44 	}
45 
46 	inline int32 Count() const
47 	{
48 		return (int32)BFS_ENDIAN_TO_HOST_INT64(count);
49 	}
50 
51 	inline off_t ValueAt(uint32 index) const
52 	{
53 		return BFS_ENDIAN_TO_HOST_INT64(values[index]);
54 	}
55 
56 	inline void SetValueAt(uint32 index, off_t value)
57 	{
58 		values[index] = HOST_ENDIAN_TO_BFS_INT64(value);
59 	}
60 
61 	inline int32 Find(off_t value) const
62 	{
63 		int32 i;
64 		return _FindInternal(value, i) ? i : -1;
65 	}
66 
67 	void Insert(off_t value);
68 	bool Remove(off_t value);
69 
70 private:
71 	bool _FindInternal(off_t value, int32& index) const;
72 } _PACKED;
73 
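/*	A minimal sketch (not code from this file) of how such an array is
	typically consumed - counts and values are stored in BFS endianness,
	hence the accessors above; "node" and "ProcessDuplicate()" are
	hypothetical placeholders:

		duplicate_array* array = node->DuplicateArray();
		for (int32 i = 0; i < array->Count(); i++)
			ProcessDuplicate(array->ValueAt(i));

	values[] is a flexible array member, so the real capacity is bounded by
	the enclosing duplicate node or fragment; Find() returns the index of a
	value, or -1 if it is not present.
*/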
74 
75 #ifdef DEBUG
76 class NodeChecker {
77 public:
78 	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
79 		:
80 		fNode(node),
81 		fSize(nodeSize),
82 		fText(text)
83 	{
84 		Check("integrity check failed on construction.");
85 	}
86 
87 	~NodeChecker()
88 	{
89 		Check("integrity check failed on destruction.");
90 	}
91 
92 	void
93 	Check(const char* message)
94 	{
95 		if (fNode->CheckIntegrity(fSize) != B_OK) {
96 			dprintf("%s: %s\n", fText, message);
97 			DEBUGGER(("NodeChecker integrity check failed!"));
98 		}
99 	}
100 
101 private:
102 	const bplustree_node*	fNode;
103 	int32					fSize;
104 	const char*				fText;
105 };
106 #endif // DEBUG
107 
108 
109 #if !_BOOT_MODE
110 class BitmapArray {
111 public:
112 								BitmapArray(size_t numBits);
113 								~BitmapArray();
114 
115 			status_t			InitCheck() const;
116 
117 			bool				IsSet(size_t index) const;
118 			void				Set(size_t index, bool set);
119 
120 			size_t				CountSet() const { return fCountSet; }
121 
122 private:
123 			uint8*				fBitmap;
124 			size_t				fSize;
125 			size_t				fCountSet;
126 };
127 
128 
129 struct TreeCheck {
130 	TreeCheck(BPlusTree* tree)
131 		:
132 		fLevelCount(0),
133 		fFreeCount(0),
134 		fNodeSize(tree->NodeSize()),
135 		fMaxLevels(tree->fHeader.MaxNumberOfLevels()),
136 		fFoundErrors(0),
137 		fVisited(tree->Stream()->Size() / tree->NodeSize()),
138 		fVisitedFragment(tree->Stream()->Size() / tree->NodeSize())
139 	{
140 		fPreviousOffsets = (off_t*)malloc(
141 			sizeof(off_t) * tree->fHeader.MaxNumberOfLevels());
142 		if (fPreviousOffsets != NULL) {
143 			for (size_t i = 0; i < fMaxLevels; i++)
144 				fPreviousOffsets[i] = BPLUSTREE_NULL;
145 		}
146 	}
147 
148 	~TreeCheck()
149 	{
150 		free(fPreviousOffsets);
151 	}
152 
153 	status_t InitCheck() const
154 	{
155 		if (fPreviousOffsets == NULL)
156 			return B_NO_MEMORY;
157 
158 		status_t status = fVisited.InitCheck();
159 		if (status != B_OK)
160 			return status;
161 
162 		return fVisitedFragment.InitCheck();
163 	}
164 
165 	bool Visited(off_t offset) const
166 	{
167 		return fVisited.IsSet(offset / fNodeSize);
168 	}
169 
170 	void SetVisited(off_t offset)
171 	{
172 		fVisited.Set(offset / fNodeSize, true);
173 	}
174 
175 	size_t VisitedCount() const
176 	{
177 		return fVisited.CountSet();
178 	}
179 
180 	bool VisitedFragment(off_t offset) const
181 	{
182 		return fVisitedFragment.IsSet(offset / fNodeSize);
183 	}
184 
185 	void SetVisitedFragment(off_t offset)
186 	{
187 		fVisitedFragment.Set(offset / fNodeSize, true);
188 	}
189 
190 	uint32 MaxLevels() const
191 	{
192 		return fLevelCount;
193 	}
194 
195 	void SetLevel(uint32 level)
196 	{
197 		if (fLevelCount < level)
198 			fLevelCount = level;
199 	}
200 
201 	off_t PreviousOffset(uint32 level)
202 	{
203 		return fPreviousOffsets[level];
204 	}
205 
206 	void SetPreviousOffset(uint32 level, off_t offset)
207 	{
208 		fPreviousOffsets[level] = offset;
209 	}
210 
211 	void FoundError()
212 	{
213 		fFoundErrors++;
214 	}
215 
216 	bool ErrorsFound()
217 	{
218 		return fFoundErrors != 0;
219 	}
220 
221 private:
222 			uint32				fLevelCount;
223 			uint32				fFreeCount;
224 			uint32				fNodeSize;
225 			uint32				fMaxLevels;
226 			uint32				fFoundErrors;
227 			BitmapArray			fVisited;
228 			BitmapArray			fVisitedFragment;
229 			off_t*				fPreviousOffsets;
230 };
231 
232 
233 // #pragma mark -
234 
235 
236 // Node Caching for the BPlusTree class
237 //
238 // With write support, we need a way to allocate new nodes, either by
239 // recycling free nodes or by growing the file's data stream.
240 //
241 // !! The CachedNode class assumes that you have properly locked the stream
242 // !! before asking for nodes.
243 //
244 // Note: This code will fail if the block size is smaller than the node size!
245 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
246 // is hard-coded to 1024 bytes, that's not an issue now.
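//
// A minimal usage sketch ("tree" and "offset" are hypothetical placeholders;
// error handling omitted):
//
//		CachedNode cached(tree);
//		const bplustree_node* node = cached.SetTo(offset, true);
//		if (node != NULL) {
//			// read-only access to the node
//		}
//
// For modifications, SetToWritable()/MakeWritable() are used with a running
// Transaction instead; Unset() releases the cached block again.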
247 
248 void
249 CachedNode::UnsetUnchanged(Transaction& transaction)
250 {
251 	if (fTree == NULL || fTree->fStream == NULL)
252 		return;
253 
254 	if (fNode != NULL) {
255 		void* cache = fTree->fStream->GetVolume()->BlockCache();
256 
257 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
258 		block_cache_put(cache, fBlockNumber);
259 		fNode = NULL;
260 	}
261 }
262 #endif // !_BOOT_MODE
263 
264 
265 void
266 CachedNode::Unset()
267 {
268 	if (fTree == NULL || fTree->fStream == NULL)
269 		return;
270 
271 	if (fNode != NULL) {
272 #if !_BOOT_MODE
273 		if (fWritable && fOffset == 0) {
274 			// The B+tree header has been updated - we need to update the
275 			// BPlusTree's copy of it, as well.
276 			memcpy(&fTree->fHeader, fNode, sizeof(bplustree_header));
277 		}
278 
279 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
280 			fBlockNumber);
281 #endif // !_BOOT_MODE
282 
283 		fNode = NULL;
284 	}
285 }
286 
287 
288 const bplustree_node*
289 CachedNode::SetTo(off_t offset, bool check)
290 {
291 	const bplustree_node* node;
292 	if (SetTo(offset, &node, check) == B_OK)
293 		return node;
294 
295 	return NULL;
296 }
297 
298 
299 status_t
300 CachedNode::SetTo(off_t offset, const bplustree_node** _node, bool check)
301 {
302 	if (fTree == NULL || fTree->fStream == NULL)
303 		RETURN_ERROR(B_BAD_VALUE);
304 
305 	Unset();
306 
307 	// You can only ask for nodes at valid positions - you can't
308 	// even access the b+tree header with this method (use SetToHeader()
309 	// instead)
310 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
311 		|| offset <= 0
312 		|| (offset % fTree->fNodeSize) != 0) {
313 		RETURN_ERROR(B_BAD_VALUE);
314 	}
315 
316 	if (InternalSetTo(NULL, offset) != NULL && check) {
317 		// sanity checks (links, all_key_count)
318 		if (!fTree->fHeader.CheckNode(fNode)) {
319 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
320 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
321 				fBlockNumber, fTree->fStream->ID()));
322 			return B_BAD_DATA;
323 		}
324 	}
325 
326 	*_node = fNode;
327 	return B_OK;
328 }
329 
330 
331 #if !_BOOT_MODE
332 bplustree_node*
333 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
334 {
335 	if (fTree == NULL || fTree->fStream == NULL) {
336 		REPORT_ERROR(B_BAD_VALUE);
337 		return NULL;
338 	}
339 
340 	Unset();
341 
342 	// You can only ask for nodes at valid positions - you can't
343 	// even access the b+tree header with this method (use SetToHeader()
344 	// instead)
345 	if (offset > fTree->fHeader.MaximumSize() - fTree->fNodeSize
346 		|| offset <= 0
347 		|| (offset % fTree->fNodeSize) != 0)
348 		return NULL;
349 
350 	if (InternalSetTo(&transaction, offset) != NULL && check) {
351 		// sanity checks (links, all_key_count)
352 		if (!fTree->fHeader.CheckNode(fNode)) {
353 			FATAL(("invalid node [%p] read from offset %" B_PRIdOFF " (block %"
354 				B_PRIdOFF "), inode at %" B_PRIdINO "\n", fNode, offset,
355 				fBlockNumber, fTree->fStream->ID()));
356 			return NULL;
357 		}
358 	}
359 	return fNode;
360 }
361 
362 
363 bplustree_node*
364 CachedNode::MakeWritable(Transaction& transaction)
365 {
366 	if (fNode == NULL)
367 		return NULL;
368 
369 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
370 			fBlockNumber, transaction.ID()) == B_OK) {
371 		return fNode;
372 	}
373 
374 	return NULL;
375 }
376 #endif // !_BOOT_MODE
377 
378 
379 const bplustree_header*
380 CachedNode::SetToHeader()
381 {
382 	if (fTree == NULL || fTree->fStream == NULL) {
383 		REPORT_ERROR(B_BAD_VALUE);
384 		return NULL;
385 	}
386 
387 	Unset();
388 
389 	InternalSetTo(NULL, 0LL);
390 	return (bplustree_header*)fNode;
391 }
392 
393 
394 #if !_BOOT_MODE
395 bplustree_header*
396 CachedNode::SetToWritableHeader(Transaction& transaction)
397 {
398 	if (fTree == NULL || fTree->fStream == NULL) {
399 		REPORT_ERROR(B_BAD_VALUE);
400 		return NULL;
401 	}
402 
403 	Unset();
404 
405 	InternalSetTo(&transaction, 0LL);
406 
407 	if (fNode != NULL && !fTree->fInTransaction) {
408 		transaction.AddListener(fTree);
409 		fTree->fInTransaction = true;
410 
411 		if (!transaction.GetVolume()->IsInitializing()) {
412 			acquire_vnode(transaction.GetVolume()->FSVolume(),
413 				fTree->fStream->ID());
414 		}
415 	}
416 
417 	return (bplustree_header*)fNode;
418 }
419 #endif // !_BOOT_MODE
420 
421 
422 bplustree_node*
423 CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
424 {
425 	fNode = NULL;
426 	fOffset = offset;
427 
428 	off_t fileOffset;
429 	block_run run;
430 	if (offset < fTree->fStream->Size()
431 		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
432 
433 #if !_BOOT_MODE
434 		Volume* volume = fTree->fStream->GetVolume();
435 #else
436 		Volume* volume = &fTree->fStream->GetVolume();
437 #endif
438 
439 		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
440 		fBlockNumber = volume->ToBlock(run) + blockOffset;
441 		uint8* block = NULL;
442 
443 #if !_BOOT_MODE
444 		if (transaction != NULL) {
445 			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
446 				fBlockNumber, transaction->ID());
447 			fWritable = true;
448 		} else {
449 			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
450 			fWritable = false;
451 		}
452 #else // !_BOOT_MODE
453 		if (fBlock == NULL) {
454 			fBlock = (uint8*)malloc(volume->BlockSize());
455 			if (fBlock == NULL)
456 				return NULL;
457 		}
458 
459 		if (read_pos(volume->Device(), fBlockNumber << volume->BlockShift(),
460 				fBlock, volume->BlockSize()) == (ssize_t)volume->BlockSize()) {
461 			block = fBlock;
462 		}
463 
464 		fWritable = false;
465 #endif // _BOOT_MODE
466 
467 		if (block != NULL) {
468 			// The node is somewhere in that block...
469 			// (confusing offset calculation)
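			// A worked example (sketch): with a 2048-byte block size and a
			// block run starting at stream position 4096 (= fileOffset), a
			// node at stream offset 7168 gives blockOffset
			// = (7168 - 4096) / 2048 = 1, so the node starts at
			// 7168 - (4096 + 1 * 2048) = 1024 bytes into the cached block.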
470 			fNode = (bplustree_node*)(block + offset
471 				- (fileOffset + (blockOffset << volume->BlockShift())));
472 		} else
473 			REPORT_ERROR(B_IO_ERROR);
474 	}
475 	return fNode;
476 }
477 
478 
479 #if !_BOOT_MODE
480 status_t
481 CachedNode::Free(Transaction& transaction, off_t offset)
482 {
483 	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
484 		RETURN_ERROR(B_BAD_VALUE);
485 
486 	// TODO: scan the free nodes list and remove all nodes at the end
487 	// of the tree - perhaps that shouldn't be done every time this
488 	// function is called; perhaps it should be done when the directory
489 	// inode is closed, or based on some calculation or whatever...
490 
491 	CachedNode cached(fTree);
492 	bplustree_header* header = cached.SetToWritableHeader(transaction);
493 	if (header == NULL)
494 		return B_IO_ERROR;
495 
496 #if 0
497 	// TODO: temporarily disabled because CheckNode() doesn't like this...
498 	// 		Also, it's such an edge case that it's almost useless, anyway.
499 	// if the node is the last one in the tree, we shrink
500 	// the tree and file size by one node
501 	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
502 	if (offset == lastOffset) {
503 		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
504 		if (status != B_OK)
505 			return status;
506 
507 		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
508 		return B_OK;
509 	}
510 #endif
511 
512 	// add the node to the free nodes list
513 	fNode->left_link = header->free_node_pointer;
514 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
515 
516 	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
517 	return B_OK;
518 }
519 
520 
521 status_t
522 CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
523 	off_t* _offset)
524 {
525 	if (fTree == NULL || fTree->fStream == NULL)
526 		RETURN_ERROR(B_BAD_VALUE);
527 
528 	Unset();
529 
530 	bplustree_header* header;
531 	status_t status;
532 
533 	// if there are any free nodes, recycle them
534 	if (SetToWritable(transaction, fTree->fHeader.FreeNode(), false) != NULL) {
535 		CachedNode cached(fTree);
536 		header = cached.SetToWritableHeader(transaction);
537 		if (header == NULL)
538 			return B_IO_ERROR;
539 
540 		*_offset = header->FreeNode();
541 		*_node = fNode;
542 
543 		// set new free node pointer
544 		header->free_node_pointer = fNode->left_link;
545 		fNode->Initialize();
546 		return B_OK;
547 	}
548 
549 	// allocate space for a new node
550 	Inode* stream = fTree->fStream;
551 	if ((status = stream->Append(transaction, fTree->fNodeSize)) != B_OK)
552 		return status;
553 
554 	CachedNode cached(fTree);
555 	header = cached.SetToWritableHeader(transaction);
556 	if (header == NULL)
557 		return B_IO_ERROR;
558 
559 	// the maximum_size has to be changed before the call to SetTo() - or
560 	// else it will fail because the requested node is out of bounds
561 	off_t offset = header->MaximumSize();
562 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
563 		+ fTree->fNodeSize);
564 
565 	cached.Unset();
566 		// SetToWritable() below needs the new values in the tree's header
567 
568 	if (SetToWritable(transaction, offset, false) == NULL)
569 		RETURN_ERROR(B_ERROR);
570 
571 	fNode->Initialize();
572 
573 	*_offset = offset;
574 	*_node = fNode;
575 	return B_OK;
576 }
577 
578 
579 //	#pragma mark -
580 
581 
582 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
583 	:
584 	fStream(NULL),
585 	fInTransaction(false)
586 {
587 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
588 	SetTo(transaction, stream);
589 }
590 #endif // !_BOOT_MODE
591 
592 
593 BPlusTree::BPlusTree(Inode* stream)
594 	:
595 	fStream(NULL),
596 	fInTransaction(false)
597 {
598 #if !_BOOT_MODE
599 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
600 #endif
601 
602 	SetTo(stream);
603 }
604 
605 
606 BPlusTree::BPlusTree()
607 	:
608 	fStream(NULL),
609 	fNodeSize(BPLUSTREE_NODE_SIZE),
610 	fAllowDuplicates(true),
611 	fInTransaction(false),
612 	fStatus(B_NO_INIT)
613 {
614 #if !_BOOT_MODE
615 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
616 #endif
617 }
618 
619 
620 BPlusTree::~BPlusTree()
621 {
622 #if !_BOOT_MODE
623 	// if there are any TreeIterators left, we need to stop them
624 	// (can happen when the tree's inode gets deleted while
625 	// traversing the tree - a TreeIterator doesn't lock the inode)
626 	mutex_lock(&fIteratorLock);
627 
628 	SinglyLinkedList<TreeIterator>::Iterator iterator
629 		= fIterators.GetIterator();
630 	while (iterator.HasNext())
631 		iterator.Next()->Stop();
632 
633 	mutex_destroy(&fIteratorLock);
634 
635 	ASSERT(!fInTransaction);
636 #endif // !_BOOT_MODE
637 }
638 
639 
640 #if !_BOOT_MODE
641 /*! Create a new B+Tree on the specified stream */
642 status_t
643 BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
644 {
645 	// initializes in-memory B+Tree
646 
647 	fStream = stream;
648 
649 	CachedNode cached(this);
650 	bplustree_header* header = cached.SetToWritableHeader(transaction);
651 	if (header == NULL) {
652 		// allocate space for new header + node!
653 		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
654 		if (fStatus != B_OK)
655 			RETURN_ERROR(fStatus);
656 
657 		header = cached.SetToWritableHeader(transaction);
658 		if (header == NULL)
659 			RETURN_ERROR(fStatus = B_ERROR);
660 	}
661 
662 	fAllowDuplicates = stream->IsIndex()
663 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
664 
665 	fNodeSize = nodeSize;
666 
667 	// initialize b+tree header
668 	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
669 	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
670 	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
671 	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
672 	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
673 	header->free_node_pointer
674 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
675 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
676 
677 	cached.Unset();
678 
679 	// initialize b+tree root node
680 	cached.SetToWritable(transaction, fHeader.RootNode(), false);
681 	if (cached.Node() == NULL)
682 		RETURN_ERROR(B_IO_ERROR);
683 
684 	cached.Node()->Initialize();
685 
686 	return fStatus = B_OK;
687 }
688 #endif // !_BOOT_MODE
689 
690 
691 status_t
692 BPlusTree::SetTo(Inode* stream)
693 {
694 	if (stream == NULL)
695 		RETURN_ERROR(fStatus = B_BAD_VALUE);
696 
697 	fStream = stream;
698 
699 	// get on-disk B+Tree header
700 
701 	CachedNode cached(this);
702 	const bplustree_header* header = cached.SetToHeader();
703 	if (header != NULL)
704 		memcpy(&fHeader, header, sizeof(bplustree_header));
705 	else
706 		RETURN_ERROR(fStatus = B_IO_ERROR);
707 
708 	// is header valid?
709 
710 	if (fHeader.MaximumSize() != stream->Size()) {
711 		dprintf("B+tree header size %" B_PRIdOFF " doesn't fit file size %"
712 			B_PRIdOFF "!\n", fHeader.MaximumSize(), stream->Size());
713 		// we can't change the header since we don't have a transaction
714 		//fHeader.maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
715 	}
716 	if (!fHeader.IsValid()) {
717 #ifdef DEBUG
718 		dump_bplustree_header(&fHeader);
719 		dump_block((const char*)&fHeader, 128);
720 #endif
721 		RETURN_ERROR(fStatus = B_BAD_DATA);
722 	}
723 
724 	fNodeSize = fHeader.NodeSize();
725 
726 	// validity check
727 	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
728 		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
729 		S_DOUBLE_INDEX};
730 	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
731 		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
732 		| S_FLOAT_INDEX | S_DOUBLE_INDEX);
733 
734 	if (fHeader.DataType() > BPLUSTREE_DOUBLE_TYPE
735 		|| ((stream->Mode() & S_INDEX_DIR) != 0
736 			&& kToMode[fHeader.DataType()] != mode)
737 		|| !stream->IsContainer()) {
738 		D(	dump_bplustree_header(&fHeader);
739 			dump_inode(&stream->Node());
740 		);
741 		RETURN_ERROR(fStatus = B_BAD_TYPE);
742 	}
743 
744 	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
745 	// in the original BFS code - we will honour it nevertheless
746 	fAllowDuplicates = stream->IsIndex()
747 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
748 
749 	cached.SetTo(fHeader.RootNode());
750 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
751 }
752 
753 
754 status_t
755 BPlusTree::InitCheck()
756 {
757 	return fStatus;
758 }
759 
760 
761 #if !_BOOT_MODE
762 status_t
763 BPlusTree::Validate(bool repair, bool& _errorsFound)
764 {
765 	TreeCheck check(this);
766 	if (check.InitCheck() != B_OK)
767 		return B_NO_MEMORY;
768 
769 	check.SetVisited(0);
770 
771 	// Walk the free nodes
772 
773 	CachedNode cached(this);
774 	off_t freeOffset = fHeader.FreeNode();
775 	while (freeOffset > 0) {
776 		const bplustree_node* node;
777 		status_t status = cached.SetTo(freeOffset, &node, false);
778 		if (status != B_OK) {
779 			if (status == B_IO_ERROR)
780 				return B_IO_ERROR;
781 
782 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF " could "
783 				"not be read: %s\n", fStream->ID(), freeOffset,
784 				strerror(status));
785 			check.FoundError();
786 			break;
787 		}
788 
789 		if (check.Visited(freeOffset)) {
790 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
791 				" circular!\n", fStream->ID(), freeOffset);
792 			// TODO: if 'repair' is true, we could collect all unvisited nodes
793 			// at the end, and put them into the free list
794 			check.FoundError();
795 			break;
796 		}
797 
798 		check.SetVisited(freeOffset);
799 
800 		if (node->OverflowLink() != BPLUSTREE_FREE) {
801 			dprintf("inode %" B_PRIdOFF ": free node at %" B_PRIdOFF
802 				" misses free mark!\n", fStream->ID(), freeOffset);
803 		}
804 		freeOffset = node->LeftLink();
805 	}
806 
807 	// Iterate over the complete tree recursively
808 
809 	const bplustree_node* root;
810 	status_t status = cached.SetTo(fHeader.RootNode(), &root, true);
811 	if (status != B_OK)
812 		return status;
813 
814 	status = _ValidateChildren(check, 0, fHeader.RootNode(), NULL, 0, root);
815 
816 	if (check.ErrorsFound())
817 		_errorsFound = true;
818 
819 	if (status != B_OK)
820 		return status;
821 
822 	if (check.MaxLevels() + 1 != fHeader.MaxNumberOfLevels()) {
823 		dprintf("inode %" B_PRIdOFF ": found %" B_PRIu32 " max levels, "
824 			"declared %" B_PRIu32 "!\n", fStream->ID(), check.MaxLevels(),
825 			fHeader.MaxNumberOfLevels());
826 	}
827 
828 	if ((off_t)check.VisitedCount() != fHeader.MaximumSize() / fNodeSize) {
829 		dprintf("inode %" B_PRIdOFF ": visited %" B_PRIuSIZE " from %" B_PRIdOFF
830 			" nodes.\n", fStream->ID(), check.VisitedCount(),
831 			fHeader.MaximumSize() / fNodeSize);
832 	}
833 
834 	return B_OK;
835 }
836 
837 
838 status_t
839 BPlusTree::MakeEmpty()
840 {
841 	// Put all nodes into the free list in order
842 	Transaction transaction(fStream->GetVolume(), fStream->BlockNumber());
843 
844 	// Reset the header, and root node
845 	CachedNode cached(this);
846 	bplustree_header* header = cached.SetToWritableHeader(transaction);
847 	if (header == NULL)
848 		return B_IO_ERROR;
849 
850 	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
851 	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(NodeSize());
852 	if (fStream->Size() > (off_t)NodeSize() * 2)
853 		header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(2 * NodeSize());
854 	else {
855 		header->free_node_pointer
856 			= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
857 	}
858 
859 	bplustree_node* node = cached.SetToWritable(transaction, NodeSize(), false);
860 	if (node == NULL)
861 		return B_IO_ERROR;
862 
863 	node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
864 	node->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
865 	node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
866 	node->all_key_count = 0;
867 	node->all_key_length = 0;
868 
869 	for (off_t offset = 2 * NodeSize(); offset < fStream->Size();
870 			offset += NodeSize()) {
871 		bplustree_node* node = cached.SetToWritable(transaction, offset, false);
872 		if (node == NULL) {
873 			dprintf("--> could not open %" B_PRIdOFF "\n", offset);
874 			return B_IO_ERROR;
875 		}
876 		if (offset < fStream->Size() - (off_t)NodeSize())
877 			node->left_link = HOST_ENDIAN_TO_BFS_INT64(offset + NodeSize());
878 		else
879 			node->left_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
880 
881 		node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
882 
883 		// It's not important to write it out in a single transaction; just
884 		// make sure it doesn't get too large
885 		if (offset % (1024 * 1024) == 0) {
886 			transaction.Done();
887 			transaction.Start(fStream->GetVolume(), fStream->BlockNumber());
888 		}
889 	}
890 
891 	return transaction.Done();
892 }
893 
894 
895 int32
896 BPlusTree::TypeCodeToKeyType(type_code code)
897 {
898 	switch (code) {
899 		case B_STRING_TYPE:
900 			return BPLUSTREE_STRING_TYPE;
901 		case B_SSIZE_T_TYPE:
902 		case B_INT32_TYPE:
903 			return BPLUSTREE_INT32_TYPE;
904 		case B_SIZE_T_TYPE:
905 		case B_UINT32_TYPE:
906 			return BPLUSTREE_UINT32_TYPE;
907 		case B_OFF_T_TYPE:
908 		case B_INT64_TYPE:
909 			return BPLUSTREE_INT64_TYPE;
910 		case B_UINT64_TYPE:
911 			return BPLUSTREE_UINT64_TYPE;
912 		case B_FLOAT_TYPE:
913 			return BPLUSTREE_FLOAT_TYPE;
914 		case B_DOUBLE_TYPE:
915 			return BPLUSTREE_DOUBLE_TYPE;
916 	}
917 	return -1;
918 }
919 
920 
921 int32
922 BPlusTree::ModeToKeyType(mode_t mode)
923 {
924 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
925 			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
926 		case S_INT_INDEX:
927 			return BPLUSTREE_INT32_TYPE;
928 		case S_UINT_INDEX:
929 			return BPLUSTREE_UINT32_TYPE;
930 		case S_LONG_LONG_INDEX:
931 			return BPLUSTREE_INT64_TYPE;
932 		case S_ULONG_LONG_INDEX:
933 			return BPLUSTREE_UINT64_TYPE;
934 		case S_FLOAT_INDEX:
935 			return BPLUSTREE_FLOAT_TYPE;
936 		case S_DOUBLE_INDEX:
937 			return BPLUSTREE_DOUBLE_TYPE;
938 		case S_STR_INDEX:
939 		default:
940 			// default is for standard directories
941 			return BPLUSTREE_STRING_TYPE;
942 	}
943 }
944 #endif // !_BOOT_MODE
945 
946 
947 //	#pragma mark - TransactionListener implementation
948 
949 
950 #if !_BOOT_MODE
951 void
952 BPlusTree::TransactionDone(bool success)
953 {
954 	if (!success) {
955 		// update header from disk
956 		CachedNode cached(this);
957 		const bplustree_header* header = cached.SetToHeader();
958 		if (header != NULL)
959 			memcpy(&fHeader, header, sizeof(bplustree_header));
960 	}
961 }
962 
963 
964 void
965 BPlusTree::RemovedFromTransaction()
966 {
967 	fInTransaction = false;
968 
969 	if (!fStream->GetVolume()->IsInitializing())
970 		put_vnode(fStream->GetVolume()->FSVolume(), fStream->ID());
971 }
972 
973 
974 //	#pragma mark -
975 
976 
977 void
978 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
979 	uint16 splitAt, int8 change)
980 {
981 	// Although every iterator which is affected by this update currently
982 	// waits on a semaphore, other iterators could be added/removed at
983 	// any time, so we need to protect this loop
984 	MutexLocker _(fIteratorLock);
985 
986 	SinglyLinkedList<TreeIterator>::Iterator iterator
987 		= fIterators.GetIterator();
988 	while (iterator.HasNext())
989 		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
990 }
991 
992 
993 void
994 BPlusTree::_AddIterator(TreeIterator* iterator)
995 {
996 	MutexLocker _(fIteratorLock);
997 	fIterators.Add(iterator);
998 }
999 
1000 
1001 void
1002 BPlusTree::_RemoveIterator(TreeIterator* iterator)
1003 {
1004 	MutexLocker _(fIteratorLock);
1005 	fIterators.Remove(iterator);
1006 }
1007 #endif // !_BOOT_MODE
1008 
1009 
1010 int32
1011 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
1012 	int keyLength2)
1013 {
1014 	type_code type = 0;
1015 	switch (fHeader.DataType()) {
1016 		case BPLUSTREE_STRING_TYPE:
1017 			type = B_STRING_TYPE;
1018 			break;
1019 		case BPLUSTREE_INT32_TYPE:
1020 			type = B_INT32_TYPE;
1021 			break;
1022 		case BPLUSTREE_UINT32_TYPE:
1023 			type = B_UINT32_TYPE;
1024 			break;
1025 		case BPLUSTREE_INT64_TYPE:
1026 			type = B_INT64_TYPE;
1027 			break;
1028 		case BPLUSTREE_UINT64_TYPE:
1029 			type = B_UINT64_TYPE;
1030 			break;
1031 		case BPLUSTREE_FLOAT_TYPE:
1032 			type = B_FLOAT_TYPE;
1033 			break;
1034 		case BPLUSTREE_DOUBLE_TYPE:
1035 			type = B_DOUBLE_TYPE;
1036 			break;
1037 	}
1038 	return QueryParser::compareKeys(type, key1, keyLength1, key2, keyLength2);
1039 }
1040 
1041 
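/*!	Does a binary search for \a key in \a node. On an exact match, \a _index
	is set to the key's position and \a _next to its value. On
	B_ENTRY_NOT_FOUND, \a _index is the position at which the key would have
	to be inserted, and \a _next is the link to follow when descending from
	an index node.
*/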
1042 status_t
1043 BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
1044 	uint16 keyLength, uint16* _index, off_t* _next)
1045 {
1046 #ifdef DEBUG
1047 	NodeChecker checker(node, fNodeSize, "find");
1048 #endif
1049 
1050 	if (node->all_key_count == 0) {
1051 		if (_index)
1052 			*_index = 0;
1053 		if (_next)
1054 			*_next = node->OverflowLink();
1055 		return B_ENTRY_NOT_FOUND;
1056 	}
1057 
1058 	Unaligned<off_t>* values = node->Values();
1059 	int16 saveIndex = -1;
1060 
1061 	// binary search in the key array
1062 	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
1063 		uint16 i = (first + last) >> 1;
1064 
1065 		uint16 searchLength = 0;
1066 		uint8* searchKey = node->KeyAt(i, &searchLength);
1067 		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
1068 				> (uint8*)node + fNodeSize
1069 			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
1070 #if !_BOOT_MODE
1071 			fStream->GetVolume()->Panic();
1072 #endif
1073 			RETURN_ERROR(B_BAD_DATA);
1074 		}
1075 
1076 		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
1077 		if (cmp < 0) {
1078 			last = i - 1;
1079 			saveIndex = i;
1080 		} else if (cmp > 0) {
1081 			saveIndex = first = i + 1;
1082 		} else {
1083 			if (_index)
1084 				*_index = i;
1085 			if (_next)
1086 				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1087 			return B_OK;
1088 		}
1089 	}
1090 
1091 	if (_index)
1092 		*_index = saveIndex;
1093 	if (_next) {
1094 		if (saveIndex == node->NumKeys())
1095 			*_next = node->OverflowLink();
1096 		else
1097 			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
1098 	}
1099 	return B_ENTRY_NOT_FOUND;
1100 }
1101 
1102 
1103 #if !_BOOT_MODE
1104 /*!	Prepares the stack to contain all nodes that were passed while
1105 	following the key, from the root node to the leaf node that could
1106 	or should contain that key.
1107 */
1108 status_t
1109 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
1110 	uint16 keyLength)
1111 {
1112 	// set the root node to begin with
1113 	node_and_key nodeAndKey;
1114 	nodeAndKey.nodeOffset = fHeader.RootNode();
1115 
1116 	CachedNode cached(this);
1117 	const bplustree_node* node;
1118 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1119 		// if we are already on leaf level, we're done
1120 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1121 			// note that the keyIndex is not properly set here (but it's not
1122 			// needed in the calling functions anyway)!
1123 			nodeAndKey.keyIndex = 0;
1124 			stack.Push(nodeAndKey);
1125 			return B_OK;
1126 		}
1127 
1128 		off_t nextOffset;
1129 		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
1130 			&nextOffset);
1131 
1132 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
1133 			RETURN_ERROR(B_ERROR);
1134 
1135 		if ((uint32)stack.CountItems() > fHeader.MaxNumberOfLevels()) {
1136 			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
1137 			break;
1138 		}
1139 
1140 		// put the node offset & the correct keyIndex on the stack
1141 		stack.Push(nodeAndKey);
1142 
1143 		nodeAndKey.nodeOffset = nextOffset;
1144 	}
1145 
1146 	FATAL(("BPlusTree::_SeekDown() could not open node %" B_PRIdOFF ", inode %"
1147 		B_PRIdOFF "\n", nodeAndKey.nodeOffset, fStream->ID()));
1148 	return B_ERROR;
1149 }
1150 
1151 
1152 /*!	This will find a free duplicate fragment in the given bplustree_node.
1153 	The CachedNode will be set to the writable fragment on success.
1154 */
1155 status_t
1156 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
1157 	const bplustree_node* node, CachedNode& cached,
1158 	off_t* _offset, bplustree_node** _fragment, uint32* _index)
1159 {
1160 	Unaligned<off_t>* values = node->Values();
1161 	for (int32 i = 0; i < node->NumKeys(); i++) {
1162 		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
1163 
1164 		// does the value link to a duplicate fragment?
1165 		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
1166 			continue;
1167 
1168 		const bplustree_node* fragment = cached.SetTo(
1169 			bplustree_node::FragmentOffset(value), false);
1170 		if (fragment == NULL) {
1171 			FATAL(("Could not get duplicate fragment at %" B_PRIdOFF ", inode %"
1172 				B_PRIdOFF "\n", value, fStream->ID()));
1173 			continue;
1174 		}
1175 
1176 		// see if there is some space left for us
1177 		uint32 num = bplustree_node::MaxFragments(fNodeSize);
1178 		for (uint32 j = 0; j < num; j++) {
1179 			duplicate_array* array = fragment->FragmentAt(j);
1180 
1181 			if (array->IsEmpty()) {
1182 				// found an unused fragment
1183 				*_fragment = cached.MakeWritable(transaction);
1184 				if (*_fragment == NULL)
1185 					return B_IO_ERROR;
1186 
1187 				*_offset = bplustree_node::FragmentOffset(value);
1188 				*_index = j;
1189 				return B_OK;
1190 			}
1191 		}
1192 	}
1193 	return B_ENTRY_NOT_FOUND;
1194 }
1195 
1196 
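/*!	Inserts \a value as a duplicate of the key at \a index in \a node.
	Depending on how many duplicates the key already has, the value ends up
	either in a small duplicate fragment (which several keys can share) or in
	a chain of dedicated duplicate nodes; the key's value array then stores
	an encoded link (see bplustree_node::MakeLink()) instead of a plain
	value.
*/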
1197 status_t
1198 BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
1199 	const bplustree_node* node, uint16 index, off_t value)
1200 {
1201 	CachedNode cachedDuplicate(this);
1202 	Unaligned<off_t>* values = node->Values();
1203 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1204 	status_t status;
1205 	off_t offset;
1206 
1207 	if (bplustree_node::IsDuplicate(oldValue)) {
1208 		// If it's a duplicate fragment, try to insert it into that, or if it
1209 		// doesn't fit anymore, create a new duplicate node
1210 
1211 		if (bplustree_node::LinkType(oldValue)
1212 				== BPLUSTREE_DUPLICATE_FRAGMENT) {
1213 			bplustree_node* duplicate = cachedDuplicate.SetToWritable(
1214 				transaction, bplustree_node::FragmentOffset(oldValue), false);
1215 			if (duplicate == NULL)
1216 				return B_IO_ERROR;
1217 
1218 			duplicate_array* array = duplicate->FragmentAt(
1219 				bplustree_node::FragmentIndex(oldValue));
1220 			int32 arrayCount = array->Count();
1221 			if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount < 1) {
1222 				FATAL(("_InsertDuplicate: Invalid array[%d] size in fragment "
1223 					"%" B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1224 					(int)bplustree_node::FragmentIndex(oldValue),
1225 					bplustree_node::FragmentOffset(oldValue), arrayCount,
1226 					fStream->ID()));
1227 				return B_BAD_DATA;
1228 			}
1229 
1230 			if (arrayCount < NUM_FRAGMENT_VALUES) {
1231 				array->Insert(value);
1232 			} else {
1233 				// Test if the fragment will be empty if we remove this key's
1234 				// values
1235 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
1236 					// The node will be empty without our values, so let us
1237 					// reuse it as a duplicate node
1238 					offset = bplustree_node::FragmentOffset(oldValue);
1239 
1240 					memmove(duplicate->DuplicateArray(), array,
1241 						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1242 					duplicate->left_link = duplicate->right_link
1243 						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1244 
1245 					array = duplicate->DuplicateArray();
1246 					array->Insert(value);
1247 				} else {
1248 					// Create a new duplicate node
1249 
1250 					cachedDuplicate.UnsetUnchanged(transaction);
1251 						// The old duplicate has not been touched, so we can
1252 						// reuse it
1253 
1254 					bplustree_node* newDuplicate;
1255 					status = cachedDuplicate.Allocate(transaction,
1256 						&newDuplicate, &offset);
1257 					if (status != B_OK)
1258 						RETURN_ERROR(status);
1259 
1260 					// Copy the array from the fragment node to the duplicate
1261 					// node and free the old entry (by zero'ing all values)
1262 					newDuplicate->overflow_link = array->count;
1263 					memcpy(&newDuplicate->all_key_count, &array->values[0],
1264 						array->Count() * sizeof(off_t));
1265 					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1266 
1267 					array = newDuplicate->DuplicateArray();
1268 					array->Insert(value);
1269 				}
1270 
1271 				// Update the main pointer to link to a duplicate node
1272 				if (cached.MakeWritable(transaction) == NULL)
1273 					return B_IO_ERROR;
1274 
1275 				values[index]
1276 					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1277 						BPLUSTREE_DUPLICATE_NODE, offset));
1278 			}
1279 
1280 			return B_OK;
1281 		}
1282 
1283 		// Put the value into a dedicated duplicate node
1284 
1285 		// search for free space in the duplicate nodes of that key
1286 		duplicate_array* array;
1287 		int32 arrayCount;
1288 		const bplustree_node* duplicate;
1289 		off_t duplicateOffset;
1290 		do {
1291 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1292 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1293 			if (duplicate == NULL)
1294 				return B_IO_ERROR;
1295 
1296 			array = duplicate->DuplicateArray();
1297 			arrayCount = array->Count();
1298 			if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1299 				FATAL(("_InsertDuplicate: Invalid array size in duplicate %"
1300 					B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1301 					duplicateOffset, arrayCount, fStream->ID()));
1302 				return B_BAD_DATA;
1303 			}
1304 		} while (arrayCount >= NUM_DUPLICATE_VALUES
1305 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
1306 
1307 		bplustree_node* writableDuplicate
1308 			= cachedDuplicate.MakeWritable(transaction);
1309 		if (writableDuplicate == NULL)
1310 			return B_IO_ERROR;
1311 
1312 		if (arrayCount < NUM_DUPLICATE_VALUES) {
1313 			array = writableDuplicate->DuplicateArray();
1314 			array->Insert(value);
1315 		} else {
1316 			// no space left - add a new duplicate node
1317 
1318 			bplustree_node* newDuplicate;
1319 			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
1320 				&offset);
1321 			if (status != B_OK)
1322 				RETURN_ERROR(status);
1323 
1324 			// link the two nodes together
1325 			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
1326 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
1327 
1328 			array = newDuplicate->DuplicateArray();
1329 			array->count = 0;
1330 			array->Insert(value);
1331 		}
1332 		return B_OK;
1333 	}
1334 
1335 	// Search for a free duplicate fragment or create a new one
1336 	// to insert the duplicate value into
1337 
1338 	uint32 fragmentIndex = 0;
1339 	bplustree_node* fragment;
1340 	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
1341 			&offset, &fragment, &fragmentIndex) != B_OK) {
1342 		// allocate a new duplicate fragment node
1343 		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
1344 		if (status != B_OK)
1345 			RETURN_ERROR(status);
1346 
1347 		memset(fragment, 0, fNodeSize);
1348 	}
1349 
1350 	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
1351 	array->Insert(oldValue);
1352 	array->Insert(value);
1353 
1354 	if (cached.MakeWritable(transaction) == NULL)
1355 		return B_IO_ERROR;
1356 
1357 	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1358 		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
1359 
1360 	return B_OK;
1361 }
1362 
1363 
1364 void
1365 BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
1366 	uint16 keyLength, off_t value)
1367 {
1368 	// should never happen, but who knows?
1369 	if (index > node->NumKeys())
1370 		return;
1371 
1372 	Unaligned<off_t>* values = node->Values();
1373 	Unaligned<uint16>* keyLengths = node->KeyLengths();
1374 	uint8* keys = node->Keys();
1375 
1376 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
1377 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
1378 		+ keyLength);
1379 
1380 	Unaligned<off_t>* newValues = node->Values();
1381 	Unaligned<uint16>* newKeyLengths = node->KeyLengths();
1382 
1383 	// move values and copy new value into them
1384 	memmove(newValues + index + 1, values + index,
1385 		sizeof(off_t) * (node->NumKeys() - 1 - index));
1386 	memmove(newValues, values, sizeof(off_t) * index);
1387 
1388 	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
1389 
1390 	// move and update key length index
1391 	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
1392 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1393 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
1394 	}
1395 	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
1396 
1397 	int32 keyStart;
1398 	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
1399 		+ (keyStart = index > 0
1400 			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
1401 
1402 	// move keys and copy new key into them
1403 	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
1404 	int32 size = node->AllKeyLength() - length;
1405 	if (size > 0)
1406 		memmove(keys + length, keys + length - keyLength, size);
1407 
1408 	memcpy(keys + keyStart, key, keyLength);
1409 }
1410 
1411 
1412 /*!	Splits the \a node into two halves - the other half will be put into
1413 	\a other. It also takes care to create a new overflow link if the node
1414 	to split is an index node.
1415 */
1416 status_t
1417 BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
1418 	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
1419 	uint16* _keyLength, off_t* _value)
1420 {
1421 	if (*_keyIndex > node->NumKeys() + 1)
1422 		return B_BAD_VALUE;
1423 
1424 	Unaligned<uint16>* inKeyLengths = node->KeyLengths();
1425 	Unaligned<off_t>* inKeyValues = node->Values();
1426 	uint8* inKeys = node->Keys();
1427 	uint8* outKeys = other->Keys();
1428 	int32 keyIndex = *_keyIndex;	// can become less than zero!
1429 
1430 	if (keyIndex > node->NumKeys()) {
1431 		FATAL(("key index out of bounds: %d, num keys: %u, inode %" B_PRIdOFF
1432 			"\n", (int)keyIndex, node->NumKeys(), fStream->ID()));
1433 		return B_BAD_VALUE;
1434 	}
1435 
1436 	// How many keys will fit in one (half) page?
1437 	// The following loop will find the answer to this question and
1438 	// adjust the key length indices for their new home
1439 
1440 	// "bytes" is the number of bytes written for the new key,
1441 	// "bytesBefore" are the bytes before that key
1442 	// "bytesAfter" are the bytes after the new key, if any
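	// In other words, on top of the fixed node header, every key moved to
	// the new node costs its key bytes (aligned as a whole via key_align())
	// plus one uint16 length entry and one off_t value; the loop below stops
	// as soon as that total reaches half of the node size.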
1443 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1444 
1445 	size_t size = fNodeSize >> 1;
1446 	int32 out, in;
1447 	size_t keyLengths = 0;
1448 	for (in = out = 0; in < node->NumKeys() + 1;) {
1449 		keyLengths = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]);
1450 
1451 		if (in == keyIndex && !bytes) {
1452 			bytes = *_keyLength;
1453 			bytesBefore = in > 0
1454 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1455 		} else {
1456 			if (keyIndex < out)
1457 				bytesAfter = keyLengths - bytesBefore;
1458 
1459 			in++;
1460 		}
1461 		out++;
1462 
1463 		if (key_align(sizeof(bplustree_node) + bytes + keyLengths)
1464 				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1465 			// we have found the number of keys in the new node!
1466 			break;
1467 		}
1468 	}
1469 
1470 	// if the new key was not inserted, set the length of the keys
1471 	// that can be copied directly
1472 
1473 	if (keyIndex >= out && in > 0)
1474 		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1475 	else if (keyIndex + 1 < out)
1476 		bytesAfter = keyLengths - bytesBefore;
1477 
1478 	if (bytesBefore < 0 || bytesAfter < 0)
1479 		return B_BAD_DATA;
1480 
1481 	other->left_link = node->left_link;
1482 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1483 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1484 		+ bytesAfter);
1485 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1486 
1487 	Unaligned<uint16>* outKeyLengths = other->KeyLengths();
1488 	Unaligned<off_t>* outKeyValues = other->Values();
1489 	int32 keys = out > keyIndex ? keyIndex : out;
1490 
1491 	if (bytesBefore) {
1492 		// copy the keys
1493 		memcpy(outKeys, inKeys, bytesBefore);
1494 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1495 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1496 	}
1497 	if (bytes) {
1498 		// copy the newly inserted key
1499 		memcpy(outKeys + bytesBefore, key, bytes);
1500 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1501 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1502 
1503 		if (bytesAfter) {
1504 			// copy the keys after the new key
1505 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1506 				bytesAfter);
1507 			keys = out - keyIndex - 1;
1508 			for (int32 i = 0; i < keys; i++) {
1509 				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1510 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
1511 						+ bytes);
1512 			}
1513 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1514 				keys * sizeof(off_t));
1515 		}
1516 	}
1517 
1518 	// if the new key was already inserted, we shouldn't use it again
1519 	if (in != out)
1520 		keyIndex--;
1521 
1522 	int32 total = bytesBefore + bytesAfter;
1523 
1524 	// these variables are for the key that will be returned
1525 	// to the parent node
1526 	uint8* newKey = NULL;
1527 	uint16 newLength = 0;
1528 	bool newAllocated = false;
1529 
1530 	// If we have split an index node, we have to drop the first key
1531 	// of the next node (which can also be the new key to insert).
1532 	// The dropped key is also the one which has to be inserted in
1533 	// the parent node, so we will set the "newKey" already here.
1534 	if (node->OverflowLink() != BPLUSTREE_NULL) {
1535 		if (in == keyIndex) {
1536 			newKey = key;
1537 			newLength = *_keyLength;
1538 
1539 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1540 			keyIndex--;
1541 		} else {
1542 			// If a key is dropped (is not the new key), we have to copy
1543 			// it, because it would otherwise be lost.
1544 			uint8* droppedKey = node->KeyAt(in, &newLength);
1545 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1546 					> (uint8*)node + fNodeSize
1547 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1548 				fStream->GetVolume()->Panic();
1549 				RETURN_ERROR(B_BAD_DATA);
1550 			}
1551 			newKey = (uint8*)malloc(newLength);
1552 			if (newKey == NULL)
1553 				return B_NO_MEMORY;
1554 
1555 			newAllocated = true;
1556 			memcpy(newKey, droppedKey, newLength);
1557 
1558 			other->overflow_link = inKeyValues[in];
1559 			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1560 		}
1561 	}
1562 
1563 	// and now the same game for the other page and the rest of the keys
1564 	// (but with memmove() instead of memcpy(), because they may overlap)
1565 
1566 	bytesBefore = bytesAfter = bytes = 0;
1567 	out = 0;
1568 	int32 skip = in;
1569 	while (in < node->NumKeys() + 1) {
1570 		if (in == keyIndex && !bytes) {
1571 			// it's enough to set bytesBefore once here, because we do
1572 			// not need to know the exact length of all keys in this
1573 			// loop
1574 			bytesBefore = in > skip
1575 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1576 			bytes = *_keyLength;
1577 			out++;
1578 		} else {
1579 			if (in < node->NumKeys()) {
1580 				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1581 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1582 
1583 				if (bytes) {
1584 					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1585 						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1586 
1587 					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1588 						- bytesBefore - bytes;
1589 				}
1590 				out++;
1591 			}
1592 			in++;
1593 		}
1594 	}
1595 
1596 	// adjust the byte counts (since we were a bit lazy in the loop)
1597 	if (keyIndex < skip)
1598 		bytesBefore = node->AllKeyLength() - total;
1599 
1600 	if (bytesBefore < 0 || bytesAfter < 0) {
1601 		if (newAllocated)
1602 			free(newKey);
1603 		return B_BAD_DATA;
1604 	}
1605 
1606 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1607 		// right link, and overflow link can stay the same
1608 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1609 		+ bytesAfter);
1610 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1611 
1612 	// array positions have changed
1613 	outKeyLengths = node->KeyLengths();
1614 	outKeyValues = node->Values();
1615 
1616 	// move the keys in the old node: the order is important here,
1617 	// because we don't want to overwrite any contents
1618 
1619 	keys = keyIndex <= skip ? out : keyIndex - skip;
1620 	keyIndex -= skip;
1621 	in = out - keyIndex - 1;
1622 		// Note: keyIndex and in will contain invalid values when the new key
1623 		// went to the other node. But in this case bytes and bytesAfter are
1624 		// 0 and subsequently we never use keyIndex and in.
1625 
1626 	if (bytesBefore)
1627 		memmove(inKeys, inKeys + total, bytesBefore);
1628 	if (bytesAfter) {
1629 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1630 			bytesAfter);
1631 	}
1632 
1633 	if (bytesBefore)
1634 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1635 	if (bytesAfter) {
1636 		// if bytesAfter is > 0, keyIndex is larger than skip
1637 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1638 			in * sizeof(uint16));
1639 	}
1640 
1641 	if (bytesBefore)
1642 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1643 	if (bytesAfter) {
1644 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1645 			in * sizeof(off_t));
1646 	}
1647 
1648 	if (bytes) {
1649 		// finally, copy the newly inserted key (don't overwrite anything)
1650 		memcpy(inKeys + bytesBefore, key, bytes);
1651 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1652 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1653 	}
1654 
1655 	// Prepare the key that will be inserted in the parent node which
1656 	// is either the dropped key or the last of the other node.
1657 	// If it's the dropped key, "newKey" was already set earlier.
1658 
1659 	if (newKey == NULL)
1660 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1661 
1662 	memcpy(key, newKey, newLength);
1663 	*_keyLength = newLength;
1664 	*_value = otherOffset;
1665 
1666 	if (newAllocated)
1667 		free(newKey);
1668 
1669 	return B_OK;
1670 }
1671 
1672 
1673 /*!	This inserts a key into the tree. The changes made to the tree will
1674 	all be part of the \a transaction.
1675 	You need to have the inode write locked.
1676 */
1677 status_t
1678 BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
1679 	off_t value)
1680 {
1681 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1682 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1683 		RETURN_ERROR(B_BAD_VALUE);
1684 #ifdef DEBUG
1685 	if (value < 0)
1686 		panic("tried to insert invalid value %" B_PRId64 "!\n", value);
1687 #endif
1688 
1689 	ASSERT_WRITE_LOCKED_INODE(fStream);
1690 
1691 	Stack<node_and_key> stack;
1692 	if (_SeekDown(stack, key, keyLength) != B_OK)
1693 		RETURN_ERROR(B_ERROR);
1694 
1695 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH];
1696 	memcpy(keyBuffer, key, keyLength);
1697 
1698 	node_and_key nodeAndKey;
1699 	const bplustree_node* node;
1700 
1701 	CachedNode cached(this);
1702 	while (stack.Pop(&nodeAndKey)
1703 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1704 #ifdef DEBUG
1705 		NodeChecker checker(node, fNodeSize, "insert");
1706 #endif
1707 		if (node->IsLeaf()) {
1708 			// first round, check for duplicate entries
1709 			status_t status = _FindKey(node, key, keyLength,
1710 				&nodeAndKey.keyIndex);
1711 
1712 			// is this a duplicate entry?
1713 			if (status == B_OK) {
1714 				if (fAllowDuplicates) {
1715 					status = _InsertDuplicate(transaction, cached, node,
1716 						nodeAndKey.keyIndex, value);
1717 					if (status != B_OK)
1718 						RETURN_ERROR(status);
1719 					return B_OK;
1720 				}
1721 
1722 				return B_NAME_IN_USE;
1723 			}
1724 		}
1725 
1726 		bplustree_node* writableNode = cached.MakeWritable(transaction);
1727 		if (writableNode == NULL)
1728 			return B_IO_ERROR;
1729 
1730 		// is the node big enough to hold the pair?
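		// That is: the fixed node header plus all key bytes (rounded up via
		// key_align()) plus one uint16 length entry and one off_t value per
		// key, including the new pair, must still fit into fNodeSize.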
1731 		if (int32(key_align(sizeof(bplustree_node)
1732 				+ writableNode->AllKeyLength() + keyLength)
1733 				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
1734 				+ sizeof(off_t))) < fNodeSize) {
1735 			_InsertKey(writableNode, nodeAndKey.keyIndex,
1736 				keyBuffer, keyLength, value);
1737 			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1738 				nodeAndKey.keyIndex, 0, 1);
1739 
1740 			return B_OK;
1741 		} else {
1742 			CachedNode cachedNewRoot(this);
1743 			CachedNode cachedOther(this);
1744 
1745 			// do we need to allocate a new root node? if so, then do
1746 			// it now
1747 			off_t newRoot = BPLUSTREE_NULL;
1748 			if (nodeAndKey.nodeOffset == fHeader.RootNode()) {
1749 				bplustree_node* root;
1750 				status_t status = cachedNewRoot.Allocate(transaction, &root,
1751 					&newRoot);
1752 				if (status != B_OK) {
1753 					// The tree is most likely corrupted!
1754 					// But it's still sane at leaf level - we could set
1755 					// a flag in the header that forces the tree to be
1756 					// rebuilt next time...
1757 					// But since we will have journaling, that's not a big
1758 					// problem anyway.
1759 					RETURN_ERROR(status);
1760 				}
1761 			}
1762 
1763 			// reserve space for the other node
1764 			bplustree_node* other;
1765 			off_t otherOffset;
1766 			status_t status = cachedOther.Allocate(transaction, &other,
1767 				&otherOffset);
1768 			if (status != B_OK) {
1769 				cachedNewRoot.Free(transaction, newRoot);
1770 				RETURN_ERROR(status);
1771 			}
1772 
1773 			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1774 					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1775 					&value) != B_OK) {
1776 				// free root node & other node here
1777 				cachedOther.Free(transaction, otherOffset);
1778 				cachedNewRoot.Free(transaction, newRoot);
1779 
1780 				RETURN_ERROR(B_ERROR);
1781 			}
1782 #ifdef DEBUG
1783 			checker.Check("insert split");
1784 			NodeChecker otherChecker(other, fNodeSize, "insert split other");
1785 #endif
1786 
1787 			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1788 				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1789 
1790 			// update the right link of the node in the left of the new node
1791 			if ((other = cachedOther.SetToWritable(transaction,
1792 					other->LeftLink())) != NULL) {
1793 				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1794 			}
1795 
1796 			// create a new root if necessary
1797 			if (newRoot != BPLUSTREE_NULL) {
1798 				bplustree_node* root = cachedNewRoot.Node();
1799 
1800 				_InsertKey(root, 0, keyBuffer, keyLength,
1801 					writableNode->LeftLink());
1802 				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1803 					nodeAndKey.nodeOffset);
1804 
1805 				CachedNode cached(this);
1806 				bplustree_header* header
1807 					= cached.SetToWritableHeader(transaction);
1808 				if (header == NULL)
1809 					return B_IO_ERROR;
1810 
1811 				// finally, update header to point to the new root
1812 				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1813 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1814 					header->MaxNumberOfLevels() + 1);
1815 
1816 				return B_OK;
1817 			}
1818 		}
1819 	}
1820 	RETURN_ERROR(B_ERROR);
1821 }
1822 
1823 
1824 /*!	Removes the duplicate index/value pair from the tree.
1825 	It's part of the private tree interface.
1826 */
1827 status_t
1828 BPlusTree::_RemoveDuplicate(Transaction& transaction,
1829 	const bplustree_node* node, CachedNode& cached, uint16 index,
1830 	off_t value)
1831 {
1832 	Unaligned<off_t>* values = node->Values();
1833 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1834 
1835 	CachedNode cachedDuplicate(this);
1836 	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1837 	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
1838 		duplicateOffset, false);
1839 	if (duplicate == NULL)
1840 		return B_IO_ERROR;
1841 
1842 	// if it's a duplicate fragment, remove the entry from there
1843 	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1844 		duplicate_array* array = duplicate->FragmentAt(
1845 			bplustree_node::FragmentIndex(oldValue));
1846 		int32 arrayCount = array->Count();
1847 
1848 		if (arrayCount > NUM_FRAGMENT_VALUES || arrayCount <= 1) {
1849 			FATAL(("_RemoveDuplicate: Invalid array[%d] size in fragment %"
1850 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1851 				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset,
1852 				arrayCount, fStream->ID()));
1853 			return B_BAD_DATA;
1854 		}
1855 		if (!array->Remove(value)) {
1856 			FATAL(("Oh no, value %" B_PRIdOFF " not found in fragments of node "
1857 				"%" B_PRIdOFF "..., inode %" B_PRIdOFF "\n", value,
1858 				duplicateOffset, fStream->ID()));
1859 			return B_ENTRY_NOT_FOUND;
1860 		}
1861 
1862 		// if only one value is left, inline it and get rid of the array
1863 		if (--arrayCount == 1) {
1864 			// set the link to the remaining value
1865 			if (cached.MakeWritable(transaction) == NULL)
1866 				return B_IO_ERROR;
1867 
1868 			values[index] = array->values[0];
1869 
1870 			// Remove the whole fragment node, if this was the only array,
1871 			// otherwise free just the array
1872 			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1873 				status_t status = cachedDuplicate.Free(transaction,
1874 					duplicateOffset);
1875 				if (status != B_OK)
1876 					return status;
1877 			} else
1878 				array->count = 0;
1879 		}
1880 		return B_OK;
1881 	}
1882 
1883 	// Remove value from a duplicate node!
1884 
1885 	duplicate_array* array = NULL;
1886 	int32 arrayCount = 0;
1887 
1888 	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1889 		FATAL(("invalid duplicate node: first left link points to %" B_PRIdOFF
1890 			", inode %" B_PRIdOFF "!\n", duplicate->LeftLink(), fStream->ID()));
1891 		return B_BAD_DATA;
1892 	}
1893 
1894 	// Search the duplicate nodes until the entry is found (and removed)
1895 	while (duplicate != NULL) {
1896 		array = duplicate->DuplicateArray();
1897 		arrayCount = array->Count();
1898 
1899 		if (arrayCount > NUM_DUPLICATE_VALUES || arrayCount < 0) {
1900 			FATAL(("_RemoveDuplicate: Invalid array size in duplicate %"
1901 				B_PRIdOFF " == %" B_PRId32 ", inode %" B_PRIdOFF "!\n",
1902 				duplicateOffset, arrayCount, fStream->ID()));
1903 			return B_BAD_DATA;
1904 		}
1905 
1906 		if (array->Remove(value)) {
1907 			arrayCount--;
1908 			break;
1909 		}
1910 
1911 		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1912 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1913 
1914 		cachedDuplicate.UnsetUnchanged(transaction);
1915 		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
1916 			false);
1917 	}
1918 	if (duplicate == NULL)
1919 		RETURN_ERROR(B_IO_ERROR);
1920 
1921 	// The entry got removed from the duplicate node, but we might want to free
1922 	// it now in case it's empty
1923 
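	// This loop normally runs only once; it is repeated (via "continue")
	// when freeing a node leaves a single remaining duplicate node that is
	// small enough to be turned back into a fragment.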
1924 	while (true) {
1925 		off_t left = duplicate->LeftLink();
1926 		off_t right = duplicate->RightLink();
1927 		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1928 
1929 		if ((isLast && arrayCount == 1) || arrayCount == 0) {
1930 			// Free the empty duplicate page, link its siblings together, and
1931 			// update the duplicate link if needed (i.e. when we either remove
1932 			// the last duplicate node or have a new first one)
1933 
1934 			if (left == BPLUSTREE_NULL) {
1935 				// the duplicate link points to us
1936 				if (cached.MakeWritable(transaction) == NULL)
1937 					return B_IO_ERROR;
1938 
1939 				if (arrayCount == 1) {
1940 					// This is the last node, and there is only one value left;
1941 					// replace the duplicate link with that value; it's not
1942 					// a duplicate anymore
1943 					values[index] = array->values[0];
1944 				} else {
1945 					// Move the duplicate link to the next node
1946 					values[index] = HOST_ENDIAN_TO_BFS_INT64(
1947 						bplustree_node::MakeLink(
1948 							BPLUSTREE_DUPLICATE_NODE, right));
1949 				}
1950 			}
1951 
1952 			status_t status = cachedDuplicate.Free(transaction,
1953 				duplicateOffset);
1954 			if (status != B_OK)
1955 				return status;
1956 
1957 			if (left != BPLUSTREE_NULL
1958 				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
1959 						false)) != NULL) {
1960 				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1961 
1962 				// If the next node is the last node, we need to free that node
1963 				// and convert the duplicate entry back into a normal entry
1964 				array = duplicate->DuplicateArray();
1965 				arrayCount = array->Count();
1966 				if (right == BPLUSTREE_NULL
1967 					&& duplicate->LeftLink() == BPLUSTREE_NULL
1968 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1969 					duplicateOffset = left;
1970 					continue;
1971 				}
1972 			}
1973 			if (right != BPLUSTREE_NULL
1974 				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
1975 						right, false)) != NULL) {
1976 				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1977 
1978 				// Again, we may need to turn the duplicate entry back into a
1979 				// normal entry
1980 				array = duplicate->DuplicateArray();
1981 				arrayCount = array->Count();
1982 				if (left == BPLUSTREE_NULL
1983 					&& duplicate->RightLink() == BPLUSTREE_NULL
1984 					&& arrayCount <= NUM_FRAGMENT_VALUES) {
1985 					duplicateOffset = right;
1986 					continue;
1987 				}
1988 			}
1989 			return B_OK;
1990 		}
1991 		if (isLast && arrayCount <= NUM_FRAGMENT_VALUES) {
1992 			// If the number of entries fits in a duplicate fragment, then
1993 			// either find a free fragment node, or convert this node to a
1994 			// fragment node.
1995 			CachedNode cachedOther(this);
1996 
1997 			bplustree_node* fragment = NULL;
1998 			uint32 fragmentIndex = 0;
1999 			off_t offset;
2000 			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
2001 					&offset, &fragment, &fragmentIndex) == B_OK) {
2002 				// move to other node
2003 				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
2004 				memcpy(target, array,
2005 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2006 
2007 				cachedDuplicate.Free(transaction, duplicateOffset);
2008 				duplicateOffset = offset;
2009 			} else {
2010 				// convert node
2011 				memmove(duplicate, array,
2012 					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2013 				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
2014 					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
2015 			}
2016 
2017 			if (cached.MakeWritable(transaction) == NULL)
2018 				return B_IO_ERROR;
2019 
2020 			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
2021 				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
2022 		}
2023 		return B_OK;
2024 	}
2025 }
2026 
2027 
2028 /*!	Removes the key with the given index from the specified node.
2029 	Since it has to get the key from the node anyway (to obtain its
2030 	pointer), there is no need to pass the key and its length, although
2031 	the calling method (BPlusTree::Remove()) has this data.
2032 */
2033 void
2034 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
2035 {
2036 	// should never happen, but who knows?
2037 	if (index > node->NumKeys() && node->NumKeys() > 0) {
2038 		FATAL(("Asked to remove key outside limits: %u, inode %" B_PRIdOFF
2039 			"\n", index, fStream->ID()));
2040 		return;
2041 	}
2042 
2043 	Unaligned<off_t>* values = node->Values();
2044 
2045 	// if we would have to drop the overflow link, drop
2046 	// the last key instead and update the overflow link
2047 	// to the value of that one
2048 	if (!node->IsLeaf() && index == node->NumKeys())
2049 		node->overflow_link = values[--index];
2050 
2051 	uint16 length = 0;
2052 	uint8* key = node->KeyAt(index, &length);
2053 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
2054 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2055 		FATAL(("Key length too long: %s, %u inode %" B_PRIdOFF "\n", key, length,
2056 			fStream->ID()));
2057 		fStream->GetVolume()->Panic();
2058 		return;
2059 	}
2060 
2061 	Unaligned<uint16>* keyLengths = node->KeyLengths();
2062 	uint8* keys = node->Keys();
2063 
2064 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
2065 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
2066 		node->AllKeyLength() - length);
2067 
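	// The key lengths and values are stored behind the key data, so their
	// positions depend on the updated key count and total key length; get
	// the new locations before moving anything around.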
2068 	Unaligned<off_t>* newValues = node->Values();
2069 	Unaligned<uint16>* newKeyLengths = node->KeyLengths();
2070 
2071 	// move key data
2072 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
2073 
2074 	// move and update key lengths
2075 	if (index > 0 && newKeyLengths != keyLengths)
2076 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
2077 	for (uint16 i = index; i < node->NumKeys(); i++) {
2078 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
2079 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
2080 	}
2081 
2082 	// move values
2083 	if (index > 0)
2084 		memmove(newValues, values, index * sizeof(off_t));
2085 	if (node->NumKeys() > index) {
2086 		memmove(newValues + index, values + index + 1,
2087 			(node->NumKeys() - index) * sizeof(off_t));
2088 	}
2089 }
2090 
2091 
2092 /*!	Removes the specified key from the tree. The "value" parameter is only used
2093 	for trees which allow duplicates, so you may safely ignore it.
2094 	It's not an optional parameter, so at least you have to think about it.
2095 	You need to have the inode write locked.
2096 */
2097 status_t
2098 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
2099 	off_t value)
2100 {
2101 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2102 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2103 		RETURN_ERROR(B_BAD_VALUE);
2104 
2105 	ASSERT_WRITE_LOCKED_INODE(fStream);
2106 
2107 	Stack<node_and_key> stack;
2108 	if (_SeekDown(stack, key, keyLength) != B_OK)
2109 		RETURN_ERROR(B_ERROR);
2110 
2111 	node_and_key nodeAndKey;
2112 	const bplustree_node* node;
2113 
2114 	CachedNode cached(this);
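	// Walk up the path collected by _SeekDown(): the leaf node is popped
	// first, its parents are only visited if a node became empty and the
	// key referring to it has to be removed one level above as well.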
2115 	while (stack.Pop(&nodeAndKey)
2116 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
2117 #ifdef DEBUG
2118 		NodeChecker checker(node, fNodeSize, "remove");
2119 #endif
2120 		if (node->IsLeaf()) {
2121 			// first round, check for duplicate entries
2122 			status_t status = _FindKey(node, key, keyLength,
2123 				&nodeAndKey.keyIndex);
2124 			if (status != B_OK)
2125 				RETURN_ERROR(status);
2126 
2127 			// Is this a duplicate entry?
2128 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
2129 					node->Values()[nodeAndKey.keyIndex]))) {
2130 				if (fAllowDuplicates) {
2131 					return _RemoveDuplicate(transaction, node, cached,
2132 						nodeAndKey.keyIndex, value);
2133 				}
2134 
2135 				FATAL(("duplicate node found where no duplicates are "
2136 					"allowed, inode %" B_PRIdOFF "!\n", fStream->ID()));
2137 				RETURN_ERROR(B_ERROR);
2138 			} else {
2139 				if (node->Values()[nodeAndKey.keyIndex] != value)
2140 					return B_ENTRY_NOT_FOUND;
2141 
2142 				// If we remove the last key, the iterator will be set to
2143 				// the next node after the current one - if there aren't any
2144 				// more nodes, we need a way to prevent the TreeIterators from
2145 				// touching the old node again; we use BPLUSTREE_FREE for this
2146 				off_t next = node->RightLink() == BPLUSTREE_NULL
2147 					? BPLUSTREE_FREE : node->RightLink();
2148 				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
2149 					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, -1);
2150 			}
2151 		}
2152 
2153 		bplustree_node* writableNode = cached.MakeWritable(transaction);
2154 		if (writableNode == NULL)
2155 			return B_IO_ERROR;
2156 
2157 		// if it's an empty root node, we have to convert it
2158 		// to a leaf node by dropping the overflow link, or,
2159 		// if it's already a leaf node, just empty it
2160 		if (nodeAndKey.nodeOffset == fHeader.RootNode()
2161 			&& (node->NumKeys() == 0
2162 				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
2163 			writableNode->overflow_link
2164 				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2165 			writableNode->all_key_count = 0;
2166 			writableNode->all_key_length = 0;
2167 
2168 			// if we've made a leaf node out of the root node, we need
2169 			// to reset the maximum number of levels in the header
2170 			if (fHeader.MaxNumberOfLevels() != 1) {
2171 				CachedNode cached(this);
2172 				bplustree_header* header
2173 					= cached.SetToWritableHeader(transaction);
2174 				if (header == NULL)
2175 					return B_IO_ERROR;
2176 
2177 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
2178 			}
2179 			return B_OK;
2180 		}
2181 
2182 		// if there is only one key left, we don't have to remove
2183 		// it, we can just dump the node (index nodes still have
2184 		// the overflow link, so we have to drop the last key)
2185 		if (writableNode->NumKeys() > 1
2186 			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
2187 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
2188 			return B_OK;
2189 		}
2190 
2191 		// when we are here, we can just free the node, but
2192 		// we have to update the right/left link of the
2193 		// siblings first
2194 		CachedNode otherCached(this);
2195 		bplustree_node* other = otherCached.SetToWritable(transaction,
2196 			writableNode->LeftLink());
2197 		if (other != NULL)
2198 			other->right_link = writableNode->right_link;
2199 
2200 		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
2201 				!= NULL) {
2202 			other->left_link = writableNode->left_link;
2203 		}
2204 
2205 		cached.Free(transaction, nodeAndKey.nodeOffset);
2206 	}
2207 	RETURN_ERROR(B_ERROR);
2208 }
2209 
2210 
2211 /*!	Replaces the value for the key in the tree.
2212 	Returns B_OK if the key could be found and its value replaced,
2213 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
2214 	to indicate that something went terribly wrong.
2215 	Note that this doesn't work with duplicates - it will just
2216 	return B_BAD_TYPE if you call this function on a tree where
2217 	duplicates are allowed.
2218 	You need to have the inode write locked.
2219 */
2220 status_t
2221 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
2222 	off_t value)
2223 {
2224 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2225 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2226 		|| key == NULL)
2227 		RETURN_ERROR(B_BAD_VALUE);
2228 
2229 	if (fAllowDuplicates)
2230 		RETURN_ERROR(B_BAD_TYPE);
2231 
2232 	ASSERT_WRITE_LOCKED_INODE(fStream);
2233 
2234 	off_t nodeOffset = fHeader.RootNode();
2235 	CachedNode cached(this);
2236 	const bplustree_node* node;
2237 
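	// Descend from the root; a node without an overflow link is a leaf,
	// and only there the value can be replaced.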
2238 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2239 		uint16 keyIndex = 0;
2240 		off_t nextOffset;
2241 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2242 			&nextOffset);
2243 
2244 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2245 			if (status == B_OK) {
2246 				bplustree_node* writableNode = cached.MakeWritable(transaction);
2247 				if (writableNode != NULL) {
2248 					writableNode->Values()[keyIndex]
2249 						= HOST_ENDIAN_TO_BFS_INT64(value);
2250 				} else
2251 					status = B_IO_ERROR;
2252 			}
2253 
2254 			return status;
2255 		} else if (nextOffset == nodeOffset)
2256 			RETURN_ERROR(B_ERROR);
2257 
2258 		nodeOffset = nextOffset;
2259 	}
2260 	RETURN_ERROR(B_ERROR);
2261 }
2262 #endif // !_BOOT_MODE
2263 
2264 
2265 /*!	Searches the key in the tree, and stores the offset found in _value,
2266 	if successful.
2267 	It's very similar to BPlusTree::_SeekDown(), but doesn't fill a stack
2268 	while it descends the tree.
2269 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
2270 	It can also return other errors to indicate that something went wrong.
2271 	Note that this doesn't work with duplicates - it will just return
2272 	B_BAD_TYPE if you call this function on a tree where duplicates are
2273 	allowed.
2274 	You need to have the inode read or write locked.
2275 */
2276 status_t
2277 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
2278 {
2279 	if (key == NULL || keyLength < BPLUSTREE_MIN_KEY_LENGTH)
2280 		RETURN_ERROR(B_BAD_VALUE);
2281 
2282 	if (keyLength > BPLUSTREE_MAX_KEY_LENGTH)
2283 		RETURN_ERROR(B_NAME_TOO_LONG);
2284 
2285 	if (fAllowDuplicates)
2286 		RETURN_ERROR(B_BAD_TYPE);
2287 
2288 #if !_BOOT_MODE
2289 	ASSERT_READ_LOCKED_INODE(fStream);
2290 #endif
2291 
2292 	off_t nodeOffset = fHeader.RootNode();
2293 	CachedNode cached(this);
2294 	const bplustree_node* node;
2295 
2296 #ifdef DEBUG
2297 	int32 levels = 0;
2298 #endif
2299 
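	// Iterative descent: _FindKey() either locates the key in a leaf node,
	// or returns the child node to continue with in nextOffset.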
2300 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2301 		uint16 keyIndex = 0;
2302 		off_t nextOffset;
2303 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
2304 			&nextOffset);
2305 
2306 #ifdef DEBUG
2307 		levels++;
2308 #endif
2309 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2310 			if (status == B_OK && _value != NULL)
2311 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
2312 
2313 #ifdef DEBUG
2314 			if (levels != (int32)fHeader.MaxNumberOfLevels())
2315 				DEBUGGER(("levels don't match"));
2316 #endif
2317 			return status;
2318 		} else if (nextOffset == nodeOffset)
2319 			RETURN_ERROR(B_ERROR);
2320 
2321 		nodeOffset = nextOffset;
2322 	}
2323 	FATAL(("b+tree node at %" B_PRIdOFF " could not be loaded, inode %"
2324 		B_PRIdOFF "\n", nodeOffset, fStream->ID()));
2325 	RETURN_ERROR(B_ERROR);
2326 }
2327 
2328 
2329 #if !_BOOT_MODE
2330 status_t
2331 BPlusTree::_ValidateChildren(TreeCheck& check, uint32 level, off_t offset,
2332 	const uint8* largestKey, uint16 largestKeyLength,
2333 	const bplustree_node* parent)
2334 {
2335 	if (parent->CheckIntegrity(fNodeSize) != B_OK) {
2336 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " integrity check "
2337 			"failed!\n", fStream->ID(), offset);
2338 		check.FoundError();
2339 		return B_OK;
2340 	}
2341 	if (level >= fHeader.MaxNumberOfLevels()) {
2342 		dprintf("inode %" B_PRIdOFF ": maximum level surpassed at %" B_PRIdOFF
2343 			"!\n", fStream->ID(), offset);
2344 		check.FoundError();
2345 		return B_OK;
2346 	}
2347 
2348 	check.SetLevel(level);
2349 
2350 	if (check.Visited(offset)) {
2351 		dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " already visited!\n",
2352 			fStream->ID(), offset);
2353 		check.FoundError();
2354 		return B_OK;
2355 	}
2356 
2357 	check.SetVisited(offset);
2358 
2359 	uint32 count = parent->NumKeys();
2360 	Unaligned<off_t>* values = parent->Values();
2361 	off_t lastOffset = check.PreviousOffset(level);
2362 	CachedNode cached(this);
2363 
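	// lastOffset is the left sibling of the child checked next; it starts
	// with the last node visited on this level under the previous parent,
	// so that sibling links can be verified across parent boundaries.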
2364 	for (uint32 i = 0; i < count; i++) {
2365 		uint16 keyLength;
2366 		uint8* key = parent->KeyAt(i, &keyLength);
2367 		if (largestKey != NULL) {
2368 			int result = _CompareKeys(key, keyLength, largestKey,
2369 				largestKeyLength);
2370 			if (result > 0 || (result == 0 && i != count - 1)) {
2371 				dprintf("inode %" B_PRIdOFF ": node %" B_PRIdOFF " key %"
2372 					B_PRIu32 " is larger than it should be!\n",
2373 					fStream->ID(), offset, i);
2374 				check.FoundError();
2375 			}
2376 		}
2377 
2378 		off_t childOffset = BFS_ENDIAN_TO_HOST_INT64(values[i]);
2379 		if (bplustree_node::IsDuplicate(childOffset)) {
2380 			// Walk the duplicate nodes
2381 			off_t duplicateOffset = bplustree_node::FragmentOffset(childOffset);
2382 			off_t lastDuplicateOffset = BPLUSTREE_NULL;
2383 
2384 			while (duplicateOffset != BPLUSTREE_NULL) {
2385 				const bplustree_node* node;
2386 				status_t status = cached.SetTo(duplicateOffset, &node, false);
2387 				if (status != B_OK) {
2388 					if (status == B_IO_ERROR)
2389 						return B_IO_ERROR;
2390 
2391 					dprintf("inode %" B_PRIdOFF ": duplicate node at %"
2392 						B_PRIdOFF " could not be read: %s\n", fStream->ID(),
2393 						duplicateOffset, strerror(status));
2394 					check.FoundError();
2395 					break;
2396 				}
2397 
2398 				bool isFragmentNode = bplustree_node::LinkType(childOffset)
2399 					== BPLUSTREE_DUPLICATE_FRAGMENT;
2400 				bool isKnownFragment = isFragmentNode
2401 					&& check.VisitedFragment(duplicateOffset);
2402 
2403 				if (!isKnownFragment && check.Visited(duplicateOffset)) {
2404 					dprintf("inode %" B_PRIdOFF ": %s node at %"
2405 						B_PRIdOFF " already visited, referenced from %"
2406 						B_PRIdOFF "!\n", fStream->ID(),
2407 						isFragmentNode ? "fragment" : "duplicate",
2408 						duplicateOffset, offset);
2409 					check.FoundError();
2410 					break;
2411 				}
2412 
2413 				// Fragment nodes may be visited more than once from different
2414 				// places
2415 				if (!check.Visited(duplicateOffset))
2416 					check.SetVisited(duplicateOffset);
2417 				if (!isKnownFragment && isFragmentNode)
2418 					check.SetVisitedFragment(duplicateOffset);
2419 
2420 				duplicate_array* array;
2421 				int32 minSize;
2422 				int32 maxSize;
2423 				if (isFragmentNode) {
2424 					array = node->FragmentAt(
2425 						bplustree_node::FragmentIndex(childOffset));
2426 					minSize = 2;
2427 					maxSize = NUM_FRAGMENT_VALUES;
2428 				} else {
2429 					array = node->DuplicateArray();
2430 					minSize = 1;
2431 					maxSize = NUM_DUPLICATE_VALUES;
2432 				}
2433 				int32 arrayCount = array->Count();
2434 
2435 				if (arrayCount < minSize || arrayCount > maxSize) {
2436 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2437 						" has invalid array size %" B_PRId32 "!\n",
2438 						fStream->ID(), duplicateOffset, arrayCount);
2439 					check.FoundError();
2440 				} else {
2441 					// Simple check if the values in the array may be valid
2442 					for (int32 j = 0; j < arrayCount; j++) {
2443 						if (!fStream->GetVolume()->IsValidInodeBlock(
2444 								array->ValueAt(j))) {
2445 							dprintf("inode %" B_PRIdOFF ": duplicate at %"
2446 								B_PRIdOFF " contains invalid block %" B_PRIdOFF
2447 								" at %" B_PRId32 "!\n", fStream->ID(),
2448 								duplicateOffset, array->ValueAt(j), j);
2449 							check.FoundError();
2450 							break;
2451 						}
2452 					}
2453 				}
2454 
2455 				// A fragment node is not linked (and does not have valid links)
2456 				if (isFragmentNode)
2457 					break;
2458 
2459 				if (node->LeftLink() != lastDuplicateOffset) {
2460 					dprintf("inode %" B_PRIdOFF ": duplicate at %" B_PRIdOFF
2461 						" has wrong left link %" B_PRIdOFF ", expected %"
2462 						B_PRIdOFF "!\n", fStream->ID(), duplicateOffset,
2463 						node->LeftLink(), lastDuplicateOffset);
2464 					check.FoundError();
2465 				}
2466 
2467 				lastDuplicateOffset = duplicateOffset;
2468 				duplicateOffset = node->RightLink();
2469 			}
2470 		} else if (!parent->IsLeaf()) {
2471 			// Test a regular child node recursively
2472 			off_t nextOffset = parent->OverflowLink();
2473 			if (i < count - 1)
2474 				nextOffset = BFS_ENDIAN_TO_HOST_INT64(values[i + 1]);
2475 
2476 			if (i == 0 && lastOffset != BPLUSTREE_NULL) {
2477 				// Test right link of the previous node
2478 				const bplustree_node* previous = cached.SetTo(lastOffset, true);
2479 				if (previous == NULL)
2480 					return B_IO_ERROR;
2481 
2482 				if (previous->RightLink() != childOffset) {
2483 					dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2484 						"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2485 						"!\n", fStream->ID(), lastOffset, previous->RightLink(),
2486 						childOffset);
2487 					check.FoundError();
2488 				}
2489 			}
2490 
2491 			status_t status = _ValidateChild(check, cached, level, childOffset,
2492 				lastOffset, nextOffset, key, keyLength);
2493 			if (status != B_OK)
2494 				return status;
2495 		} else if (!fStream->GetVolume()->IsValidInodeBlock(childOffset)) {
2496 			dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " contains "
2497 				"invalid block %" B_PRIdOFF " at %" B_PRId32 "!\n",
2498 				fStream->ID(), offset, childOffset, i);
2499 			check.FoundError();
2500 		}
2501 
2502 		lastOffset = childOffset;
2503 	}
2504 
2505 	if (parent->OverflowLink() != BPLUSTREE_NULL) {
2506 		off_t childOffset = parent->OverflowLink();
2507 		status_t status = _ValidateChild(check, cached, level, childOffset,
2508 			lastOffset, 0, NULL, 0);
2509 		if (status != B_OK)
2510 			return status;
2511 
2512 		lastOffset = childOffset;
2513 	}
2514 
2515 	check.SetPreviousOffset(level, lastOffset);
2516 	return B_OK;
2517 }
2518 
2519 
2520 status_t
2521 BPlusTree::_ValidateChild(TreeCheck& check, CachedNode& cached, uint32 level,
2522 	off_t offset, off_t lastOffset, off_t nextOffset,
2523 	const uint8* key, uint16 keyLength)
2524 {
2525 	const bplustree_node* node;
2526 	status_t status = cached.SetTo(offset, &node, true);
2527 	if (status != B_OK) {
2528 		if (status == B_IO_ERROR)
2529 			return B_IO_ERROR;
2530 
2531 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " could not be "
2532 			"read: %s\n", fStream->ID(), offset, strerror(status));
2533 		check.FoundError();
2534 		return B_OK;
2535 	}
2536 
2537 	if (node->LeftLink() != lastOffset) {
2538 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2539 			"wrong left link %" B_PRIdOFF ", expected %" B_PRIdOFF
2540 			"!\n", fStream->ID(), offset, node->LeftLink(), lastOffset);
2541 		check.FoundError();
2542 	}
2543 
2544 	if (nextOffset != 0 && node->RightLink() != nextOffset) {
2545 		dprintf("inode %" B_PRIdOFF ": node at %" B_PRIdOFF " has "
2546 			"wrong right link %" B_PRIdOFF ", expected %" B_PRIdOFF
2547 			"!\n", fStream->ID(), offset, node->RightLink(), nextOffset);
2548 		check.FoundError();
2549 	}
2550 
2551 	return _ValidateChildren(check, level + 1, offset, key, keyLength, node);
2552 }
2553 #endif // !_BOOT_MODE
2554 
2555 
2556 //	#pragma mark -
2557 
2558 
2559 TreeIterator::TreeIterator(BPlusTree* tree)
2560 	:
2561 	fTree(tree),
2562 	fCurrentNodeOffset(BPLUSTREE_NULL)
2563 {
2564 #if !_BOOT_MODE
2565 	tree->_AddIterator(this);
2566 #endif
2567 }
2568 
2569 
2570 TreeIterator::~TreeIterator()
2571 {
2572 #if !_BOOT_MODE
2573 	if (fTree)
2574 		fTree->_RemoveIterator(this);
2575 #endif
2576 }
2577 
2578 
2579 status_t
2580 TreeIterator::Goto(int8 to)
2581 {
2582 	if (fTree == NULL || fTree->fStream == NULL)
2583 		RETURN_ERROR(B_BAD_VALUE);
2584 
2585 #if !_BOOT_MODE
2586 	// lock access to stream
2587 	InodeReadLocker locker(fTree->fStream);
2588 #endif
2589 
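	// Descend to the first or last leaf node; the current key is set just
	// outside the valid range, so that the next Traverse() call returns
	// the first (BPLUSTREE_BEGIN) or last (BPLUSTREE_END) key.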
2590 	off_t nodeOffset = fTree->fHeader.RootNode();
2591 	CachedNode cached(fTree);
2592 	const bplustree_node* node;
2593 
2594 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2595 		// is the node a leaf node?
2596 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2597 			fCurrentNodeOffset = nodeOffset;
2598 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
2599 			fDuplicateNode = BPLUSTREE_NULL;
2600 
2601 			return B_OK;
2602 		}
2603 
2604 		// get the next node offset depending on the direction (and if there
2605 		// are any keys in that node at all)
2606 		off_t nextOffset;
2607 		if (to == BPLUSTREE_END || node->all_key_count == 0)
2608 			nextOffset = node->OverflowLink();
2609 		else {
2610 			if (node->AllKeyLength() > fTree->fNodeSize
2611 				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
2612 					- 8 * node->NumKeys())
2613 				RETURN_ERROR(B_ERROR);
2614 
2615 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
2616 		}
2617 		if (nextOffset == nodeOffset)
2618 			break;
2619 
2620 		nodeOffset = nextOffset;
2621 	}
2622 	FATAL(("%s fails\n", __FUNCTION__));
2623 
2624 	RETURN_ERROR(B_ERROR);
2625 }
2626 
2627 
2628 /*!	Iterates through the tree in the specified direction.
2629 	When it iterates through duplicates, the "key" is only updated for the
2630 	first entry - if you need to know when this happens, use the "duplicate"
2631 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
2632 	the other duplicates.
2633 	That's not too nice, but saves the 256 bytes that would be needed to
2634 	store the last key - if this will ever become an issue, it will be
2635 	easy to change.
2636 	The other advantage of this is that queries can skip all duplicates
2637 	at once when they are not relevant to them.
2638 */
2639 status_t
2640 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
2641 	uint16 maxLength, off_t* value, uint16* duplicate)
2642 {
2643 	if (fTree == NULL)
2644 		return B_INTERRUPTED;
2645 
2646 	bool forward = direction == BPLUSTREE_FORWARD;
2647 
2648 	if (fCurrentNodeOffset == BPLUSTREE_NULL
2649 		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) != B_OK)
2650 		RETURN_ERROR(B_ERROR);
2651 
2652 	// if the tree was emptied since the last call
2653 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
2654 		return B_ENTRY_NOT_FOUND;
2655 
2656 #if !_BOOT_MODE
2657 	// lock access to stream
2658 	InodeReadLocker locker(fTree->fStream);
2659 #endif
2660 
2661 	CachedNode cached(fTree);
2662 	const bplustree_node* node;
2663 
2664 	if (fDuplicateNode != BPLUSTREE_NULL) {
2665 		// regardless of traverse direction the duplicates are always presented
2666 		// in the same order; since they are all considered equal, this
2667 		// shouldn't cause any problems
2668 
2669 		if (!fIsFragment || fDuplicate < fNumDuplicates) {
2670 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2671 				false);
2672 		} else
2673 			node = NULL;
2674 
2675 		if (node != NULL) {
2676 			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
2677 				// If the node is out of duplicates, we go directly to the next
2678 				// one
2679 				fDuplicateNode = node->RightLink();
2680 				if (fDuplicateNode != BPLUSTREE_NULL
2681 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
2682 					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
2683 						false);
2684 					fDuplicate = 0;
2685 				}
2686 			}
2687 			if (fDuplicate < fNumDuplicates) {
2688 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
2689 					fDuplicate++);
2690 				if (duplicate)
2691 					*duplicate = 2;
2692 				return B_OK;
2693 			}
2694 		}
2695 		fDuplicateNode = BPLUSTREE_NULL;
2696 	}
2697 
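	// Not (or no longer) inside a duplicate - advance within the leaf
	// nodes. Remember the current position, so it can be restored when
	// there is no further entry or the key buffer is too small.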
2698 	off_t savedNodeOffset = fCurrentNodeOffset;
2699 	int32 savedKey = fCurrentKey;
2700 
2701 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
2702 		RETURN_ERROR(B_ERROR);
2703 
2704 	if (duplicate)
2705 		*duplicate = 0;
2706 
2707 	fCurrentKey += direction;
2708 
2709 	// is the current key in the current node?
2710 	while ((forward && fCurrentKey >= node->NumKeys())
2711 			|| (!forward && fCurrentKey < 0)) {
2712 		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2713 
2714 		// are there any more nodes?
2715 		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2716 			node = cached.SetTo(fCurrentNodeOffset);
2717 			if (!node)
2718 				RETURN_ERROR(B_ERROR);
2719 
2720 			// reset current key
2721 			fCurrentKey = forward ? 0 : node->NumKeys() - 1;
2722 		} else {
2723 			// there are no nodes left, so turn back to the last key
2724 			fCurrentNodeOffset = savedNodeOffset;
2725 			fCurrentKey = savedKey;
2726 
2727 			return B_ENTRY_NOT_FOUND;
2728 		}
2729 	}
2730 
2731 	if (node->all_key_count == 0)
2732 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
2733 
2734 	uint16 length = 0;
2735 	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
2736 	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
2737 			> (uint8*)node + fTree->fNodeSize
2738 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2739 #if !_BOOT_MODE
2740 		fTree->fStream->GetVolume()->Panic();
2741 #endif
2742 		RETURN_ERROR(B_BAD_DATA);
2743 	}
2744 
2745 	// include the termination for string types
2746 	bool needsTermination = fTree->fHeader.DataType() == BPLUSTREE_STRING_TYPE;
2747 	if (length + (needsTermination ? 1 : 0) > maxLength) {
2748 		if (!needsTermination || maxLength < INODE_FILE_NAME_LENGTH) {
2749 			// The buffer is too small, restore the last key and return
2750 			// an error
2751 			fCurrentNodeOffset = savedNodeOffset;
2752 			fCurrentKey = savedKey;
2753 			return B_BUFFER_OVERFLOW;
2754 		}
2755 
2756 		// Always cut off strings at the maximum buffer size, and leave
2757 		// room for a terminating null byte.
2758 		// This allows handling larger key sizes gracefully.
2759 		length = maxLength - 1;
2760 	}
2761 
2762 	memcpy(key, keyStart, length);
2763 
2764 	if (needsTermination)
2765 		((char*)key)[length] = '\0';
2766 
2767 	*keyLength = length;
2768 
2769 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2770 
2771 	// duplicate fragments?
2772 	uint8 type = bplustree_node::LinkType(offset);
2773 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
2774 		|| type == BPLUSTREE_DUPLICATE_NODE) {
2775 		fDuplicateNode = offset;
2776 
2777 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2778 			false);
2779 		if (node == NULL)
2780 			RETURN_ERROR(B_ERROR);
2781 
2782 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2783 
2784 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2785 		if (fNumDuplicates) {
2786 			offset = node->DuplicateAt(offset, fIsFragment, 0);
2787 			fDuplicate = 1;
2788 			if (duplicate)
2789 				*duplicate = 1;
2790 		} else {
2791 			// Shouldn't happen, but we're dealing here with potentially
2792 			// corrupt disks...
2793 			fDuplicateNode = BPLUSTREE_NULL;
2794 			offset = 0;
2795 		}
2796 	}
2797 	*value = offset;
2798 
2799 	return B_OK;
2800 }
2801 
2802 
2803 /*!	This is more or less a copy of BPlusTree::Find() - but it just
2804 	sets the current position in the iterator, regardless of whether the
2805 	key could be found or not.
2806 */
2807 status_t
2808 TreeIterator::Find(const uint8* key, uint16 keyLength)
2809 {
2810 	if (fTree == NULL)
2811 		return B_INTERRUPTED;
2812 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2813 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2814 		|| key == NULL)
2815 		RETURN_ERROR(B_BAD_VALUE);
2816 
2817 #if !_BOOT_MODE
2818 	// lock access to stream
2819 	InodeReadLocker locker(fTree->fStream);
2820 #endif
2821 
2822 	off_t nodeOffset = fTree->fHeader.RootNode();
2823 
2824 	CachedNode cached(fTree);
2825 	const bplustree_node* node;
2826 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2827 		uint16 keyIndex = 0;
2828 		off_t nextOffset = 0;
2829 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2830 			&nextOffset);
2831 
2832 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2833 			fCurrentNodeOffset = nodeOffset;
2834 			fCurrentKey = keyIndex - 1;
2835 			fDuplicateNode = BPLUSTREE_NULL;
2836 
2837 			return status;
2838 		} else if (nextOffset == nodeOffset)
2839 			RETURN_ERROR(B_ERROR);
2840 
2841 		nodeOffset = nextOffset;
2842 	}
2843 	RETURN_ERROR(B_ERROR);
2844 }
2845 
2846 
2847 void
2848 TreeIterator::SkipDuplicates()
2849 {
2850 	fDuplicateNode = BPLUSTREE_NULL;
2851 }
2852 
2853 
2854 void
2855 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
2856 	uint16 splitAt, int8 change)
2857 {
2858 	if (offset != fCurrentNodeOffset)
2859 		return;
2860 
2861 	if (nextOffset != BPLUSTREE_NULL) {
2862 		fCurrentNodeOffset = nextOffset;
2863 		if (splitAt <= fCurrentKey) {
2864 			fCurrentKey -= splitAt;
2865 			keyIndex -= splitAt;
2866 		}
2867 	}
2868 
2869 	// Adjust fCurrentKey to point to the same key as before.
2870 	// Note that if a key is inserted at the current position
2871 	// it won't be included in this tree transition.
2872 	if (keyIndex <= fCurrentKey)
2873 		fCurrentKey += change;
2874 
2875 	// TODO: duplicate handling!
2876 }
2877 
2878 
2879 void
2880 TreeIterator::Stop()
2881 {
2882 	fTree = NULL;
2883 }
2884 
2885 
2886 #ifdef DEBUG
2887 void
2888 TreeIterator::Dump()
2889 {
2890 	__out("TreeIterator at %p:\n", this);
2891 	__out("\tfTree = %p\n", fTree);
2892 	__out("\tfCurrentNodeOffset = %" B_PRId64 "\n", fCurrentNodeOffset);
2893 	__out("\tfCurrentKey = %d\n", (int)fCurrentKey);
2894 	__out("\tfDuplicateNode = %" B_PRId64 " (%" B_PRId64 ", 0x%" B_PRIx64 ")\n",
2895 		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
2896 		fDuplicateNode);
2897 	__out("\tfDuplicate = %u\n", fDuplicate);
2898 	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
2899 	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
2900 }
2901 #endif
2902 
2903 
2904 // #pragma mark -
2905 
2906 
2907 bool
2908 bplustree_header::IsValid() const
2909 {
2910 	return Magic() == BPLUSTREE_MAGIC
2911 		&& (RootNode() % NodeSize()) == 0
2912 		&& IsValidLink(RootNode())
2913 		&& IsValidLink(FreeNode());
2914 }
2915 
2916 
2917 // #pragma mark -
2918 
2919 
2920 void
2921 bplustree_node::Initialize()
2922 {
2923 	left_link = right_link = overflow_link
2924 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2925 	all_key_count = 0;
2926 	all_key_length = 0;
2927 }
2928 
2929 
2930 uint8*
2931 bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2932 {
2933 	if (index < 0 || index > NumKeys())
2934 		return NULL;
2935 
2936 	uint8* keyStart = Keys();
2937 	Unaligned<uint16>* keyLengths = KeyLengths();
2938 
2939 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2940 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2941 	if (index > 0)
2942 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2943 
2944 	return keyStart;
2945 }
2946 
2947 
2948 uint8
2949 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2950 {
2951 	// the duplicate fragment handling is currently hard-coded to a node size
2952 	// of 1024 bytes - with future versions of BFS, this may be a problem
2953 
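	// A fragment link stores the fragment index in its lower ten bits; each
	// fragment occupies (NUM_FRAGMENT_VALUES + 1) off_t values, the first
	// of which is the count of its duplicate_array.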
2954 	if (isFragment) {
2955 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2956 
2957 		return ((off_t*)this)[fragment];
2958 	}
2959 	return OverflowLink();
2960 }
2961 
2962 
2963 off_t
2964 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2965 {
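	// Like CountDuplicates(), this is hard-coded to 1024 byte nodes: each
	// fragment takes eight off_t values, and "start" addresses the count
	// field of the array (the overflow link in a full duplicate node);
	// "+ 1" below skips the count to reach the stored values.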
2966 	uint32 start;
2967 	if (isFragment)
2968 		start = 8 * ((uint64)offset & 0x3ff);
2969 	else
2970 		start = 2;
2971 
2972 	return ((off_t*)this)[start + 1 + index];
2973 }
2974 
2975 
2976 /*!	Although the name suggests it, this function doesn't return the real
2977 	number of used fragments - it only counts up to two: it returns 0 if
2978 	no fragment is used, 1 if exactly one fragment is used, and 2 if at
2979 	least two fragments are used.
2980 */
2981 uint32
2982 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2983 {
2984 	uint32 used = 0;
2985 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2986 		duplicate_array* array = FragmentAt(i);
2987 		if (array->Count() > 0 && ++used > 1)
2988 			return used;
2989 	}
2990 	return used;
2991 }
2992 
2993 
2994 status_t
2995 bplustree_node::CheckIntegrity(uint32 nodeSize) const
2996 {
2997 	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2998 		DEBUGGER(("invalid node: key/length count"));
2999 
3000 	for (int32 i = 0; i < NumKeys(); i++) {
3001 		uint16 length = 0;
3002 		uint8* key = KeyAt(i, &length);
3003 		if (key + length + sizeof(off_t) + sizeof(uint16)
3004 				> (uint8*)this + nodeSize
3005 			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
3006 			dprintf("invalid node %p, key %d: keys corrupted\n", this, (int)i);
3007 			return B_BAD_DATA;
3008 		}
3009 		if (Values()[i] == -1) {
3010 			dprintf("invalid node %p, value %d: %" B_PRIdOFF ": values "
3011 				"corrupted\n", this, (int)i, Values()[i].value);
3012 			return B_BAD_DATA;
3013 		}
3014 	}
3015 	return B_OK;
3016 }
3017 
3018 
3019 // #pragma mark -
3020 
3021 
3022 #if !_BOOT_MODE
3023 BitmapArray::BitmapArray(size_t numBits)
3024 {
3025 	fSize = (numBits + 7) / 8;
3026 	fBitmap = (uint8*)calloc(fSize, 1);
3027 	fCountSet = 0;
3028 }
3029 
3030 
3031 BitmapArray::~BitmapArray()
3032 {
3033 	free(fBitmap);
3034 }
3035 
3036 
3037 status_t
3038 BitmapArray::InitCheck() const
3039 {
3040 	return fBitmap != NULL ? B_OK : B_NO_MEMORY;
3041 }
3042 
3043 
3044 bool
3045 BitmapArray::IsSet(size_t index) const
3046 {
3047 	uint32 byteIndex = index / 8;
3048 	if (byteIndex >= fSize)
3049 		return false;
3050 
3051 	return (fBitmap[byteIndex] & (1UL << (index & 0x7))) != 0;
3052 }
3053 
3054 
3055 void
3056 BitmapArray::Set(size_t index, bool set)
3057 {
3058 	uint32 byteIndex = index / 8;
3059 	if (byteIndex >= fSize)
3060 		return;
3061 
3062 	if (set) {
3063 		fBitmap[byteIndex] |= 1UL << (index & 0x7);
3064 		fCountSet++;
3065 	} else {
3066 		fBitmap[byteIndex] &= ~(1UL << (index & 0x7));
3067 		fCountSet--;
3068 	}
3069 }
3070 #endif // !_BOOT_MODE
3071 
3072 
3073 // #pragma mark -
3074 
3075 
3076 bool
3077 duplicate_array::_FindInternal(off_t value, int32& index) const
3078 {
3079 	int32 min = 0, max = Count() - 1;
3080 	off_t cmp;
3081 	while (min <= max) {
3082 		index = (min + max) / 2;
3083 
3084 		cmp = ValueAt(index) - value;
3085 		if (cmp < 0)
3086 			min = index + 1;
3087 		else if (cmp > 0)
3088 			max = index - 1;
3089 		else
3090 			return true;
3091 	}
3092 	return false;
3093 }
3094 
3095 
3096 void
3097 duplicate_array::Insert(off_t value)
3098 {
3099 	// if there are more than 8 values in this array, use a
3100 	// binary search; if not, just iterate linearly to find
3101 	// the insertion point
3102 	int32 size = Count();
3103 	int32 i = 0;
3104 	if (size > 8) {
3105 		if (!_FindInternal(value, i) && ValueAt(i) <= value)
3106 			i++;
3107 	} else {
3108 		for (i = 0; i < size; i++) {
3109 			if (ValueAt(i) > value)
3110 				break;
3111 		}
3112 	}
3113 
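	// make room at the insertion point, store the value, and bump the count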
3114 	memmove(&values[i + 1], &values[i], (size - i) * sizeof(off_t));
3115 	values[i] = HOST_ENDIAN_TO_BFS_INT64(value);
3116 	count = HOST_ENDIAN_TO_BFS_INT64(size + 1);
3117 }
3118 
3119 
3120 bool
3121 duplicate_array::Remove(off_t value)
3122 {
3123 	int32 index = Find(value);
3124 	if (index == -1)
3125 		return false;
3126 
3127 	int32 newSize = Count() - 1;
3128 	memmove(&values[index], &values[index + 1],
3129 		(newSize - index) * sizeof(off_t));
3130 	count = HOST_ENDIAN_TO_BFS_INT64(newSize);
3131 
3132 	return true;
3133 }
3134 
3135 
3136 #if _BOOT_MODE
3137 } // namespace BFS
3138 #endif
3139