xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 746cac055adc6ac3308c7bc2d29040fb95689cc9)
1 /*
2  * Copyright 2001-2008, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 //! B+Tree implementation
10 
11 
12 #include "Debug.h"
13 #include "BPlusTree.h"
14 #include "Inode.h"
15 #include "Utility.h"
16 
17 
#ifdef DEBUG
/*!	Debug-only RAII guard: verifies the integrity of a b+tree node when it
	is constructed and once more when it is destroyed, logging a message via
	dprintf() if either check fails.
*/
class NodeChecker {
public:
	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
		:
		fCheckedNode(node),
		fNodeSize(nodeSize),
		fLabel(text)
	{
		Check("integrity check failed on construction.");
	}

	~NodeChecker()
	{
		Check("integrity check failed on destruction.");
	}

	void
	Check(const char* message)
	{
		// CheckIntegrity() reports corruption with a status below B_OK
		if (fCheckedNode->CheckIntegrity(fNodeSize) < B_OK)
			dprintf("%s: %s\n", fLabel, message);
	}

private:
	const bplustree_node*	fCheckedNode;
	int32					fNodeSize;
	const char*				fLabel;
};
#endif
48 
49 
50 // Node Caching for the BPlusTree class
51 //
52 // With write support, there is the need for a function that allocates new
53 // nodes by either returning empty nodes, or by growing the file's data stream
54 //
55 // !! The CachedNode class assumes that you have properly locked the stream
56 // !! before asking for nodes.
57 //
58 // Note: This code will fail if the block size is smaller than the node size!
59 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
60 // is hard-coded to 1024 bytes, that's not an issue now.
61 
62 void
63 CachedNode::UnsetUnchanged(Transaction& transaction)
64 {
65 	if (fTree == NULL || fTree->fStream == NULL)
66 		return;
67 
68 	if (fNode != NULL) {
69 		void* cache = fTree->fStream->GetVolume()->BlockCache();
70 
71 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
72 		block_cache_put(cache, fBlockNumber);
73 		fNode = NULL;
74 	}
75 }
76 
77 
78 void
79 CachedNode::Unset()
80 {
81 	if (fTree == NULL || fTree->fStream == NULL)
82 		return;
83 
84 	if (fNode != NULL) {
85 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
86 			fBlockNumber);
87 		fNode = NULL;
88 	}
89 }
90 
91 
92 const bplustree_node*
93 CachedNode::SetTo(off_t offset, bool check)
94 {
95 	if (fTree == NULL || fTree->fStream == NULL) {
96 		REPORT_ERROR(B_BAD_VALUE);
97 		return NULL;
98 	}
99 
100 	Unset();
101 
102 	// You can only ask for nodes at valid positions - you can't
103 	// even access the b+tree header with this method (use SetToHeader()
104 	// instead)
105 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
106 		|| offset <= 0
107 		|| (offset % fTree->fNodeSize) != 0)
108 		return NULL;
109 
110 	if (InternalSetTo(NULL, offset) != NULL && check) {
111 		// sanity checks (links, all_key_count)
112 		if (!fTree->fHeader->CheckNode(fNode)) {
113 			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
114 				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
115 			return NULL;
116 		}
117 	}
118 	return fNode;
119 }
120 
121 
122 bplustree_node*
123 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
124 {
125 	if (fTree == NULL || fTree->fStream == NULL) {
126 		REPORT_ERROR(B_BAD_VALUE);
127 		return NULL;
128 	}
129 
130 	Unset();
131 
132 	// You can only ask for nodes at valid positions - you can't
133 	// even access the b+tree header with this method (use SetToHeader()
134 	// instead)
135 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
136 		|| offset <= 0
137 		|| (offset % fTree->fNodeSize) != 0)
138 		return NULL;
139 
140 	if (InternalSetTo(&transaction, offset) != NULL && check) {
141 		// sanity checks (links, all_key_count)
142 		if (!fTree->fHeader->CheckNode(fNode)) {
143 			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
144 				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
145 			return NULL;
146 		}
147 	}
148 	return fNode;
149 }
150 
151 
152 bplustree_node*
153 CachedNode::MakeWritable(Transaction& transaction)
154 {
155 	if (fNode == NULL)
156 		return NULL;
157 
158 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
159 			fBlockNumber, transaction.ID()) == B_OK)
160 		return fNode;
161 
162 	return NULL;
163 }
164 
165 
/*!	Convenience wrapper around MakeWritable() that casts the result to the
	header type - only meaningful when this CachedNode holds the b+tree
	header (offset 0).
*/
bplustree_header*
CachedNode::MakeWritableHeader(Transaction& transaction)
{
	return (bplustree_header*)MakeWritable(transaction);
}
171 
172 
/*!	Sets the cache to the read-only b+tree header, which lives at offset 0
	of the tree stream. Returns NULL on failure.
*/
const bplustree_header*
CachedNode::SetToHeader()
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// the header occupies the first node-sized chunk of the stream
	InternalSetTo(NULL, 0LL);
	return (bplustree_header*)fNode;
}
186 
187 
/*!	Sets the cache to the writable b+tree header (offset 0 of the tree
	stream) within \a transaction. Returns NULL on failure.
*/
bplustree_header*
CachedNode::SetToWritableHeader(Transaction& transaction)
{
	if (fTree == NULL || fTree->fStream == NULL) {
		REPORT_ERROR(B_BAD_VALUE);
		return NULL;
	}

	Unset();

	// the header occupies the first node-sized chunk of the stream
	InternalSetTo(&transaction, 0LL);
	return (bplustree_header*)fNode;
}
201 
202 
/*!	Maps the node at the given tree \a offset into memory - read-only if
	\a transaction is NULL, writable otherwise. On success, fNode points
	into the cached block and fBlockNumber/fOffset/fWritable are updated;
	on failure fNode stays NULL.
	Note: this relies on the node fitting entirely within one block (see
	the comment above this class - node size <= block size).
*/
bplustree_node*
CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume* volume = fTree->fStream->GetVolume();

		// translate the stream offset into the disk block that holds it
		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8* block;

		if (transaction != NULL) {
			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block) {
			// The node is somewhere in that block...
			// (confusing offset calculation)
			fNode = (bplustree_node*)(block + offset
				- (fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}
238 
239 
/*!	Puts the node at \a offset onto the tree's free node list so it can be
	recycled by a later Allocate(). The node itself must currently be
	cached (and writable) in this CachedNode.
*/
status_t
CachedNode::Free(Transaction& transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// TODO: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done every time that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	bplustree_header* header
		= fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

#if 0
	// TODO: temporarily disabled because CheckNode() doesn't like this...
	// 		Also, it's such an edge case that it's almost useless, anyway.
	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}
#endif

	// add the node to the free nodes list: the node's left link points to
	// the old list head, and the overflow link marks it as a free node
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}
279 
280 
/*!	Provides a fresh node for the tree: either recycles one from the free
	node list, or grows the tree's stream by one node. On success, \a _node
	points to the writable, initialized node and \a _offset contains its
	position within the tree.
	The caller must have locked the tree's stream (see the comment block at
	the top of this section).
*/
status_t
CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
	off_t* _offset)
{
	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header* header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer (the free list is linked through the
		// nodes' left links)
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode* stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	if (SetToWritable(transaction, offset, false) != NULL) {
		fNode->Initialize();

		*_offset = offset;
		*_node = fNode;
		return B_OK;
	}

	// revert header size to old value
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		- fTree->fNodeSize);
	RETURN_ERROR(B_ERROR);
}
336 
337 
338 //	#pragma mark -
339 
340 
341 BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
342 	:
343 	fStream(NULL),
344 	fHeader(NULL),
345 	fCachedHeader(this)
346 {
347 	mutex_init(&fIteratorLock, "bfs b+tree iterator");
348 	SetTo(transaction, stream);
349 }
350 
351 
/*!	Attaches this object to an existing on-disk B+tree in \a stream
	(see SetTo(Inode*)); check the result with InitCheck() afterwards.
*/
BPlusTree::BPlusTree(Inode* stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(stream);
}
361 
362 
/*!	Creates an uninitialized tree object; one of the SetTo() methods must
	be called before it can be used - until then InitCheck() reports
	B_NO_INIT.
*/
BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
}
374 
375 
/*!	Stops all iterators that are still registered with this tree before
	tearing it down.
*/
BPlusTree::~BPlusTree()
{
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	mutex_lock(&fIteratorLock);

	SinglyLinkedList<TreeIterator>::Iterator iterator
		= fIterators.GetIterator();
	while (iterator.HasNext())
		iterator.Next()->Stop();

	mutex_destroy(&fIteratorLock);
}
390 
391 
/*!	Creates a new B+tree on the specified \a stream: writes a fresh header
	and an empty root node within \a transaction. If the stream is too
	small it is grown to hold header + root node first.
	Sets fStatus and returns it (B_OK on success).
*/
status_t
BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	bplustree_header* header = fCachedHeader.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = fCachedHeader.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fHeader = header;
	// duplicates are allowed for index streams (unless the stream's block
	// run equals its parent's) and whenever S_ALLOW_DUPS is set
	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
			&& stream->BlockRun() != stream->Parent())
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	// the root node directly follows the header
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	// initialize b+tree root node
	CachedNode cached(this);
	cached.SetToWritable(transaction, header->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}
439 
440 
/*!	Attaches this object to an existing on-disk B+tree in \a stream and
	validates its header (magic, alignment, links, and key type against
	the stream's index mode). Sets fStatus and returns it: B_OK on
	success, B_NO_INIT/B_BAD_DATA/B_BAD_TYPE on failure.
*/
status_t
BPlusTree::SetTo(Inode* stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	// get on-disk B+Tree header

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL)
		RETURN_ERROR(fStatus = B_NO_INIT);

	// is header valid?

	if (fHeader->MaximumSize() != stream->Size()) {
		dprintf("B+tree header size %Ld doesn't fit file size %Ld!\n",
			fHeader->MaximumSize(), stream->Size());
		// we can't change the header since we don't have a transaction
		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader->Magic() != BPLUSTREE_MAGIC
		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
		|| !fHeader->IsValidLink(fHeader->RootNode())
		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(fHeader);
		dump_block((const char*)fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader->NodeSize();

	// check that the tree's key type matches the index type bits of the
	// stream's mode
	{
		// maps BPLUSTREE_*_TYPE (used as index below) to the S_*_INDEX bit
		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
			S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
			S_DOUBLE_INDEX};
		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
			| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
			| S_FLOAT_INDEX | S_DOUBLE_INDEX);

		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
			|| ((stream->Mode() & S_INDEX_DIR) != 0
				&& toMode[fHeader->DataType()] != mode)
			|| !stream->IsContainer()) {
			D(	dump_bplustree_header(fHeader);
				dump_inode(&stream->Node());
			);
			RETURN_ERROR(fStatus = B_BAD_TYPE);
		}

		// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
		// in the original BFS code - we will honour it nevertheless
		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
			&& stream->BlockRun() != stream->Parent())
			|| (stream->Mode() & S_ALLOW_DUPS) != 0;
	}

	// a valid tree must at least have a readable root node
	CachedNode cached(this, fHeader->RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}
505 
506 
/*!	Returns the status of the last SetTo() call - B_NO_INIT if the tree
	was never initialized (see the default constructor).
*/
status_t
BPlusTree::InitCheck()
{
	return fStatus;
}
512 
513 
514 int32
515 BPlusTree::TypeCodeToKeyType(type_code code)
516 {
517 	switch (code) {
518 		case B_STRING_TYPE:
519 			return BPLUSTREE_STRING_TYPE;
520 		case B_SSIZE_T_TYPE:
521 		case B_INT32_TYPE:
522 			return BPLUSTREE_INT32_TYPE;
523 		case B_SIZE_T_TYPE:
524 		case B_UINT32_TYPE:
525 			return BPLUSTREE_UINT32_TYPE;
526 		case B_OFF_T_TYPE:
527 		case B_INT64_TYPE:
528 			return BPLUSTREE_INT64_TYPE;
529 		case B_UINT64_TYPE:
530 			return BPLUSTREE_UINT64_TYPE;
531 		case B_FLOAT_TYPE:
532 			return BPLUSTREE_FLOAT_TYPE;
533 		case B_DOUBLE_TYPE:
534 			return BPLUSTREE_DOUBLE_TYPE;
535 	}
536 	return -1;
537 }
538 
539 
540 int32
541 BPlusTree::ModeToKeyType(mode_t mode)
542 {
543 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
544 			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
545 		case S_INT_INDEX:
546 			return BPLUSTREE_INT32_TYPE;
547 		case S_UINT_INDEX:
548 			return BPLUSTREE_UINT32_TYPE;
549 		case S_LONG_LONG_INDEX:
550 			return BPLUSTREE_INT64_TYPE;
551 		case S_ULONG_LONG_INDEX:
552 			return BPLUSTREE_UINT64_TYPE;
553 		case S_FLOAT_INDEX:
554 			return BPLUSTREE_FLOAT_TYPE;
555 		case S_DOUBLE_INDEX:
556 			return BPLUSTREE_DOUBLE_TYPE;
557 		case S_STR_INDEX:
558 		default:
559 			// default is for standard directories
560 			return BPLUSTREE_STRING_TYPE;
561 	}
562 }
563 
564 
565 //	#pragma mark -
566 
567 
568 void
569 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
570 	uint16 splitAt, int8 change)
571 {
572 	// Although every iterator which is affected by this update currently
573 	// waits on a semaphore, other iterators could be added/removed at
574 	// any time, so we need to protect this loop
575 	MutexLocker _(fIteratorLock);
576 
577 	SinglyLinkedList<TreeIterator>::Iterator iterator
578 		= fIterators.GetIterator();
579 	while (iterator.HasNext())
580 		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
581 }
582 
583 
/*!	Registers \a iterator with this tree so it gets position updates and is
	stopped when the tree goes away.
*/
void
BPlusTree::_AddIterator(TreeIterator* iterator)
{
	MutexLocker _(fIteratorLock);
	fIterators.Add(iterator);
}
590 
591 
/*!	Unregisters \a iterator from this tree (counterpart to _AddIterator()). */
void
BPlusTree::_RemoveIterator(TreeIterator* iterator)
{
	MutexLocker _(fIteratorLock);
	fIterators.Remove(iterator);
}
598 
599 
600 int32
601 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
602 	int keyLength2)
603 {
604 	type_code type = 0;
605 	switch (fHeader->data_type) {
606 	    case BPLUSTREE_STRING_TYPE:
607 	    	type = B_STRING_TYPE;
608 	    	break;
609 		case BPLUSTREE_INT32_TYPE:
610 	    	type = B_INT32_TYPE;
611 	    	break;
612 		case BPLUSTREE_UINT32_TYPE:
613 	    	type = B_UINT32_TYPE;
614 	    	break;
615 		case BPLUSTREE_INT64_TYPE:
616 	    	type = B_INT64_TYPE;
617 	    	break;
618 		case BPLUSTREE_UINT64_TYPE:
619 	    	type = B_UINT64_TYPE;
620 	    	break;
621 		case BPLUSTREE_FLOAT_TYPE:
622 	    	type = B_FLOAT_TYPE;
623 	    	break;
624 		case BPLUSTREE_DOUBLE_TYPE:
625 	    	type = B_DOUBLE_TYPE;
626 	    	break;
627 	}
628    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
629 }
630 
631 
/*!	Binary-searches \a key in the given \a node. On an exact match, B_OK is
	returned, \a _index receives the key's position, and \a _next its
	associated value. On B_ENTRY_NOT_FOUND, \a _index receives the position
	at which the key would have to be inserted, and \a _next the link to
	follow when descending the tree (the overflow link past the last key).
*/
status_t
BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
	uint16 keyLength, uint16* _index, off_t* _next)
{
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "find");
#endif

	// an empty node can only point us onwards via its overflow link
	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t* values = node->Values();
	int16 saveIndex = -1;

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8* searchKey = node->KeyAt(i, &searchLength);
		// guard against corrupted on-disk data before dereferencing the key
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
				> (uint8*)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	// not found - saveIndex is the insertion position; past the last key
	// the overflow link is the one to follow
	if (_index)
		*_index = saveIndex;
	if (_next) {
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}
689 
690 
/*!	Prepares the stack to contain all nodes that were passed while
	following the key, from the root node to the leaf node that could
	or should contain that key.
	Returns B_ERROR if a node could not be read or the tree structure
	looks cyclic/corrupted.
*/
status_t
BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
	uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader->RootNode();

	CachedNode cached(this);
	const bplustree_node* node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
			&nextOffset);

		// a node pointing back to itself would make us loop forever
		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		// the path cannot be deeper than the maximum number of levels
		// recorded in the header - bail out on (likely) cyclic links
		if ((uint32)stack.CountItems() > fHeader->MaxNumberOfLevels()) {
			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
			break;
		}

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::_SeekDown() could not open node %Ld\n",
		nodeAndKey.nodeOffset));
	return B_ERROR;
}
737 
738 
/*!	This will find a free duplicate fragment in the given bplustree_node.
	The CachedNode will be set to the writable fragment on success, with
	\a _offset and \a _index identifying the free slot within it.
	Returns B_ENTRY_NOT_FOUND if no key of \a node references a fragment
	node with a free slot.
*/
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached,
	off_t* _offset, bplustree_node** _fragment, uint32* _index)
{
	off_t* values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		const bplustree_node* fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %Ld\n", value));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array* array = fragment->FragmentAt(j);

			if (array->count == 0) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}
781 
782 
/*!	Inserts \a value for a key that already exists at \a index in \a node.
	Duplicate values are kept either in a shared fragment node (a few
	values per slot) or in a chain of dedicated duplicate nodes; this
	method also converts between the two representations as the number of
	values grows.
*/
status_t
BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
	const bplustree_node* node, uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue)
				== BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node* duplicate
				= cachedDuplicate.SetToWritable(transaction,
				bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array* array = duplicate->FragmentAt(
				bplustree_node::FragmentIndex(oldValue));
			// a fragment slot in use must hold 1..NUM_FRAGMENT_VALUES values
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%d] size in fragment "
					"%Ld == %Ld!\n",
					(int)bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue), array->count));
				return B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				array->Insert(value);
			} else {
				// Test if the fragment will be empty if we remove this key's
				// values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// The node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					memmove(duplicate->DuplicateArray(), array,
						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link
						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// Create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// The old duplicate has not been touched, so we can
						// reuse it

					bplustree_node* newDuplicate;
					status = cachedDuplicate.Allocate(transaction,
						&newDuplicate, &offset);
					if (status < B_OK)
						RETURN_ERROR(status);

					// Copy the array from the fragment node to the duplicate
					// node and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
						array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// Update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index]
					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
						BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key
		duplicate_array* array;
		const bplustree_node* duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld "
					"== %Ld!\n", duplicateOffset, array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node* writableDuplicate
			= cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (array->count < NUM_DUPLICATE_VALUES) {
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node* newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
				&offset);
			if (status < B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node* fragment;
	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		if ((status = cachedDuplicate.Allocate(transaction, &fragment,
				&offset)) < B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	// the fragment slot now holds both the original value and the new one
	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}
946 
947 
/*!	Inserts the given \a key with its \a value at position \a index into
	\a node, shifting the existing keys, key-length index, and values to
	make room. The caller must have made sure that the key actually fits
	into the node.
*/
void
BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
	uint16 keyLength, off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t* values = node->Values();
	uint16* keyLengths = node->KeyLengths();
	uint8* keys = node->Keys();

	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		+ keyLength);

	// the arrays have to be re-evaluated, as their location depends on the
	// key count/length that was just updated
	off_t* newValues = node->Values();
	uint16* newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index,
		sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index (the lengths are cumulative, so all
	// entries behind the new key grow by keyLength)
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	// the new entry ends at the previous entry's end plus keyLength
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0
			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}
994 
995 
996 /*!	Splits the \a node into two halves - the other half will be put into
997 	\a other. It also takes care to create a new overflow link if the node
998 	to split is an index node.
999 */
status_t
BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
	uint16* _keyLength, off_t* _value)
{
	// The insert position may be at most one past the last key
	if (*_keyIndex > node->NumKeys() + 1)
		return B_BAD_VALUE;

	// Raw views into the node to be split ("in") and the new node that
	// will receive its first half ("out"). Note: the key length array
	// stores cumulative end offsets, as the bytesBefore computation
	// below relies on.
	uint16* inKeyLengths = node->KeyLengths();
	off_t* inKeyValues = node->Values();
	uint8* inKeys = node->Keys();
	uint8* outKeys = other->Keys();
	int32 keyIndex = *_keyIndex;	// can become less than zero!

	if (keyIndex > node->NumKeys()) {
		FATAL(("key index out of bounds: %d, num keys: %u\n", (int)keyIndex,
			node->NumKeys()));
		return B_BAD_VALUE;
	}

	// how many keys will fit in one (half) page?
	// that loop will find the answer to this question and
	// change the key lengths indices for their new home

	// "bytes" is the number of bytes written for the new key,
	// "bytesBefore" are the bytes before that key
	// "bytesAfter" are the bytes after the new key, if any
	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;

	size_t size = fNodeSize >> 1;
	int32 out, in;
	for (in = out = 0; in < node->NumKeys() + 1;) {
		if (!bytes) {
			// the new key has not been counted yet; track the key data
			// preceding the current position
			bytesBefore = in > 0
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
		}

		if (in == keyIndex && !bytes) {
			// this is the slot for the new key - account for its length
			bytes = *_keyLength;
		} else {
			if (keyIndex < out) {
				// the new key is already accounted for; everything up to
				// the current key counts as "after" it
				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
					- bytesBefore;
			}

			in++;
		}
		out++;

		// stop as soon as the accumulated keys (plus per-key length and
		// value entries) would exceed half a node
		if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
			// we have found the number of keys in the new node!
			break;
		}
	}

	// if the new key was not inserted, set the length of the keys
	// that can be copied directly
	if (keyIndex >= out && in > 0)
		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);

	if (bytesBefore < 0 || bytesAfter < 0)
		return B_BAD_DATA;

	// the new node is chained in to the left of the split node
	other->left_link = node->left_link;
	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	uint16* outKeyLengths = other->KeyLengths();
	off_t* outKeyValues = other->Values();
	int32 keys = out > keyIndex ? keyIndex : out;

	if (bytesBefore) {
		// copy the keys
		memcpy(outKeys, inKeys, bytesBefore);
		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
	}
	if (bytes) {
		// copy the newly inserted key
		memcpy(outKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);

		if (bytesAfter) {
			// copy the keys after the new key
			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
				bytesAfter);
			keys = out - keyIndex - 1;
			for (int32 i = 0;i < keys;i++) {
				// shift the cumulative end offsets by the inserted key's
				// length
				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
						+ bytes);
			}
			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
				keys * sizeof(off_t));
		}
	}

	// if the new key was already inserted, we shouldn't use it again
	if (in != out)
		keyIndex--;

	// "total" is the amount of key data that left this node
	int32 total = bytesBefore + bytesAfter;

	// these variables are for the key that will be returned
	// to the parent node
	uint8* newKey = NULL;
	uint16 newLength;
	bool newAllocated = false;

	// If we have split an index node, we have to drop the first key
	// of the next node (which can also be the new key to insert).
	// The dropped key is also the one which has to be inserted in
	// the parent node, so we will set the "newKey" already here.
	if (node->OverflowLink() != BPLUSTREE_NULL) {
		if (in == keyIndex) {
			newKey = key;
			newLength = *_keyLength;

			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
			keyIndex--;
		} else {
			// If a key is dropped (is not the new key), we have to copy
			// it, because it would be lost if not.
			uint8* droppedKey = node->KeyAt(in, &newLength);
			// sanity check the key's location against the node boundaries
			// before trusting it (the node comes from disk)
			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
					> (uint8*)node + fNodeSize
				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
				fStream->GetVolume()->Panic();
				RETURN_ERROR(B_BAD_DATA);
			}
			newKey = (uint8*)malloc(newLength);
			if (newKey == NULL)
				return B_NO_MEMORY;

			newAllocated = true;
			memcpy(newKey, droppedKey, newLength);

			other->overflow_link = inKeyValues[in];
			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
		}
	}

	// and now the same game for the other page and the rest of the keys
	// (but with memmove() instead of memcpy(), because they may overlap)

	bytesBefore = bytesAfter = bytes = 0;
	out = 0;
	int32 skip = in;
	while (in < node->NumKeys() + 1) {
		if (in == keyIndex && !bytes) {
			// it's enough to set bytesBefore once here, because we do
			// not need to know the exact length of all keys in this
			// loop
			bytesBefore = in > skip
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
			bytes = *_keyLength;
			out++;
		} else {
			if (in < node->NumKeys()) {
				// rebase the cumulative end offset: "total" bytes have
				// moved to the other node (or were dropped)
				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);

				if (bytes) {
					// the new key was inserted before this one
					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);

					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
						- bytesBefore - bytes;
				}
				out++;
			}
			in++;
		}
	}

	// adjust the byte counts (since we were a bit lazy in the loop)
	if (keyIndex < skip)
		bytesBefore = node->AllKeyLength() - total;

	if (bytesBefore < 0 || bytesAfter < 0) {
		if (newAllocated)
			free(newKey);
		return B_BAD_DATA;
	}

	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
		// right link, and overflow link can stay the same
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	// array positions have changed
	outKeyLengths = node->KeyLengths();
	outKeyValues = node->Values();

	// move the keys in the old node: the order is important here,
	// because we don't want to overwrite any contents

	keys = keyIndex <= skip ? out : keyIndex - skip;
	keyIndex -= skip;
	in = out - keyIndex - 1;
		// Note: keyIndex and in will contain invalid values when the new key
		// went to the other node. But in this case bytes and bytesAfter are
		// 0 and subsequently we never use keyIndex and in.

	if (bytesBefore)
		memmove(inKeys, inKeys + total, bytesBefore);
	if (bytesAfter) {
		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
			bytesAfter);
	}

	if (bytesBefore)
		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
	if (bytesAfter) {
		// if byteAfter is > 0, keyIndex is larger than skip
		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
			in * sizeof(uint16));
	}

	if (bytesBefore)
		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
	if (bytesAfter) {
		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
			in * sizeof(off_t));
	}

	if (bytes) {
		// finally, copy the newly inserted key (don't overwrite anything)
		memcpy(inKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
	}

	// Prepare the key that will be inserted in the parent node which
	// is either the dropped key or the last of the other node.
	// If it's the dropped key, "newKey" was already set earlier.

	if (newKey == NULL)
		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);

	// hand the separator key, its length, and the new node's offset
	// back to the caller via the in/out parameters
	memcpy(key, newKey, newLength);
	*_keyLength = newLength;
	*_value = otherOffset;

	if (newAllocated)
		free(newKey);

	return B_OK;
}
1254 
1255 
1256 /*!	This inserts a key into the tree. The changes made to the tree will
1257 	all be part of the \a transaction.
1258 	You need to have the inode write locked.
1259 */
status_t
BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %Ld!\n", value);
#endif

	ASSERT_WRITE_LOCKED_INODE(fStream);

	// record the path from the root down to the target leaf
	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	// Mutable copy of the key: when a node is split, _SplitNode()
	// overwrites this buffer (and keyLength/value) with the separator
	// key that has to be inserted into the parent level.
	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node* node;

	// walk the recorded path back up, splitting nodes as needed
	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = _InsertDuplicate(transaction, cached, node,
						nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				RETURN_ERROR(B_NAME_IN_USE);
			}
		}

		bplustree_node* writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
		if (int32(key_align(sizeof(bplustree_node)
				+ writableNode->AllKeyLength() + keyLength)
				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
				+ sizeof(off_t))) < fNodeSize) {
			// it fits - insert here and we're done
			_InsertKey(writableNode, nodeAndKey.keyIndex,
				keyBuffer, keyLength, value);
			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
				nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_node* root;
				status_t status = cachedNewRoot.Allocate(transaction, &root,
					&newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuild next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node* other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other,
				&otherOffset);
			if (status < B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			// split; on success keyBuffer/keyLength/value now describe
			// the separator key for the parent level
			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
					&value) < B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction,
					other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node* root = cachedNewRoot.Node();

				_InsertKey(root, 0, keyBuffer, keyLength,
					writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
					nodeAndKey.nodeOffset);

				bplustree_header* header
					= fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
					header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
			// otherwise: loop continues with the parent node popped off
			// the stack, inserting the separator key there
		}
	}
	RETURN_ERROR(B_ERROR);
}
1406 
1407 
1408 /*!	Removes the duplicate index/value pair from the tree.
1409 	It's part of the private tree interface.
1410 */
status_t
BPlusTree::_RemoveDuplicate(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached, uint16 index,
	off_t value)
{
	// the value at "index" is a duplicate link (fragment or node)
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);

	CachedNode cachedDuplicate(this);
	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
		duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array* array = duplicate->FragmentAt(
			bplustree_node::FragmentIndex(oldValue));

		// sanity check the on-disk count before using it
		if (array->count > NUM_FRAGMENT_VALUES
			|| array->count < 1) {
			FATAL(("removeDuplicate: Invalid array[%d] size in fragment %Ld "
				"== %Ld!\n", (int)bplustree_node::FragmentIndex(oldValue),
				duplicateOffset, array->count));
			return B_BAD_DATA;
		}
		if (!array->Remove(value)) {
			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
				value, duplicateOffset));
			return B_ENTRY_NOT_FOUND;
		}

		// remove the array from the fragment node if it is empty
		if (array->count == 1) {
			// set the link to the remaining value
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free just the array
			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
				status_t status = cachedDuplicate.Free(transaction,
					duplicateOffset);
				if (status < B_OK)
					return status;
			} else
				array->count = 0;
		}
		return B_OK;
	}

	// Remove value from a duplicate node!

	duplicate_array* array = NULL;

	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %Ld!\n",
			duplicate->LeftLink()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		if (array->count > NUM_DUPLICATE_VALUES
			|| array->count < 0) {
			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == "
				"%Ld!\n", duplicateOffset, array->count));
			return B_BAD_DATA;
		}

		if (array->Remove(value))
			break;

		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);

		// this node stays untouched - don't keep it dirty in the cache
		cachedDuplicate.UnsetUnchanged(transaction);
		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
			false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	// The entry got removed from the duplicate node, but we might want to free
	// it now in case it's empty

	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

		if ((isLast && array->count == 1) || array->count == 0) {
			// Free empty duplicate page, link their siblings together, and
			// update the duplicate link if needed (ie. when we either remove
			// the last duplicate node or have a new first one)

			if (left == BPLUSTREE_NULL) {
				// the duplicate link points to us
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				if (array->count == 1) {
					// This is the last node, and there is only one value left;
					// replace the duplicate link with that value, it's no
					// duplicate anymore
					values[index] = array->values[0];
				} else {
					// Move the duplicate link to the next node
					values[index] = HOST_ENDIAN_TO_BFS_INT64(
						bplustree_node::MakeLink(
							BPLUSTREE_DUPLICATE_NODE, right));
				}
			}

			status_t status = cachedDuplicate.Free(transaction,
				duplicateOffset);
			if (status < B_OK)
				return status;

			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
						false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (right == BPLUSTREE_NULL
					&& duplicate->LeftLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}
			}
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
						right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a
				// normal entry
				array = duplicate->DuplicateArray();
				if (left == BPLUSTREE_NULL
					&& duplicate->RightLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
					continue;
				}
			}
			return B_OK;
		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);

			bplustree_node* fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
					&offset, &fragment, &fragmentIndex) == B_OK) {
				// move to other node
				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			} else {
				// convert node
				memmove(duplicate, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			}

			// update the leaf entry to point to the fragment
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
		}
		return B_OK;
	}
}
1600 
1601 
1602 /*!	Removes the key with the given index from the specified node.
1603 	Since it has to get the key from the node anyway (to obtain it's
1604 	pointer), it's not needed to pass the key & its length, although
1605 	the calling method (BPlusTree::Remove()) have this data.
1606 */
1607 void
1608 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
1609 {
1610 	// should never happen, but who knows?
1611 	if (index > node->NumKeys() && node->NumKeys() > 0) {
1612 		FATAL(("Asked me to remove key outer limits: %u\n", index));
1613 		return;
1614 	}
1615 
1616 	off_t* values = node->Values();
1617 
1618 	// if we would have to drop the overflow link, drop
1619 	// the last key instead and update the overflow link
1620 	// to the value of that one
1621 	if (!node->IsLeaf() && index == node->NumKeys())
1622 		node->overflow_link = values[--index];
1623 
1624 	uint16 length;
1625 	uint8* key = node->KeyAt(index, &length);
1626 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
1627 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1628 		FATAL(("Key length to long: %s, %u (inode at %d,%u)\n", key, length,
1629 			(int)fStream->BlockRun().allocation_group,
1630 			fStream->BlockRun().start));
1631 		fStream->GetVolume()->Panic();
1632 		return;
1633 	}
1634 
1635 	uint16* keyLengths = node->KeyLengths();
1636 	uint8* keys = node->Keys();
1637 
1638 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
1639 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
1640 		node->AllKeyLength() - length);
1641 
1642 	off_t* newValues = node->Values();
1643 	uint16* newKeyLengths = node->KeyLengths();
1644 
1645 	// move key data
1646 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
1647 
1648 	// move and update key lengths
1649 	if (index > 0 && newKeyLengths != keyLengths)
1650 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
1651 	for (uint16 i = index; i < node->NumKeys(); i++) {
1652 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1653 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
1654 	}
1655 
1656 	// move values
1657 	if (index > 0)
1658 		memmove(newValues, values, index * sizeof(off_t));
1659 	if (node->NumKeys() > index) {
1660 		memmove(newValues + index, values + index + 1,
1661 			(node->NumKeys() - index) * sizeof(off_t));
1662 	}
1663 }
1664 
1665 
1666 /*!	Removes the specified key from the tree. The "value" parameter is only used
1667 	for trees which allow duplicates, so you may safely ignore it.
1668 	It's not an optional parameter, so at least you have to think about it.
1669 	You need to have the inode write locked.
1670 */
1671 status_t
1672 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
1673 	off_t value)
1674 {
1675 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1676 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1677 		RETURN_ERROR(B_BAD_VALUE);
1678 
1679 	ASSERT_WRITE_LOCKED_INODE(fStream);
1680 
1681 	Stack<node_and_key> stack;
1682 	if (_SeekDown(stack, key, keyLength) != B_OK)
1683 		RETURN_ERROR(B_ERROR);
1684 
1685 	node_and_key nodeAndKey;
1686 	const bplustree_node* node;
1687 
1688 	CachedNode cached(this);
1689 	while (stack.Pop(&nodeAndKey)
1690 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1691 #ifdef DEBUG
1692 		NodeChecker checker(node, fNodeSize, "remove");
1693 #endif
1694 		if (node->IsLeaf()) {
1695 			// first round, check for duplicate entries
1696 			status_t status = _FindKey(node, key, keyLength,
1697 				&nodeAndKey.keyIndex);
1698 			if (status < B_OK)
1699 				RETURN_ERROR(status);
1700 
1701 			// If we will remove the last key, the iterator will be set
1702 			// to the next node after the current - if there aren't any
1703 			// more nodes, we need a way to prevent the TreeIterators to
1704 			// touch the old node again, we use BPLUSTREE_FREE for this
1705 			off_t next = node->RightLink() == BPLUSTREE_NULL
1706 				? BPLUSTREE_FREE : node->RightLink();
1707 			_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
1708 				? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
1709 
1710 			// is this a duplicate entry?
1711 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
1712 					node->Values()[nodeAndKey.keyIndex]))) {
1713 				if (fAllowDuplicates) {
1714 					return _RemoveDuplicate(transaction, node, cached,
1715 						nodeAndKey.keyIndex, value);
1716 				} else {
1717 					FATAL(("dupliate node found where no duplicates are "
1718 						"allowed!\n"));
1719 					RETURN_ERROR(B_ERROR);
1720 				}
1721 			}
1722 		}
1723 
1724 		bplustree_node* writableNode = cached.MakeWritable(transaction);
1725 		if (writableNode == NULL)
1726 			return B_IO_ERROR;
1727 
1728 		// if it's an empty root node, we have to convert it
1729 		// to a leaf node by dropping the overflow link, or,
1730 		// if it's already a leaf node, just empty it
1731 		if (nodeAndKey.nodeOffset == fHeader->RootNode()
1732 			&& (node->NumKeys() == 0
1733 				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
1734 			writableNode->overflow_link
1735 				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1736 			writableNode->all_key_count = 0;
1737 			writableNode->all_key_length = 0;
1738 
1739 			// if we've made a leaf node out of the root node, we need
1740 			// to reset the maximum number of levels in the header
1741 			if (fHeader->MaxNumberOfLevels() != 1) {
1742 				bplustree_header* header
1743 					= fCachedHeader.MakeWritableHeader(transaction);
1744 				if (header == NULL)
1745 					return B_IO_ERROR;
1746 
1747 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
1748 			}
1749 			return B_OK;
1750 		}
1751 
1752 		// if there is only one key left, we don't have to remove
1753 		// it, we can just dump the node (index nodes still have
1754 		// the overflow link, so we have to drop the last key)
1755 		if (writableNode->NumKeys() > 1
1756 			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
1757 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
1758 			return B_OK;
1759 		}
1760 
1761 		// when we are here, we can just free the node, but
1762 		// we have to update the right/left link of the
1763 		// siblings first
1764 		CachedNode otherCached(this);
1765 		bplustree_node* other = otherCached.SetToWritable(transaction,
1766 			writableNode->LeftLink());
1767 		if (other != NULL)
1768 			other->right_link = writableNode->right_link;
1769 
1770 		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
1771 				!= NULL) {
1772 			other->left_link = writableNode->left_link;
1773 		}
1774 
1775 		cached.Free(transaction, nodeAndKey.nodeOffset);
1776 	}
1777 	RETURN_ERROR(B_ERROR);
1778 }
1779 
1780 
1781 /*!	Replaces the value for the key in the tree.
1782 	Returns B_OK if the key could be found and its value replaced,
1783 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
1784 	to indicate that something went terribly wrong.
1785 	Note that this doesn't work with duplicates - it will just
1786 	return B_BAD_TYPE if you call this function on a tree where
1787 	duplicates are allowed.
1788 	You need to have the inode write locked.
1789 */
1790 status_t
1791 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
1792 	off_t value)
1793 {
1794 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1795 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1796 		|| key == NULL)
1797 		RETURN_ERROR(B_BAD_VALUE);
1798 
1799 	if (fAllowDuplicates)
1800 		RETURN_ERROR(B_BAD_TYPE);
1801 
1802 	ASSERT_WRITE_LOCKED_INODE(fStream);
1803 
1804 	off_t nodeOffset = fHeader->RootNode();
1805 	CachedNode cached(this);
1806 	const bplustree_node* node;
1807 
1808 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1809 		uint16 keyIndex = 0;
1810 		off_t nextOffset;
1811 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
1812 			&nextOffset);
1813 
1814 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1815 			if (status == B_OK) {
1816 				bplustree_node* writableNode = cached.MakeWritable(transaction);
1817 				if (writableNode != NULL) {
1818 					writableNode->Values()[keyIndex]
1819 						= HOST_ENDIAN_TO_BFS_INT64(value);
1820 				} else
1821 					status = B_IO_ERROR;
1822 			}
1823 
1824 			return status;
1825 		} else if (nextOffset == nodeOffset)
1826 			RETURN_ERROR(B_ERROR);
1827 
1828 		nodeOffset = nextOffset;
1829 	}
1830 	RETURN_ERROR(B_ERROR);
1831 }
1832 
1833 
1834 /*!	Searches the key in the tree, and stores the offset found in _value,
1835 	if successful.
1836 	It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
1837 	while it descends the tree.
1838 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
1839 	It can also return other errors to indicate that something went wrong.
1840 	Note that this doesn't work with duplicates - it will just return
1841 	B_BAD_TYPE if you call this function on a tree where duplicates are
1842 	allowed.
1843 	You need to have the inode read or write locked.
1844 */
1845 status_t
1846 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
1847 {
1848 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1849 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1850 		|| key == NULL)
1851 		RETURN_ERROR(B_BAD_VALUE);
1852 
1853 	if (fAllowDuplicates)
1854 		RETURN_ERROR(B_BAD_TYPE);
1855 
1856 	ASSERT_READ_LOCKED_INODE(fStream);
1857 
1858 	off_t nodeOffset = fHeader->RootNode();
1859 	CachedNode cached(this);
1860 	const bplustree_node* node;
1861 
1862 #ifdef DEBUG
1863 	int32 levels = 0;
1864 #endif
1865 
1866 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1867 		uint16 keyIndex = 0;
1868 		off_t nextOffset;
1869 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
1870 			&nextOffset);
1871 
1872 #ifdef DEBUG
1873 		levels++;
1874 #endif
1875 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1876 			if (status == B_OK && _value != NULL)
1877 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
1878 
1879 #ifdef DEBUG
1880 			if (levels != (int32)fHeader->MaxNumberOfLevels())
1881 				DEBUGGER(("levels don't match"));
1882 #endif
1883 			return status;
1884 		} else if (nextOffset == nodeOffset)
1885 			RETURN_ERROR(B_ERROR);
1886 
1887 		nodeOffset = nextOffset;
1888 	}
1889 	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
1890 	RETURN_ERROR(B_ERROR);
1891 }
1892 
1893 
1894 //	#pragma mark -
1895 
1896 
1897 TreeIterator::TreeIterator(BPlusTree* tree)
1898 	:
1899 	fTree(tree),
1900 	fCurrentNodeOffset(BPLUSTREE_NULL)
1901 {
1902 	tree->_AddIterator(this);
1903 }
1904 
1905 
1906 TreeIterator::~TreeIterator()
1907 {
1908 	if (fTree)
1909 		fTree->_RemoveIterator(this);
1910 }
1911 
1912 
status_t
TreeIterator::Goto(int8 to)
{
	if (fTree == NULL || fTree->fHeader == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	// descend from the root to the first/last leaf node
	off_t nodeOffset = fTree->fHeader->RootNode();
	CachedNode cached(fTree);
	const bplustree_node* node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		// is the node a leaf node?
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// position just before the first key or just after the last,
			// so that the next Traverse() call lands on a real key
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
			fDuplicateNode = BPLUSTREE_NULL;

			return B_OK;
		}

		// get the next node offset depending on the direction (and if there
		// are any keys in that node at all)
		off_t nextOffset;
		if (to == BPLUSTREE_END || node->all_key_count == 0)
			nextOffset = node->OverflowLink();
		else {
			// guard against corrupted on-disk data before reading the
			// first value (the "8" presumably accounts for one off_t
			// value per key - TODO confirm)
			if (node->AllKeyLength() > fTree->fNodeSize
				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
					- 8 * node->NumKeys())
				RETURN_ERROR(B_ERROR);

			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
		}
		// a self-referencing node would loop forever - bail out
		if (nextOffset == nodeOffset)
			break;

		nodeOffset = nextOffset;
	}
	FATAL(("%s fails\n", __FUNCTION__));

	RETURN_ERROR(B_ERROR);
}
1958 
1959 
1960 /*!	Iterates through the tree in the specified direction.
1961 	When it iterates through duplicates, the "key" is only updated for the
1962 	first entry - if you need to know when this happens, use the "duplicate"
1963 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
1964 	the other duplicates.
1965 	That's not too nice, but saves the 256 bytes that would be needed to
1966 	store the last key - if this will ever become an issue, it will be
1967 	easy to change.
1968 	The other advantage of this is, that the queries can skip all duplicates
1969 	at once when they are not relevant to them.
1970 */
1971 status_t
1972 TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
1973 	uint16 maxLength, off_t* value, uint16* duplicate)
1974 {
1975 	if (fTree == NULL)
1976 		return B_INTERRUPTED;
1977 
1978 	bool forward = direction == BPLUSTREE_FORWARD;
1979 
1980 	if (fCurrentNodeOffset == BPLUSTREE_NULL
1981 		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
1982 		RETURN_ERROR(B_ERROR);
1983 
1984 	// if the tree was emptied since the last call
1985 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
1986 		return B_ENTRY_NOT_FOUND;
1987 
1988 	// lock access to stream
1989 	InodeReadLocker locker(fTree->fStream);
1990 
1991 	CachedNode cached(fTree);
1992 	const bplustree_node* node;
1993 
1994 	if (fDuplicateNode != BPLUSTREE_NULL) {
1995 		// regardless of traverse direction the duplicates are always presented
1996 		// in the same order; since they are all considered as equal, this
1997 		// shouldn't cause any problems
1998 
1999 		if (!fIsFragment || fDuplicate < fNumDuplicates) {
2000 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2001 				false);
2002 		} else
2003 			node = NULL;
2004 
2005 		if (node != NULL) {
2006 			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
2007 				// If the node is out of duplicates, we go directly to the next
2008 				// one
2009 				fDuplicateNode = node->RightLink();
2010 				if (fDuplicateNode != BPLUSTREE_NULL
2011 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
2012 					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
2013 						false);
2014 					fDuplicate = 0;
2015 				}
2016 			}
2017 			if (fDuplicate < fNumDuplicates) {
2018 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
2019 					fDuplicate++);
2020 				if (duplicate)
2021 					*duplicate = 2;
2022 				return B_OK;
2023 			}
2024 		}
2025 		fDuplicateNode = BPLUSTREE_NULL;
2026 	}
2027 
2028 	off_t savedNodeOffset = fCurrentNodeOffset;
2029 	int32 savedKey = fCurrentKey;
2030 
2031 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
2032 		RETURN_ERROR(B_ERROR);
2033 
2034 	if (duplicate)
2035 		*duplicate = 0;
2036 
2037 	fCurrentKey += direction;
2038 
2039 	// is the current key in the current node?
2040 	while ((forward && fCurrentKey >= node->NumKeys())
2041 			|| (!forward && fCurrentKey < 0)) {
2042 		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2043 
2044 		// are there any more nodes?
2045 		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2046 			node = cached.SetTo(fCurrentNodeOffset);
2047 			if (!node)
2048 				RETURN_ERROR(B_ERROR);
2049 
2050 			// reset current key
2051 			fCurrentKey = forward ? 0 : node->NumKeys();
2052 		} else {
2053 			// there are no nodes left, so turn back to the last key
2054 			fCurrentNodeOffset = savedNodeOffset;
2055 			fCurrentKey = savedKey;
2056 
2057 			return B_ENTRY_NOT_FOUND;
2058 		}
2059 	}
2060 
2061 	if (node->all_key_count == 0)
2062 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
2063 
2064 	uint16 length;
2065 	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
2066 	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
2067 			> (uint8*)node + fTree->fNodeSize
2068 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2069 		fTree->fStream->GetVolume()->Panic();
2070 		RETURN_ERROR(B_BAD_DATA);
2071 	}
2072 
2073 	length = min_c(length, maxLength);
2074 	memcpy(key, keyStart, length);
2075 
2076 	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE)	{
2077 		// terminate string type
2078 		if (length == maxLength)
2079 			length--;
2080 		((char*)key)[length] = '\0';
2081 	}
2082 	*keyLength = length;
2083 
2084 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2085 
2086 	// duplicate fragments?
2087 	uint8 type = bplustree_node::LinkType(offset);
2088 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
2089 		|| type == BPLUSTREE_DUPLICATE_NODE) {
2090 		fDuplicateNode = offset;
2091 
2092 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
2093 			false);
2094 		if (node == NULL)
2095 			RETURN_ERROR(B_ERROR);
2096 
2097 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2098 
2099 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2100 		if (fNumDuplicates) {
2101 			offset = node->DuplicateAt(offset, fIsFragment, 0);
2102 			fDuplicate = 1;
2103 			if (duplicate)
2104 				*duplicate = 1;
2105 		} else {
2106 			// Shouldn't happen, but we're dealing here with potentially
2107 			// corrupt disks...
2108 			fDuplicateNode = BPLUSTREE_NULL;
2109 			offset = 0;
2110 		}
2111 	}
2112 	*value = offset;
2113 
2114 	return B_OK;
2115 }
2116 
2117 
/*!	This is more or less a copy of BPlusTree::Find() - but it just
	sets the current position in the iterator, regardless of if the
	key could be found or not.
	The status of the key lookup itself is returned - presumably B_OK if
	the key exists; TODO(review): confirm against BPlusTree::_FindKey().
*/
status_t
TreeIterator::Find(const uint8* key, uint16 keyLength)
{
	if (fTree == NULL)
		return B_INTERRUPTED;
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	// descend from the root towards the leaf that would hold the key
	off_t nodeOffset = fTree->fHeader->RootNode();

	CachedNode cached(fTree);
	const bplustree_node* node;
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
			&nextOffset);

		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// we reached a leaf: position the iterator one key before
			// keyIndex, so that the next forward Traverse() will return
			// the key at keyIndex itself
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = keyIndex - 1;
			fDuplicateNode = BPLUSTREE_NULL;

			return status;
		} else if (nextOffset == nodeOffset)
			RETURN_ERROR(B_ERROR);

		nodeOffset = nextOffset;
	}
	RETURN_ERROR(B_ERROR);
}
2158 
2159 
/*!	Makes the iterator forget the duplicate (node) it is currently
	iterating through - the next Traverse() call then continues with the
	next real key instead of the remaining duplicates.
*/
void
TreeIterator::SkipDuplicates()
{
	fDuplicateNode = BPLUSTREE_NULL;
}
2165 
2166 
/*!	Adjusts the iterator's position after the tree has been changed, so
	that it keeps pointing to the same key as before.
	Presumably called by the BPlusTree write paths for every registered
	iterator whenever keys are inserted or removed (\a change is +1/-1 at
	\a keyIndex), and/or when the node at \a offset has been split at key
	\a splitAt with part of its keys now living at \a nextOffset -
	TODO(review): confirm against the BPlusTree insert/remove code, which
	is outside this view.
*/
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	// only the node this iterator currently points into is of interest
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// the node was split; follow to the new node - key indices at or
		// above the split point shift down accordingly
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// TODO: duplicate handling!
}
2190 
2191 
/*!	Invalidates the iterator - presumably called by the tree when it goes
	away (cf. _RemoveIterator()). Subsequent calls to Traverse() and
	Find() return B_INTERRUPTED, Goto() returns B_BAD_VALUE.
*/
void
TreeIterator::Stop()
{
	fTree = NULL;
}
2197 
2198 
#ifdef DEBUG
/*!	Prints the complete state of the iterator to the debug output
	(debug builds only).
*/
void
TreeIterator::Dump()
{
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n",
		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
		fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif
2215 
2216 
2217 //	#pragma mark -
2218 
2219 
/*!	Resets the node into a well-defined, empty state: it holds no keys,
	and all three of its links are detached (set to BPLUSTREE_NULL).
*/
void
bplustree_node::Initialize()
{
	left_link = right_link = overflow_link
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	// zero is identical in either endian, no conversion needed
	all_key_count = 0;
	all_key_length = 0;
}
2228 
2229 
2230 uint8*
2231 bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2232 {
2233 	if (index < 0 || index > NumKeys())
2234 		return NULL;
2235 
2236 	uint8* keyStart = Keys();
2237 	uint16* keyLengths = KeyLengths();
2238 
2239 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2240 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2241 	if (index > 0)
2242 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2243 
2244 	return keyStart;
2245 }
2246 
2247 
/*!	Returns the number of duplicate values stored at the given location:
	for a duplicate fragment the count is kept in the fragment's first
	off_t slot, for a full duplicate node it is kept in the overflow link.
	The lower 10 bits of \a offset presumably encode the fragment index
	within the node - TODO(review): confirm against the on-disk format
	definitions in BPlusTree.h.
*/
uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem

	if (isFragment) {
		// index of the fragment's first off_t slot: each fragment occupies
		// NUM_FRAGMENT_VALUES values plus one count entry
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		return ((off_t*)this)[fragment];
	}
	return OverflowLink();
}
2261 
2262 
/*!	Returns the \a index'th duplicate value at the given location.
	For a fragment, the values start right behind the fragment's count
	entry (the hard-coded 8 matches NUM_FRAGMENT_VALUES + 1 in
	CountDuplicates() - TODO(review): confirm); for a duplicate node,
	they start at the third off_t slot of the node.
*/
off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	uint32 start;
	if (isFragment)
		start = 8 * ((uint64)offset & 0x3ff);
	else
		start = 2;

	// +1 skips the count entry preceding the values
	return ((off_t*)this)[start + 1 + index];
}
2274 
2275 
2276 /*!	Although the name suggests it, this function doesn't return the real
2277 	used fragment count; at least, it can only count to two: it returns
2278 	0, if there is no fragment used, 1 if there is only one fragment
2279 	used, and 2 if there are at least 2 fragments used.
2280 */
2281 uint32
2282 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2283 {
2284 	uint32 used = 0;
2285 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2286 		duplicate_array* array = FragmentAt(i);
2287 		if (array->count > 0 && ++used > 1)
2288 			return used;
2289 	}
2290 	return used;
2291 }
2292 
2293 
#ifdef DEBUG
/*!	Debug builds only: validates the node's key count/length fields and
	every key and value, dropping into the debugger on the first
	inconsistency found.
	Returns B_BAD_DATA if the node is corrupted, B_OK otherwise.
*/
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
	// a node can never hold more keys (or key bytes) than fit into it
	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
		DEBUGGER(("invalid node: key/length count"));

	for (int32 i = 0; i < NumKeys(); i++) {
		uint16 length;
		uint8* key = KeyAt(i, &length);
		// every key (plus its length and value entries) must lie
		// completely within the node
		if (key + length + sizeof(off_t) + sizeof(uint16)
				> (uint8*)this + nodeSize
			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
			dprintf("node %p, key %ld\n", this, i);
			DEBUGGER(("invalid node: keys corrupted"));
			return B_BAD_DATA;
		}
		// -1 is never a valid value
		if (Values()[i] == -1) {
			dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]);
			DEBUGGER(("invalid node: values corrupted"));
			return B_BAD_DATA;
		}
	}
	return B_OK;
}
#endif
2320 
2321 
2322 //	#pragma mark -
2323 
2324 
2325 int32
2326 compareKeys(type_code type, const void* key1, int keyLength1,
2327 	const void* key2, int keyLength2)
2328 {
2329 	// if one of the keys is NULL, bail out gracefully
2330 	if (key1 == NULL || key2 == NULL) {
2331 		// even if it's most probably a bug in the calling code, we try to
2332 		// give a meaningful result
2333 		if (key1 == NULL && key2 != NULL)
2334 			return 1;
2335 		if (key2 == NULL && key1 != NULL)
2336 			return -1;
2337 
2338 		return 0;
2339 	}
2340 
2341 	switch (type) {
2342 	    case B_STRING_TYPE:
2343     	{
2344 			int result = memcmp(key1, key2, min_c(keyLength1, keyLength2));
2345 			if (result == 0) {
2346 				// ignore trailing null bytes
2347 				if ((keyLength1 == keyLength2 + 1
2348 						&& ((uint8*)key1)[keyLength2] == '\0')
2349 					|| (keyLength2 == keyLength1 + 1
2350 						&& ((uint8*)key2)[keyLength1] == '\0'))
2351 					return 0;
2352 
2353 				result = keyLength1 - keyLength2;
2354 			}
2355 
2356 			return result;
2357 		}
2358 
2359 		case B_SSIZE_T_TYPE:
2360 		case B_INT32_TYPE:
2361 			return *(int32*)key1 - *(int32*)key2;
2362 
2363 		case B_SIZE_T_TYPE:
2364 		case B_UINT32_TYPE:
2365 			if (*(uint32*)key1 == *(uint32*)key2)
2366 				return 0;
2367 			if (*(uint32*)key1 > *(uint32*)key2)
2368 				return 1;
2369 
2370 			return -1;
2371 
2372 		case B_OFF_T_TYPE:
2373 		case B_INT64_TYPE:
2374 			if (*(int64*)key1 == *(int64*)key2)
2375 				return 0;
2376 			if (*(int64*)key1 > *(int64*)key2)
2377 				return 1;
2378 
2379 			return -1;
2380 
2381 		case B_UINT64_TYPE:
2382 			if (*(uint64*)key1 == *(uint64*)key2)
2383 				return 0;
2384 			if (*(uint64*)key1 > *(uint64*)key2)
2385 				return 1;
2386 
2387 			return -1;
2388 
2389 		case B_FLOAT_TYPE:
2390 		{
2391 			float result = *(float*)key1 - *(float*)key2;
2392 			if (result == 0.0f)
2393 				return 0;
2394 
2395 			return (result < 0.0f) ? -1 : 1;
2396 		}
2397 
2398 		case B_DOUBLE_TYPE:
2399 		{
2400 			double result = *(double*)key1 - *(double*)key2;
2401 			if (result == 0.0)
2402 				return 0;
2403 
2404 			return (result < 0.0) ? -1 : 1;
2405 		}
2406 	}
2407 
2408 	// if the type is unknown, the entries don't match...
2409 	return -1;
2410 }
2411 
2412