xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 24159a0c7d6d6dcba9f2a0c1a7c08d2c8167f21b)
1 /* BPlusTree - BFS B+Tree implementation
2  *
3  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
4  * no code but achieves binary compatibility with the on disk format.
5  *
6  * Copyright 2001-2005, Axel Dörfler, axeld@pinc-software.de.
7  * This file may be used under the terms of the MIT License.
8  */
9 
10 
11 #include "Debug.h"
12 #include "BPlusTree.h"
13 #include "Inode.h"
14 #include "Utility.h"
15 
16 #include <util/kernel_cpp.h>
17 #include <util/Stack.h>
18 #include <TypeConstants.h>
19 
20 #include <string.h>
21 #include <stdlib.h>
22 #include <stdio.h>
23 
24 
25 #ifdef DEBUG
26 class NodeChecker {
27 	public:
28 		NodeChecker(const bplustree_node *node, int32 nodeSize, const char *text)
29 			:
30 			fNode(node),
31 			fSize(nodeSize),
32 			fText(text)
33 		{
34 			Check("integrity check failed on construction.");
35 		}
36 
37 		~NodeChecker()
38 		{
39 			Check("integrity check failed on destruction.");
40 		}
41 
42 		void
43 		Check(const char *message)
44 		{
45 			if (fNode->CheckIntegrity(fSize) < B_OK)
46 				dprintf("%s: %s\n", fText, message);
47 		}
48 
49 	private:
50 		const bplustree_node	*fNode;
51 		int32					fSize;
52 		const char				*fText;
53 };
54 #endif
55 
56 
57 // Node Caching for the BPlusTree class
58 //
59 // With write support, there is the need for a function that allocates new
60 // nodes by either returning empty nodes, or by growing the file's data stream
61 //
62 // !! The CachedNode class assumes that you have properly locked the stream
63 // !! before asking for nodes.
64 //
65 // Note: This code will fail if the block size is smaller than the node size!
66 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
67 // is hard-coded to 1024 bytes, that's not an issue now.
68 
69 void
70 CachedNode::UnsetUnchanged(Transaction &transaction)
71 {
72 	if (fTree == NULL || fTree->fStream == NULL)
73 		return;
74 
75 	if (fNode != NULL) {
76 		void *cache = fTree->fStream->GetVolume()->BlockCache();
77 
78 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
79 		block_cache_put(cache, fBlockNumber);
80 		fNode = NULL;
81 	}
82 }
83 
84 
85 void
86 CachedNode::Unset()
87 {
88 	if (fTree == NULL || fTree->fStream == NULL)
89 		return;
90 
91 	if (fNode != NULL) {
92 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber);
93 		fNode = NULL;
94 	}
95 }
96 
97 
98 const bplustree_node *
99 CachedNode::SetTo(off_t offset, bool check)
100 {
101 	if (fTree == NULL || fTree->fStream == NULL) {
102 		REPORT_ERROR(B_BAD_VALUE);
103 		return NULL;
104 	}
105 
106 	Unset();
107 
108 	// You can only ask for nodes at valid positions - you can't
109 	// even access the b+tree header with this method (use SetToHeader()
110 	// instead)
111 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
112 		|| offset <= 0
113 		|| (offset % fTree->fNodeSize) != 0)
114 		return NULL;
115 
116 	if (InternalSetTo(NULL, offset) != NULL && check) {
117 		// sanity checks (links, all_key_count)
118 		if (!fTree->fHeader->CheckNode(fNode)) {
119 			FATAL(("invalid node (%p) read from offset %Ld, inode at %Ld\n",
120 				fNode, offset, fTree->fStream->ID()));
121 			return NULL;
122 		}
123 	}
124 	return fNode;
125 }
126 
127 
128 bplustree_node *
129 CachedNode::SetToWritable(Transaction &transaction, off_t offset, bool check)
130 {
131 	if (fTree == NULL || fTree->fStream == NULL) {
132 		REPORT_ERROR(B_BAD_VALUE);
133 		return NULL;
134 	}
135 
136 	Unset();
137 
138 	// You can only ask for nodes at valid positions - you can't
139 	// even access the b+tree header with this method (use SetToHeader()
140 	// instead)
141 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
142 		|| offset <= 0
143 		|| (offset % fTree->fNodeSize) != 0)
144 		return NULL;
145 
146 	if (InternalSetTo(&transaction, offset) != NULL && check) {
147 		// sanity checks (links, all_key_count)
148 		if (!fTree->fHeader->CheckNode(fNode)) {
149 			FATAL(("invalid node [%p] read from offset %Ld, inode at %Ld\n",
150 				fNode, offset, fTree->fStream->ID()));
151 			return NULL;
152 		}
153 	}
154 	return fNode;
155 }
156 
157 
158 bplustree_node *
159 CachedNode::MakeWritable(Transaction &transaction)
160 {
161 	if (fNode == NULL)
162 		return NULL;
163 
164 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
165 			fBlockNumber, transaction.ID()) == B_OK)
166 		return fNode;
167 
168 	return NULL;
169 }
170 
171 
bplustree_header *
CachedNode::MakeWritableHeader(Transaction &transaction)
{
	// The header occupies a block just like any node does, so
	// MakeWritable() can be reused - only the return type differs.
	return (bplustree_header *)MakeWritable(transaction);
}
177 
178 
179 const bplustree_header *
180 CachedNode::SetToHeader()
181 {
182 	if (fTree == NULL || fTree->fStream == NULL) {
183 		REPORT_ERROR(B_BAD_VALUE);
184 		return NULL;
185 	}
186 
187 	Unset();
188 
189 	InternalSetTo(NULL, 0LL);
190 	return (bplustree_header *)fNode;
191 }
192 
193 
194 bplustree_header *
195 CachedNode::SetToWritableHeader(Transaction &transaction)
196 {
197 	if (fTree == NULL || fTree->fStream == NULL) {
198 		REPORT_ERROR(B_BAD_VALUE);
199 		return NULL;
200 	}
201 
202 	Unset();
203 
204 	InternalSetTo(&transaction, 0LL);
205 	return (bplustree_header *)fNode;
206 }
207 
208 
bplustree_node *
CachedNode::InternalSetTo(Transaction *transaction, off_t offset)
{
	// Maps the node at the given stream offset via the block cache and
	// stores the result in fNode (NULL on failure). With a non-NULL
	// "transaction" the block is fetched writable within that transaction,
	// otherwise read-only.
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume *volume = fTree->fStream->GetVolume();

		// the run found may span several blocks - pick the one that
		// contains "offset" ("fileOffset" is where the run starts)
		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8 *block;

		if (transaction != NULL) {
			block = (uint8 *)block_cache_get_writable(volume->BlockCache(), fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8 *)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block) {
			// the node is somewhere in that block... (confusing offset calculation)
			// this is the remainder of offset within the chosen block, i.e.
			// (offset - fileOffset) modulo the block size
			fNode = (bplustree_node *)(block + offset -
						(fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}
242 
243 
status_t
CachedNode::Free(Transaction &transaction, off_t offset)
{
	// Returns the node at "offset" to the tree: if it is the last node in
	// the stream the tree is shrunk by one node, otherwise the node is
	// prepended to the free nodes list.
	// NOTE(review): fNode is written below, so the caller presumably
	// obtained this node writable already - confirm at the call sites.
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// ToDo: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done everytime that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	bplustree_header *header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}

	// add the node to the free nodes list
	fNode->left_link = header->free_node_pointer;
		// both fields are in disk byte order, so no conversion is needed
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}
278 
279 
status_t
CachedNode::Allocate(Transaction &transaction, bplustree_node **_node, off_t *_offset)
{
	// Hands out a fresh, initialized node: either recycles the head of the
	// free nodes list, or grows the underlying stream by one node size.
	// On success, *_node points to the writable node and *_offset to its
	// position in the stream.
	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header *header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer
		// (free nodes are chained via their left_link, kept in disk order)
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode *stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() + fTree->fNodeSize);

	if (SetToWritable(transaction, offset, false) != NULL) {
		fNode->Initialize();

		*_offset = offset;
		*_node = fNode;
		return B_OK;
	}

	// revert header size to old value
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() - fTree->fNodeSize);
	RETURN_ERROR(B_ERROR);
}
332 
333 
334 //	#pragma mark -
335 
336 
337 BPlusTree::BPlusTree(Transaction &transaction, Inode *stream, int32 nodeSize)
338 	:
339 	fStream(NULL),
340 	fHeader(NULL),
341 	fCachedHeader(this)
342 {
343 	SetTo(transaction, stream);
344 }
345 
346 
/**	Opens an existing on-disk b+tree from the given stream; check
 *	InitCheck() for the result.
 */
BPlusTree::BPlusTree(Inode *stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	SetTo(stream);
}
355 
356 
/**	Creates an uninitialized tree (InitCheck() reports B_NO_INIT); a
 *	later SetTo() call is required before the tree can be used.
 */
BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
}
367 
368 
369 BPlusTree::~BPlusTree()
370 {
371 	// if there are any TreeIterators left, we need to stop them
372 	// (can happen when the tree's inode gets deleted while
373 	// traversing the tree - a TreeIterator doesn't lock the inode)
374 	if (fIteratorLock.Lock() < B_OK)
375 		return;
376 
377 	TreeIterator *iterator = NULL;
378 	while ((iterator = fIterators.Next(iterator)) != NULL)
379 		iterator->Stop();
380 
381 	fIteratorLock.Unlock();
382 }
383 
384 
385 /** Create a new B+Tree on the specified stream */
386 
status_t
BPlusTree::SetTo(Transaction &transaction, Inode *stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	bplustree_header *header = fCachedHeader.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		// (a header block plus one root node)
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = fCachedHeader.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fHeader = header;
	// duplicates are allowed in indices (except for the index directory
	// itself), or whenever S_ALLOW_DUPS is set explicitly
	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
						&& stream->BlockRun() != stream->Parent())
						|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header (all fields are stored in disk byte order)
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	// initialize b+tree root node
	CachedNode cached(this);
	cached.SetToWritable(transaction, header->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}
432 
433 
/**	Attaches this in-memory tree to an existing on-disk b+tree and
 *	validates its header; the result is stored in fStatus (see
 *	InitCheck()) and also returned.
 */
status_t
BPlusTree::SetTo(Inode *stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	// get on-disk B+Tree header

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL)
		RETURN_ERROR(fStatus = B_NO_INIT);

	// is header valid?

	if (fHeader->MaximumSize() != stream->Size()) {
		FATAL(("B+tree header size %Ld doesn't fit file size %Ld!\n",
			fHeader->MaximumSize(), stream->Size()));
		// we can't change the header since we don't have a transaction
		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader->Magic() != BPLUSTREE_MAGIC
		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
		|| !fHeader->IsValidLink(fHeader->RootNode())
		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(fHeader);
		dump_block((const char *)fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader->NodeSize();

	// check that the tree's data type matches the stream's index type
	{
		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
						   S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
						   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);

		// note that the first test short-circuits the expression, which
		// keeps the toMode[] access below within bounds (DataType() is
		// then known to be <= BPLUSTREE_DOUBLE_TYPE); && binds tighter
		// than ||, so the toMode[] comparison only applies to index dirs
		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
			|| (stream->Mode() & S_INDEX_DIR) && toMode[fHeader->DataType()] != mode
			|| !stream->IsContainer()) {
			D(	dump_bplustree_header(fHeader);
				dump_inode(&stream->Node());
			);
			RETURN_ERROR(fStatus = B_BAD_TYPE);
		}

		// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
		// in the original BFS code - we will honour it nevertheless
		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
							&& stream->BlockRun() != stream->Parent())
							|| (stream->Mode() & S_ALLOW_DUPS) != 0;
	}

	// finally make sure the root node can be read
	CachedNode cached(this, fHeader->RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}
495 
496 
/**	Returns the status of the last SetTo() call (B_NO_INIT if the tree
 *	was never initialized).
 */
status_t
BPlusTree::InitCheck()
{
	return fStatus;
}
502 
503 
504 int32
505 BPlusTree::TypeCodeToKeyType(type_code code)
506 {
507 	switch (code) {
508 		case B_STRING_TYPE:
509 			return BPLUSTREE_STRING_TYPE;
510 		case B_SSIZE_T_TYPE:
511 		case B_INT32_TYPE:
512 			return BPLUSTREE_INT32_TYPE;
513 		case B_SIZE_T_TYPE:
514 		case B_UINT32_TYPE:
515 			return BPLUSTREE_UINT32_TYPE;
516 		case B_OFF_T_TYPE:
517 		case B_INT64_TYPE:
518 			return BPLUSTREE_INT64_TYPE;
519 		case B_UINT64_TYPE:
520 			return BPLUSTREE_UINT64_TYPE;
521 		case B_FLOAT_TYPE:
522 			return BPLUSTREE_FLOAT_TYPE;
523 		case B_DOUBLE_TYPE:
524 			return BPLUSTREE_DOUBLE_TYPE;
525 	}
526 	return -1;
527 }
528 
529 
530 int32
531 BPlusTree::ModeToKeyType(mode_t mode)
532 {
533 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
534 				   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
535 		case S_INT_INDEX:
536 			return BPLUSTREE_INT32_TYPE;
537 		case S_UINT_INDEX:
538 			return BPLUSTREE_UINT32_TYPE;
539 		case S_LONG_LONG_INDEX:
540 			return BPLUSTREE_INT64_TYPE;
541 		case S_ULONG_LONG_INDEX:
542 			return BPLUSTREE_UINT64_TYPE;
543 		case S_FLOAT_INDEX:
544 			return BPLUSTREE_FLOAT_TYPE;
545 		case S_DOUBLE_INDEX:
546 			return BPLUSTREE_DOUBLE_TYPE;
547 		case S_STR_INDEX:
548 		default:
549 			// default is for standard directories
550 			return BPLUSTREE_STRING_TYPE;
551 	}
552 }
553 
554 
555 //	#pragma mark -
556 
557 
558 void
559 BPlusTree::UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt,
560 	int8 change)
561 {
562 	// Although every iterator which is affected by this update currently
563 	// waits on a semaphore, other iterators could be added/removed at
564 	// any time, so we need to protect this loop
565 	if (fIteratorLock.Lock() < B_OK)
566 		return;
567 
568 	TreeIterator *iterator = NULL;
569 	while ((iterator = fIterators.Next(iterator)) != NULL)
570 		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);
571 
572 	fIteratorLock.Unlock();
573 }
574 
575 
576 void
577 BPlusTree::AddIterator(TreeIterator *iterator)
578 {
579 	if (fIteratorLock.Lock() < B_OK)
580 		return;
581 
582 	fIterators.Add(iterator);
583 
584 	fIteratorLock.Unlock();
585 }
586 
587 
588 void
589 BPlusTree::RemoveIterator(TreeIterator *iterator)
590 {
591 	if (fIteratorLock.Lock() < B_OK)
592 		return;
593 
594 	fIterators.Remove(iterator);
595 
596 	fIteratorLock.Unlock();
597 }
598 
599 
600 int32
601 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
602 {
603 	type_code type = 0;
604 	switch (fHeader->data_type) {
605 	    case BPLUSTREE_STRING_TYPE:
606 	    	type = B_STRING_TYPE;
607 	    	break;
608 		case BPLUSTREE_INT32_TYPE:
609 	    	type = B_INT32_TYPE;
610 	    	break;
611 		case BPLUSTREE_UINT32_TYPE:
612 	    	type = B_UINT32_TYPE;
613 	    	break;
614 		case BPLUSTREE_INT64_TYPE:
615 	    	type = B_INT64_TYPE;
616 	    	break;
617 		case BPLUSTREE_UINT64_TYPE:
618 	    	type = B_UINT64_TYPE;
619 	    	break;
620 		case BPLUSTREE_FLOAT_TYPE:
621 	    	type = B_FLOAT_TYPE;
622 	    	break;
623 		case BPLUSTREE_DOUBLE_TYPE:
624 	    	type = B_DOUBLE_TYPE;
625 	    	break;
626 	}
627    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
628 }
629 
630 
/**	Binary-searches "node" for "key". On success, returns B_OK and fills
 *	in the key's index and its value (on index nodes, the child node to
 *	descend into). On B_ENTRY_NOT_FOUND, _index receives the position
 *	where the key would have to be inserted, and _next the appropriate
 *	link to follow. Both output pointers may be NULL.
 */
status_t
BPlusTree::FindKey(const bplustree_node *node, const uint8 *key, uint16 keyLength,
	uint16 *_index, off_t *_next)
{
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "find");
#endif

	// zero is the same in either byte order, so the raw field can be
	// tested without conversion here
	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t *values = node->Values();
	int16 saveIndex = -1;
		// insertion position; set by the first loop iteration at latest

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8 *searchKey = node->KeyAt(i, &searchLength);
		// guard against corrupted on-disk key lengths before the key data
		// is dereferenced
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			// key sorts before index i
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			// key sorts after index i
			saveIndex = first = i + 1;
		} else {
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	if (_index)
		*_index = saveIndex;
	if (_next) {
		// past the last key, the overflow link is the next child to follow
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}
687 
688 
689 /**	Prepares the stack to contain all nodes that were passed while
690  *	following the key, from the root node to the leaf node that could
691  *	or should contain that key.
692  */
693 
status_t
BPlusTree::SeekDown(Stack<node_and_key> &stack, const uint8 *key, uint16 keyLength)
{
	// Descends from the root towards the leaf that could or should
	// contain "key", pushing every visited node (with the key index that
	// was followed) onto the stack; the leaf ends up on top.

	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader->RootNode();

	CachedNode cached(this);
	const bplustree_node *node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex, &nextOffset);

		// a link pointing back to the same node would loop forever
		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::SeekDown() could not open node %Ld\n", nodeAndKey.nodeOffset));
	return B_ERROR;
}
728 
729 
730 /**	This will find a free duplicate fragment in the given bplustree_node.
731  *	The CachedNode will be set to the writable fragment on success.
732  */
733 
status_t
BPlusTree::FindFreeDuplicateFragment(Transaction &transaction,
	const bplustree_node *node, CachedNode &cached,
	off_t *_offset, bplustree_node **_fragment, uint32 *_index)
{
	// Scans all values of "node" for an existing duplicate fragment node
	// that still has an unused slot. On success the fragment is mapped
	// writable via "cached", and its offset and free slot index are
	// returned; otherwise B_ENTRY_NOT_FOUND.
	off_t *values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		const bplustree_node *fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %Ld\n", value));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array *array = fragment->FragmentAt(j);

			// a count of zero is endian-neutral, so the raw field can be
			// tested directly
			if (array->count == 0) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}
773 
774 
status_t
BPlusTree::InsertDuplicate(Transaction &transaction, CachedNode &cached,
	const bplustree_node *node, uint16 index, off_t value)
{
	// Inserts "value" as a duplicate of the key at "index" in "node".
	// Duplicates first live in a shared fragment node (up to
	// NUM_FRAGMENT_VALUES values); once a fragment overflows, they are
	// moved into a linked list of dedicated duplicate nodes.
	// Note: "values" points directly into the cached block, so writing
	// through it after cached.MakeWritable() modifies the block in place.
	CachedNode cachedDuplicate(this);
	off_t *values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
				bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",
					bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue),
					array->count));
				return B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				array->Insert(value);
			} else {
				// test if the fragment will be empty if we remove this key's values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// the node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					// move our array to the start of the node and clear
					// the links, turning the fragment into a duplicate node
					memmove(duplicate->DuplicateArray(), array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(
						(uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// the old duplicate has not been touched, so we can reuse it

					bplustree_node *newDuplicate;
					status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
					if (status < B_OK)
						RETURN_ERROR(status);

					// copy the array from the fragment node to the duplicate node
					// and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
					BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key
		// (walking the right_link chain until a non-full node is found)
		duplicate_array *array;
		const bplustree_node *duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
					duplicateOffset, array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node *writableDuplicate = cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (array->count < NUM_DUPLICATE_VALUES) {
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node *newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
			if (status < B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node *fragment;
	if (FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		if ((status = cachedDuplicate.Allocate(transaction, &fragment, &offset)) < B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	// the fragment slot now holds both the previous single value and the
	// new duplicate
	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}
926 
927 
void
BPlusTree::InsertKey(bplustree_node *node, uint16 index, uint8 *key, uint16 keyLength,
	off_t value)
{
	// Inserts key/value at "index" into a writable "node", shifting the
	// key data, key length index, and value arrays in place. The caller
	// is responsible for ensuring the key actually fits into the node.
	// NOTE(review): the keyLengths entries are used in arithmetic without
	// endian conversion here - this assumes host byte order matches the
	// on-disk byte order; confirm against the rest of the code base.

	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t *values = node->Values();
	uint16 *keyLengths = node->KeyLengths();
	uint8 *keys = node->Keys();

	// updating the counts first shifts the positions that Values() and
	// KeyLengths() compute, hence the old/new pointer pairs below
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() + keyLength);

	off_t *newValues = node->Values();
	uint16 *newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index, sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index
	// (entries are cumulative end offsets, so the ones behind the new key
	// grow by keyLength)
	for (uint16 i = node->NumKeys(); i-- > index + 1;)
		newKeyLengths[i] = keyLengths[i - 1] + keyLength;
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0);

	// move keys and copy new key into them
	int32 size = node->AllKeyLength() - newKeyLengths[index];
	if (size > 0)
		memmove(keys + newKeyLengths[index], keys + newKeyLengths[index] - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}
967 
968 
969 /**	Splits the \a node into two halves - the other half will be put into \a other.
970  *	It also takes care to create a new overflow link if the node to split is an
971  *	index node.
972  */
973 
974 status_t
975 BPlusTree::SplitNode(bplustree_node *node, off_t nodeOffset, bplustree_node *other,
976 	off_t otherOffset, uint16 *_keyIndex, uint8 *key, uint16 *_keyLength, off_t *_value)
977 {
978 	if (*_keyIndex > node->NumKeys() + 1)
979 		return B_BAD_VALUE;
980 
981 	uint16 *inKeyLengths = node->KeyLengths();
982 	off_t *inKeyValues = node->Values();
983 	uint8 *inKeys = node->Keys();
984 	uint8 *outKeys = other->Keys();
985 	int32 keyIndex = *_keyIndex;	// can become less than zero!
986 
987 	// how many keys will fit in one (half) page?
988 	// that loop will find the answer to this question and
989 	// change the key lengths indices for their new home
990 
991 	// "bytes" is the number of bytes written for the new key,
992 	// "bytesBefore" are the bytes before that key
993 	// "bytesAfter" are the bytes after the new key, if any
994 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
995 
996 	size_t size = fNodeSize >> 1;
997 	int32 out, in;
998 	for (in = out = 0; in < node->NumKeys() + 1;) {
999 		if (!bytes)
1000 			bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0;
1001 
1002 		if (in == keyIndex && !bytes) {
1003 			bytes = *_keyLength;
1004 		} else {
1005 			if (keyIndex < out)
1006 				bytesAfter = inKeyLengths[in] - bytesBefore;
1007 
1008 			in++;
1009 		}
1010 		out++;
1011 
1012 		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
1013 				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1014 			// we have found the number of keys in the new node!
1015 			break;
1016 		}
1017 	}
1018 
1019 	// if the new key was not inserted, set the length of the keys
1020 	// that can be copied directly
1021 	if (keyIndex >= out && in > 0)
1022 		bytesBefore = inKeyLengths[in - 1];
1023 
1024 	if (bytesBefore < 0 || bytesAfter < 0)
1025 		return B_BAD_DATA;
1026 
1027 	other->left_link = node->left_link;
1028 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1029 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter);
1030 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1031 
1032 	uint16 *outKeyLengths = other->KeyLengths();
1033 	off_t *outKeyValues = other->Values();
1034 	int32 keys = out > keyIndex ? keyIndex : out;
1035 
1036 	if (bytesBefore) {
1037 		// copy the keys
1038 		memcpy(outKeys, inKeys, bytesBefore);
1039 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1040 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1041 	}
1042 	if (bytes) {
1043 		// copy the newly inserted key
1044 		memcpy(outKeys + bytesBefore, key, bytes);
1045 		outKeyLengths[keyIndex] = bytes + bytesBefore;
1046 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1047 
1048 		if (bytesAfter) {
1049 			// copy the keys after the new key
1050 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, bytesAfter);
1051 			keys = out - keyIndex - 1;
1052 			for (int32 i = 0;i < keys;i++)
1053 				outKeyLengths[keyIndex + i + 1] = inKeyLengths[keyIndex + i] + bytes;
1054 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, keys * sizeof(off_t));
1055 		}
1056 	}
1057 
1058 	// if the new key was already inserted, we shouldn't use it again
1059 	if (in != out)
1060 		keyIndex--;
1061 
1062 	int32 total = bytesBefore + bytesAfter;
1063 
1064 	// these variables are for the key that will be returned
1065 	// to the parent node
1066 	uint8 *newKey = NULL;
1067 	uint16 newLength;
1068 	bool newAllocated = false;
1069 
1070 	// If we have split an index node, we have to drop the first key
1071 	// of the next node (which can also be the new key to insert).
1072 	// The dropped key is also the one which has to be inserted in
1073 	// the parent node, so we will set the "newKey" already here.
1074 	if (node->OverflowLink() != BPLUSTREE_NULL) {
1075 		if (in == keyIndex) {
1076 			newKey = key;
1077 			newLength = *_keyLength;
1078 
1079 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1080 			keyIndex--;
1081 		} else {
1082 			// If a key is dropped (is not the new key), we have to copy
1083 			// it, because it would be lost if not.
1084 			uint8 *droppedKey = node->KeyAt(in, &newLength);
1085 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
1086 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1087 				fStream->GetVolume()->Panic();
1088 				RETURN_ERROR(B_BAD_DATA);
1089 			}
1090 			newKey = (uint8 *)malloc(newLength);
1091 			if (newKey == NULL)
1092 				return B_NO_MEMORY;
1093 			memcpy(newKey, droppedKey, newLength);
1094 
1095 			other->overflow_link = inKeyValues[in];
1096 			total = inKeyLengths[in++];
1097 		}
1098 	}
1099 
1100 	// and now the same game for the other page and the rest of the keys
1101 	// (but with memmove() instead of memcpy(), because they may overlap)
1102 
1103 	bytesBefore = bytesAfter = bytes = 0;
1104 	out = 0;
1105 	int32 skip = in;
1106 	while (in < node->NumKeys() + 1) {
1107 		if (in == keyIndex && !bytes) {
1108 			// it's enough to set bytesBefore once here, because we do
1109 			// not need to know the exact length of all keys in this
1110 			// loop
1111 			bytesBefore = in > skip ? inKeyLengths[in - 1] : 0;
1112 			bytes = *_keyLength;
1113 		} else {
1114 			if (in < node->NumKeys()) {
1115 				inKeyLengths[in] -= total;
1116 				if (bytes) {
1117 					inKeyLengths[in] += bytes;
1118 					bytesAfter = inKeyLengths[in] - bytesBefore - bytes;
1119 				}
1120 			}
1121 			in++;
1122 		}
1123 
1124 		out++;
1125 
1126 		// break out when all keys are done
1127 		if (in > node->NumKeys() && keyIndex < in)
1128 			break;
1129 	}
1130 
1131 	// adjust the byte counts (since we were a bit lazy in the loop)
1132 	if (keyIndex >= in && keyIndex - skip < out)
1133 		bytesAfter = inKeyLengths[in] - bytesBefore - total;
1134 	else if (keyIndex < skip)
1135 		bytesBefore = node->AllKeyLength() - total;
1136 
1137 	if (bytesBefore < 0 || bytesAfter < 0)
1138 		return B_BAD_DATA;
1139 
1140 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1141 		// right link, and overflow link can stay the same
1142 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter);
1143 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out - 1);
1144 
1145 	// array positions have changed
1146 	outKeyLengths = node->KeyLengths();
1147 	outKeyValues = node->Values();
1148 
1149 	// move the keys in the old node: the order is important here,
1150 	// because we don't want to overwrite any contents
1151 
1152 	keys = keyIndex <= skip ? out : keyIndex - skip;
1153 	keyIndex -= skip;
1154 
1155 	if (bytesBefore)
1156 		memmove(inKeys, inKeys + total, bytesBefore);
1157 	if (bytesAfter)
1158 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, bytesAfter);
1159 
1160 	if (bytesBefore)
1161 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1162 	in = out - keyIndex - 1;
1163 	if (bytesAfter)
1164 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, in * sizeof(uint16));
1165 
1166 	if (bytesBefore)
1167 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1168 	if (bytesAfter)
1169 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, in * sizeof(off_t));
1170 
1171 	if (bytes) {
1172 		// finally, copy the newly inserted key (don't overwrite anything)
1173 		memcpy(inKeys + bytesBefore, key, bytes);
1174 		outKeyLengths[keyIndex] = bytes + bytesBefore;
1175 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1176 	}
1177 
1178 	// Prepare the key that will be inserted in the parent node which
1179 	// is either the dropped key or the last of the other node.
1180 	// If it's the dropped key, "newKey" was already set earlier.
1181 
1182 	if (newKey == NULL)
1183 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1184 
1185 	memcpy(key, newKey, newLength);
1186 	*_keyLength = newLength;
1187 	*_value = otherOffset;
1188 
1189 	if (newAllocated)
1190 		free(newKey);
1191 
1192 	return B_OK;
1193 }
1194 
1195 
1196 /**	This inserts a key into the tree. The changes made to the tree will
1197  *	all be part of the \a transaction.
1198  */
1199 
status_t
BPlusTree::Insert(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %Ld!\n", value);
#endif

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	// SeekDown() records the path from the root down to the target leaf
	// on the stack, so that we can ascend it again if nodes must be split
	Stack<node_and_key> stack;
	if (SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	// Work on a null-terminated copy of the key - SplitNode() overwrites
	// this buffer (and keyLength/value) with the separator key that has
	// to be inserted into the parent node
	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node *node;

	// Ascend the recorded path: the first iteration works on the leaf;
	// further iterations only happen when a node had to be split and the
	// separator key must be inserted into the parent
	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = InsertDuplicate(transaction, cached, node, nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				RETURN_ERROR(B_NAME_IN_USE);
			}
		}

		bplustree_node *writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
		if (int32(round_up(sizeof(bplustree_node) + writableNode->AllKeyLength() + keyLength)
			+ (writableNode->NumKeys() + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
		{
			// the key fits - insert it here and we are done
			InsertKey(writableNode, nodeAndKey.keyIndex, keyBuffer, keyLength, value);
			UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			// the node is full and has to be split in two
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_node *root;
				status_t status = cachedNewRoot.Allocate(transaction, &root, &newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuild next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node *other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other, &otherOffset);
			if (status < B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			// SplitNode() rewrites keyBuffer/keyLength/value with the key
			// that has to be propagated into the parent node
			if (SplitNode(writableNode, nodeAndKey.nodeOffset, other, otherOffset,
					&nodeAndKey.keyIndex, keyBuffer, &keyLength, &value) < B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			UpdateIterators(nodeAndKey.nodeOffset, otherOffset, nodeAndKey.keyIndex,
				writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction, other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node *root = cachedNewRoot.Node();

				InsertKey(root, 0, keyBuffer, keyLength, writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(nodeAndKey.nodeOffset);

				bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
		}
	}
	RETURN_ERROR(B_ERROR);
}
1331 
1332 
1333 /**	Removes the duplicate index/value pair from the tree.
1334  *	It's part of the private tree interface.
1335  */
1336 
status_t
BPlusTree::RemoveDuplicate(Transaction &transaction, const bplustree_node *node,
	CachedNode &cached, uint16 index, off_t value)
{
	// "oldValue" is the duplicate link stored in the leaf - it either
	// points into a fragment node or to the first duplicate node
	off_t *values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);

	CachedNode cachedDuplicate(this);
	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
		duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));

		// a fragment array must hold between 1 and NUM_FRAGMENT_VALUES
		// entries - anything else indicates on-disk corruption
		if (array->count > NUM_FRAGMENT_VALUES
			|| array->count < 1) {
			FATAL(("removeDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",
				bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count));
			return B_BAD_DATA;
		}
		if (!array->Remove(value)) {
			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
				value, duplicateOffset));
			return B_ENTRY_NOT_FOUND;
		}

		// remove the array from the fragment node if it is empty
		if (array->count == 1) {
			// set the link to the remaining value
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free just the array
			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
				status_t status = cachedDuplicate.Free(transaction, duplicateOffset);
				if (status < B_OK)
					return status;
			} else
				array->count = 0;
		}
		return B_OK;
	}

	// Remove value from a duplicate node!

	duplicate_array *array = NULL;

	// the leaf's link must point to the head of the duplicate chain
	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %Ld!\n",
			duplicate->LeftLink()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		if (array->count > NUM_DUPLICATE_VALUES
			|| array->count < 0) {
			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
				duplicateOffset, array->count));
			return B_BAD_DATA;
		}

		if (array->Remove(value))
			break;

		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);

		// this node didn't contain the value, follow the right link;
		// UnsetUnchanged() drops the node without marking it dirty
		cachedDuplicate.UnsetUnchanged(transaction);
		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	// The entry got removed from the duplicate node, but we might want to free
	// it now in case it's empty

	// This loop only repeats via "continue" when a neighbouring node has
	// become the last of the chain and may be collapsed in turn
	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

		if ((isLast && array->count == 1) || array->count == 0) {
			// Free empty duplicate page, link their siblings together, and
			// update the duplicate link if needed (ie. when we either remove
			// the last duplicate node or have a new first one)

			if (left == BPLUSTREE_NULL) {
				// the duplicate link points to us
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				if (array->count == 1) {
					// this is the last node, and there is only one value left;
					// replace the duplicate link with that value, it's no duplicate
					// anymore
					values[index] = array->values[0];
				} else {
					// move the duplicate link to the next node
					values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
						BPLUSTREE_DUPLICATE_NODE, right));
				}
			}

			status_t status;
			if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK)
				return status;

			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}
			}
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
					continue;
				}
			}
			return B_OK;
		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);

			bplustree_node *fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (FindFreeDuplicateFragment(transaction, node, cachedOther,
					&offset, &fragment, &fragmentIndex) == B_OK) {
				// move to other node
				duplicate_array *target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			} else {
				// convert node
				memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			}

			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
		}
		return B_OK;
	}
}
1512 
1513 
/** Removes the key with the given index from the specified node.
 *	Since it has to get the key from the node anyway (to obtain its
 *	pointer), there is no need to pass the key & its length, although
 *	the calling method (BPlusTree::Remove()) has this data.
 */
1519 
1520 void
1521 BPlusTree::RemoveKey(bplustree_node *node, uint16 index)
1522 {
1523 	// should never happen, but who knows?
1524 	if (index > node->NumKeys() && node->NumKeys() > 0) {
1525 		FATAL(("Asked me to remove key outer limits: %u\n", index));
1526 		return;
1527 	}
1528 
1529 	off_t *values = node->Values();
1530 
1531 	// if we would have to drop the overflow link, drop
1532 	// the last key instead and update the overflow link
1533 	// to the value of that one
1534 	if (!node->IsLeaf() && index == node->NumKeys())
1535 		node->overflow_link = values[--index];
1536 
1537 	uint16 length;
1538 	uint8 *key = node->KeyAt(index, &length);
1539 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
1540 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1541 		FATAL(("Key length to long: %s, %u (inode at %ld,%u)\n", key, length,
1542 			fStream->BlockRun().allocation_group, fStream->BlockRun().start));
1543 		fStream->GetVolume()->Panic();
1544 		return;
1545 	}
1546 
1547 	uint16 *keyLengths = node->KeyLengths();
1548 	uint8 *keys = node->Keys();
1549 
1550 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
1551 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length);
1552 
1553 	off_t *newValues = node->Values();
1554 	uint16 *newKeyLengths = node->KeyLengths();
1555 
1556 	// move key data
1557 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
1558 
1559 	// move and update key lengths
1560 	if (index > 0 && newKeyLengths != keyLengths)
1561 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
1562 	for (uint16 i = index; i < node->NumKeys(); i++)
1563 		newKeyLengths[i] = keyLengths[i + 1] - length;
1564 
1565 	// move values
1566 	if (index > 0)
1567 		memmove(newValues, values, index * sizeof(off_t));
1568 	if (node->NumKeys() > index)
1569 		memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t));
1570 }
1571 
1572 
1573 /**	Removes the specified key from the tree. The "value" parameter is only used
1574  *	for trees which allow duplicates, so you may safely ignore it.
1575  *	It's not an optional parameter, so at least you have to think about it.
1576  */
1577 
status_t
BPlusTree::Remove(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	// SeekDown() records the path from the root to the leaf on the stack
	Stack<node_and_key> stack;
	if (SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	node_and_key nodeAndKey;
	const bplustree_node *node;

	// Ascend the path: the leaf is handled first; the loop only continues
	// upwards when a node became empty and was freed below
	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "remove");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);
			if (status < B_OK)
				RETURN_ERROR(status);

			// If we will remove the last key, the iterator will be set
			// to the next node after the current - if there aren't any
			// more nodes, we need a way to prevent the TreeIterators to
			// touch the old node again, we use BPLUSTREE_FREE for this
			off_t next = node->RightLink() == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->RightLink();
			UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 ?
								next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);

			// is this a duplicate entry?
			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
					node->Values()[nodeAndKey.keyIndex]))) {
				if (fAllowDuplicates) {
					return RemoveDuplicate(transaction, node, cached,
						nodeAndKey.keyIndex, value);
				} else {
					FATAL(("dupliate node found where no duplicates are allowed!\n"));
					RETURN_ERROR(B_ERROR);
				}
			}
		}

		bplustree_node *writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// if it's an empty root node, we have to convert it
		// to a leaf node by dropping the overflow link, or,
		// if it's a leaf node, just empty it
		// (note, the condition groups by C precedence as
		// "(root && no keys) || (leaf && one key)")
		if (nodeAndKey.nodeOffset == fHeader->RootNode()
			&& node->NumKeys() == 0
			|| node->NumKeys() == 1 && node->IsLeaf()) {
			writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
			writableNode->all_key_count = 0;
			writableNode->all_key_length = 0;

			// if we've cleared the root node, reset the maximum
			// number of levels in the header
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
			}
			return B_OK;
		}

		// if there is only one key left, we don't have to remove
		// it, we can just dump the node (index nodes still have
		// the overflow link, so we have to drop the last key)
		if (writableNode->NumKeys() > 1
			|| !writableNode->IsLeaf() && writableNode->NumKeys() == 1) {
			RemoveKey(writableNode, nodeAndKey.keyIndex);
			return B_OK;
		}

		// when we are here, we can just free the node, but
		// we have to update the right/left link of the
		// siblings first
		CachedNode otherCached(this);
		bplustree_node *other = otherCached.SetToWritable(transaction,
			writableNode->LeftLink());
		if (other != NULL)
			other->right_link = writableNode->right_link;

		if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL)
			other->left_link = writableNode->left_link;

		// free the node and continue upwards to remove its key
		// from the parent
		cached.Free(transaction, nodeAndKey.nodeOffset);
	}
	RETURN_ERROR(B_ERROR);
}
1677 
1678 
1679 /**	Replaces the value for the key in the tree.
1680  *	Returns B_OK if the key could be found and its value replaced,
1681  *	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
1682  *	to indicate that something went terribly wrong.
1683  *	Note that this doesn't work with duplicates - it will just
1684  *	return B_BAD_TYPE if you call this function on a tree where
1685  *	duplicates are allowed.
1686  */
1687 
1688 status_t
1689 BPlusTree::Replace(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value)
1690 {
1691 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
1692 		|| key == NULL)
1693 		RETURN_ERROR(B_BAD_VALUE);
1694 
1695 	if (fAllowDuplicates)
1696 		RETURN_ERROR(B_BAD_TYPE);
1697 
1698 	// lock access to stream (a read lock is okay for this purpose)
1699 	ReadLocked locked(fStream->Lock());
1700 
1701 	off_t nodeOffset = fHeader->RootNode();
1702 	CachedNode cached(this);
1703 	const bplustree_node *node;
1704 
1705 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1706 		uint16 keyIndex = 0;
1707 		off_t nextOffset;
1708 		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1709 
1710 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1711 			if (status == B_OK) {
1712 				bplustree_node *writableNode = cached.MakeWritable(transaction);
1713 				if (writableNode != NULL)
1714 					writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value);
1715 				else
1716 					status = B_IO_ERROR;
1717 			}
1718 
1719 			return status;
1720 		} else if (nextOffset == nodeOffset)
1721 			RETURN_ERROR(B_ERROR);
1722 
1723 		nodeOffset = nextOffset;
1724 	}
1725 	RETURN_ERROR(B_ERROR);
1726 }
1727 
1728 
1729 /**	Searches the key in the tree, and stores the offset found in
1730  *	_value, if successful.
1731  *	It's very similar to BPlusTree::SeekDown(), but doesn't fill
1732  *	a stack while it descends the tree.
1733  *	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND
1734  *	if not. It can also return other errors to indicate that
1735  *	something went wrong.
1736  *	Note that this doesn't work with duplicates - it will just
1737  *	return B_BAD_TYPE if you call this function on a tree where
1738  *	duplicates are allowed.
1739  */
1740 
1741 status_t
1742 BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value)
1743 {
1744 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
1745 		|| key == NULL)
1746 		RETURN_ERROR(B_BAD_VALUE);
1747 
1748 	if (fAllowDuplicates)
1749 		RETURN_ERROR(B_BAD_TYPE);
1750 
1751 	// lock access to stream
1752 	ReadLocked locked(fStream->Lock());
1753 
1754 	off_t nodeOffset = fHeader->RootNode();
1755 	CachedNode cached(this);
1756 	const bplustree_node *node;
1757 
1758 #ifdef DEBUG
1759 	int32 levels = 0;
1760 #endif
1761 
1762 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1763 		uint16 keyIndex = 0;
1764 		off_t nextOffset;
1765 		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1766 
1767 #ifdef DEBUG
1768 		levels++;
1769 #endif
1770 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1771 			if (status == B_OK && _value != NULL)
1772 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
1773 
1774 #ifdef DEBUG
1775 			if (levels != (int32)fHeader->MaxNumberOfLevels())
1776 				DEBUGGER(("levels don't match"));
1777 #endif
1778 			return status;
1779 		} else if (nextOffset == nodeOffset)
1780 			RETURN_ERROR(B_ERROR);
1781 
1782 		nodeOffset = nextOffset;
1783 	}
1784 	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
1785 	RETURN_ERROR(B_ERROR);
1786 }
1787 
1788 
1789 //	#pragma mark -
1790 
1791 
1792 TreeIterator::TreeIterator(BPlusTree *tree)
1793 	:
1794 	fTree(tree),
1795 	fCurrentNodeOffset(BPLUSTREE_NULL),
1796 	fNext(NULL)
1797 {
1798 	tree->AddIterator(this);
1799 }
1800 
1801 
1802 TreeIterator::~TreeIterator()
1803 {
1804 	if (fTree)
1805 		fTree->RemoveIterator(this);
1806 }
1807 
1808 
1809 status_t
1810 TreeIterator::Goto(int8 to)
1811 {
1812 	if (fTree == NULL || fTree->fHeader == NULL)
1813 		RETURN_ERROR(B_BAD_VALUE);
1814 
1815 	// lock access to stream
1816 	ReadLocked locked(fTree->fStream->Lock());
1817 
1818 	off_t nodeOffset = fTree->fHeader->RootNode();
1819 	CachedNode cached(fTree);
1820 	const bplustree_node *node;
1821 
1822 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1823 		// is the node a leaf node?
1824 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1825 			fCurrentNodeOffset = nodeOffset;
1826 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
1827 			fDuplicateNode = BPLUSTREE_NULL;
1828 
1829 			return B_OK;
1830 		}
1831 
1832 		// get the next node offset depending on the direction (and if there
1833 		// are any keys in that node at all)
1834 		off_t nextOffset;
1835 		if (to == BPLUSTREE_END || node->all_key_count == 0)
1836 			nextOffset = node->OverflowLink();
1837 		else {
1838 			if (node->AllKeyLength() > fTree->fNodeSize
1839 				|| (uint32)node->Values() > (uint32)node + fTree->fNodeSize - 8 * node->NumKeys())
1840 				RETURN_ERROR(B_ERROR);
1841 
1842 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
1843 		}
1844 		if (nextOffset == nodeOffset)
1845 			break;
1846 
1847 		nodeOffset = nextOffset;
1848 	}
1849 	FATAL(("%s fails\n", __FUNCTION__));
1850 
1851 	RETURN_ERROR(B_ERROR);
1852 }
1853 
1854 
1855 /**	Iterates through the tree in the specified direction.
1856  *	When it iterates through duplicates, the "key" is only updated for the
1857  *	first entry - if you need to know when this happens, use the "duplicate"
1858  *	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
1859  *	the other duplicates.
1860  *	That's not too nice, but saves the 256 bytes that would be needed to
1861  *	store the last key - if this will ever become an issue, it will be
1862  *	easy to change.
1863  *	The other advantage of this is, that the queries can skip all duplicates
1864  *	at once when they are not relevant to them.
1865  */
1866 
1867 status_t
1868 TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, uint16 maxLength,
1869 	off_t *value, uint16 *duplicate)
1870 {
1871 	if (fTree == NULL)
1872 		return B_INTERRUPTED;
1873 	if (fCurrentNodeOffset == BPLUSTREE_NULL
1874 		&& Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
1875 		RETURN_ERROR(B_ERROR);
1876 
1877 	// if the tree was emptied since the last call
1878 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
1879 		return B_ENTRY_NOT_FOUND;
1880 
1881 	// lock access to stream
1882 	ReadLocked locked(fTree->fStream->Lock());
1883 
1884 	CachedNode cached(fTree);
1885 	const bplustree_node *node;
1886 
1887 	if (fDuplicateNode != BPLUSTREE_NULL) {
1888 		// regardless of traverse direction the duplicates are always presented in
1889 		// the same order; since they are all considered as equal, this shouldn't
1890 		// cause any problems
1891 
1892 		if (!fIsFragment || fDuplicate < fNumDuplicates)
1893 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
1894 		else
1895 			node = NULL;
1896 
1897 		if (node != NULL) {
1898 			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
1899 				// if the node is out of duplicates, we go directly to the next one
1900 				fDuplicateNode = node->RightLink();
1901 				if (fDuplicateNode != BPLUSTREE_NULL
1902 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
1903 					fNumDuplicates = node->CountDuplicates(fDuplicateNode, false);
1904 					fDuplicate = 0;
1905 				}
1906 			}
1907 			if (fDuplicate < fNumDuplicates) {
1908 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++);
1909 				if (duplicate)
1910 					*duplicate = 2;
1911 				return B_OK;
1912 			}
1913 		}
1914 		fDuplicateNode = BPLUSTREE_NULL;
1915 	}
1916 
1917 	off_t savedNodeOffset = fCurrentNodeOffset;
1918 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
1919 		RETURN_ERROR(B_ERROR);
1920 
1921 	if (duplicate)
1922 		*duplicate = 0;
1923 
1924 	fCurrentKey += direction;
1925 
1926 	// is the current key in the current node?
1927 	while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->NumKeys())
1928 			|| (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) {
1929 		fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->RightLink() : node->LeftLink();
1930 
1931 		// are there any more nodes?
1932 		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
1933 			node = cached.SetTo(fCurrentNodeOffset);
1934 			if (!node)
1935 				RETURN_ERROR(B_ERROR);
1936 
1937 			// reset current key
1938 			fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->NumKeys();
1939 		} else {
1940 			// there are no nodes left, so turn back to the last key
1941 			fCurrentNodeOffset = savedNodeOffset;
1942 			fCurrentKey = direction == BPLUSTREE_FORWARD ? node->NumKeys() : -1;
1943 
1944 			return B_ENTRY_NOT_FOUND;
1945 		}
1946 	}
1947 
1948 	if (node->all_key_count == 0)
1949 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
1950 
1951 	uint16 length;
1952 	uint8 *keyStart = node->KeyAt(fCurrentKey, &length);
1953 	if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize
1954 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1955 		fTree->fStream->GetVolume()->Panic();
1956 		RETURN_ERROR(B_BAD_DATA);
1957 	}
1958 
1959 	length = min_c(length, maxLength);
1960 	memcpy(key, keyStart, length);
1961 
1962 	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE)	{
1963 		// terminate string type
1964 		if (length == maxLength)
1965 			length--;
1966 		((char *)key)[length] = '\0';
1967 	}
1968 	*keyLength = length;
1969 
1970 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
1971 
1972 	// duplicate fragments?
1973 	uint8 type = bplustree_node::LinkType(offset);
1974 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) {
1975 		fDuplicateNode = offset;
1976 
1977 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
1978 		if (node == NULL)
1979 			RETURN_ERROR(B_ERROR);
1980 
1981 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
1982 
1983 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
1984 		if (fNumDuplicates) {
1985 			offset = node->DuplicateAt(offset, fIsFragment, 0);
1986 			fDuplicate = 1;
1987 			if (duplicate)
1988 				*duplicate = 1;
1989 		} else {
1990 			// shouldn't happen, but we're dealing here with potentially corrupt disks...
1991 			fDuplicateNode = BPLUSTREE_NULL;
1992 			offset = 0;
1993 		}
1994 	}
1995 	*value = offset;
1996 
1997 	return B_OK;
1998 }
1999 
2000 
2001 /**	This is more or less a copy of BPlusTree::Find() - but it just
2002  *	sets the current position in the iterator, regardless of if the
2003  *	key could be found or not.
2004  */
2005 
2006 status_t
2007 TreeIterator::Find(const uint8 *key, uint16 keyLength)
2008 {
2009 	if (fTree == NULL)
2010 		return B_INTERRUPTED;
2011 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
2012 		|| key == NULL)
2013 		RETURN_ERROR(B_BAD_VALUE);
2014 
2015 	// lock access to stream
2016 	ReadLocked locked(fTree->fStream->Lock());
2017 
2018 	off_t nodeOffset = fTree->fHeader->RootNode();
2019 
2020 	CachedNode cached(fTree);
2021 	const bplustree_node *node;
2022 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2023 		uint16 keyIndex = 0;
2024 		off_t nextOffset;
2025 		status_t status = fTree->FindKey(node, key, keyLength, &keyIndex, &nextOffset);
2026 
2027 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2028 			fCurrentNodeOffset = nodeOffset;
2029 			fCurrentKey = keyIndex - 1;
2030 			fDuplicateNode = BPLUSTREE_NULL;
2031 
2032 			return status;
2033 		} else if (nextOffset == nodeOffset)
2034 			RETURN_ERROR(B_ERROR);
2035 
2036 		nodeOffset = nextOffset;
2037 	}
2038 	RETURN_ERROR(B_ERROR);
2039 }
2040 
2041 
void
TreeIterator::SkipDuplicates()
{
	// Leaves the current duplicate run: Traverse() only hands out further
	// duplicate values while fDuplicateNode is set, so clearing it makes
	// the next call continue with the next key instead.
	fDuplicateNode = BPLUSTREE_NULL;
}
2047 
2048 
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change)
{
	// Keeps the iterator pointing at the same key after the tree changed.
	// "offset" is presumably the node that was modified, "nextOffset" the
	// node that received the upper keys of a split at index "splitAt", and
	// "change" the number of keys inserted (+) or removed (-) at
	// "keyIndex" - TODO confirm against the callers in BPlusTree.

	// only affected if the iterator currently sits in the changed node
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// the node was split; follow into the new node and rebase the
		// key indices when our position moved there.
		// NOTE(review): the node offset is switched even when
		// fCurrentKey < splitAt (i.e. the key stayed in the old node) -
		// verify this is what the callers expect.
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// ToDo: duplicate handling!
}
2071 
2072 
void
TreeIterator::Stop()
{
	// Detaches the iterator from its tree; subsequent Traverse() and
	// Find() calls will return B_INTERRUPTED.
	fTree = NULL;
}
2078 
2079 
#ifdef DEBUG
// Prints the iterator's complete internal state to the debug output.
void
TreeIterator::Dump()
{
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif
2094 
2095 
2096 //	#pragma mark -
2097 
2098 
void
bplustree_node::Initialize()
{
	// Resets the node to a completely empty state: no sibling links, no
	// overflow link (all set to BPLUSTREE_NULL in disk byte order), and
	// no keys.
	left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	all_key_count = 0;
	all_key_length = 0;
}
2106 
2107 
2108 uint8 *
2109 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const
2110 {
2111 	if (index < 0 || index > NumKeys())
2112 		return NULL;
2113 
2114 	uint8 *keyStart = Keys();
2115 	uint16 *keyLengths = KeyLengths();
2116 
2117 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2118 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2119 	if (index > 0)
2120 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2121 
2122 	return keyStart;
2123 }
2124 
2125 
uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// Returns the number of duplicate values stored at "offset" (a link
	// as handed out by the tree, not a plain node offset).
	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem

	if (isFragment) {
		// the lower 10 bits of the link presumably select the fragment
		// within this node (node offsets are 1024-aligned, cf. the
		// FragmentOffset() usage elsewhere); each fragment occupies
		// NUM_FRAGMENT_VALUES + 1 off_t slots, the first of which holds
		// the count - TODO confirm against the link encoding in the header
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		// the count slot is an off_t, truncated here to uint8 (fragments
		// can only hold a handful of values, so that is sufficient)
		return ((off_t *)this)[fragment];
	}
	// for full duplicate nodes the count is kept in the overflow link field
	return OverflowLink();
}
2139 
2140 
off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	// Returns the index'th duplicate value stored at "offset" (a link as
	// handed out by the tree).

	// compute the first off_t slot of the duplicate array: for fragments
	// the lower 10 bits of the link select the fragment (8 slots per
	// fragment - presumably NUM_FRAGMENT_VALUES + 1, see
	// CountDuplicates(); TODO confirm); duplicate nodes keep their array
	// after the first two slots (presumably the two sibling links)
	uint32 start;
	if (isFragment)
		start = 8 * ((uint64)offset & 0x3ff);
	else
		start = 2;

	// skip the leading count slot (+ 1) and pick the requested value
	return ((off_t *)this)[start + 1 + index];
}
2152 
2153 
2154 /**	Although the name suggests it, this function doesn't return the real
2155  *	used fragment count; at least, it can only count to two: it returns
2156  *	0, if there is no fragment used, 1 if there is only one fragment
2157  *	used, and 2 if there are at least 2 fragments used.
2158  */
2159 
2160 uint32
2161 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2162 {
2163 	uint32 used = 0;
2164 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2165 		duplicate_array *array = FragmentAt(i);
2166 		if (array->count > 0 && ++used > 1)
2167 			return used;
2168 	}
2169 	return used;
2170 }
2171 
2172 
#ifdef DEBUG
// Sanity-checks the node's on-disk structure: the key count and total
// key length must fit the node, every key (followed by its value and
// 16 bit length entry) must lie completely within the node, and no
// value may be -1. Returns B_BAD_DATA on corruption, B_OK otherwise.
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
	// note: this only triggers the debugger, it does not return early
	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
		DEBUGGER(("invalid node: key/length count"));

	for (int32 i = 0; i < NumKeys(); i++) {
		uint16 length;
		uint8 *key = KeyAt(i, &length);
		// the key plus its value and length slot must fit into the node,
		// and the key may not exceed the maximum key length
		if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize
			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
			dprintf("node %p, key %ld\n", this, i);
			DEBUGGER(("invalid node: keys corrupted"));
			return B_BAD_DATA;
		}
		// NOTE(review): -1 is presumably never a legal value here and
		// thus marks corruption - confirm
		if (Values()[i] == -1) {
			dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]);
			DEBUGGER(("invalid node: values corrupted"));
			return B_BAD_DATA;
		}
	}
	return B_OK;
}
#endif
2198 
2199 
2200 //	#pragma mark -
2201 
2202 
2203 int32
2204 compareKeys(type_code type, const void *key1, int keyLength1, const void *key2, int keyLength2)
2205 {
2206 	// if one of the keys is NULL, bail out gracefully
2207 	if (key1 == NULL || key2 == NULL) {
2208 //#if USER
2209 //		// that's here to see if it's reasonable to handle this case nicely at all
2210 //		DEBUGGER(("compareKeys() got NULL key!"));
2211 //#endif
2212 		// even if it's most probably a bug in the calling code, we try to
2213 		// give a meaningful result
2214 		if (key1 == NULL && key2 != NULL)
2215 			return 1;
2216 		else if (key2 == NULL && key1 != NULL)
2217 			return -1;
2218 
2219 		return 0;
2220 	}
2221 
2222 	switch (type) {
2223 	    case B_STRING_TYPE:
2224     	{
2225 			int len = min_c(keyLength1, keyLength2);
2226 			int result = strncmp((const char *)key1, (const char *)key2, len);
2227 
2228 			if (result == 0
2229 				&& !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0'))
2230 				result = keyLength1 - keyLength2;
2231 
2232 			return result;
2233 		}
2234 
2235 		case B_SSIZE_T_TYPE:
2236 		case B_INT32_TYPE:
2237 			return *(int32 *)key1 - *(int32 *)key2;
2238 
2239 		case B_SIZE_T_TYPE:
2240 		case B_UINT32_TYPE:
2241 			if (*(uint32 *)key1 == *(uint32 *)key2)
2242 				return 0;
2243 			else if (*(uint32 *)key1 > *(uint32 *)key2)
2244 				return 1;
2245 
2246 			return -1;
2247 
2248 		case B_OFF_T_TYPE:
2249 		case B_INT64_TYPE:
2250 			if (*(int64 *)key1 == *(int64 *)key2)
2251 				return 0;
2252 			else if (*(int64 *)key1 > *(int64 *)key2)
2253 				return 1;
2254 
2255 			return -1;
2256 
2257 		case B_UINT64_TYPE:
2258 			if (*(uint64 *)key1 == *(uint64 *)key2)
2259 				return 0;
2260 			else if (*(uint64 *)key1 > *(uint64 *)key2)
2261 				return 1;
2262 
2263 			return -1;
2264 
2265 		case B_FLOAT_TYPE:
2266 		{
2267 			float result = *(float *)key1 - *(float *)key2;
2268 			if (result == 0.0f)
2269 				return 0;
2270 
2271 			return (result < 0.0f) ? -1 : 1;
2272 		}
2273 
2274 		case B_DOUBLE_TYPE:
2275 		{
2276 			double result = *(double *)key1 - *(double *)key2;
2277 			if (result == 0.0)
2278 				return 0;
2279 
2280 			return (result < 0.0) ? -1 : 1;
2281 		}
2282 	}
2283 
2284 	// if the type is unknown, the entries don't match...
2285 	return -1;
2286 }
2287 
2288