xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision cd552c7a15cc10c36dae8d7439ba1d6c0bb168c5)
1 /* BPlusTree - BFS B+Tree implementation
2  *
3  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
4  * no code but achieves binary compatibility with the on disk format.
5  *
6  * Copyright 2001-2006, Axel Dörfler, axeld@pinc-software.de.
7  * This file may be used under the terms of the MIT License.
8  */
9 
10 
11 #include "Debug.h"
12 #include "BPlusTree.h"
13 #include "Inode.h"
14 #include "Utility.h"
15 
16 #include <util/kernel_cpp.h>
17 #include <util/Stack.h>
18 #include <TypeConstants.h>
19 
20 #include <string.h>
21 #include <stdlib.h>
22 #include <stdio.h>
23 
24 
#ifdef DEBUG
// Debug-only RAII guard: runs the node's on-disk integrity check when it
// is constructed and again when it goes out of scope, so corruption that
// is introduced inside the guarded scope gets reported immediately.
class NodeChecker {
	public:
		NodeChecker(const bplustree_node *node, int32 nodeSize, const char *text)
			:
			fCheckedNode(node),
			fNodeSize(nodeSize),
			fContext(text)
		{
			Check("integrity check failed on construction.");
		}

		~NodeChecker()
		{
			Check("integrity check failed on destruction.");
		}

		void
		Check(const char *message)
		{
			// CheckIntegrity() reports corruption with a status < B_OK
			if (fCheckedNode->CheckIntegrity(fNodeSize) < B_OK)
				dprintf("%s: %s\n", fContext, message);
		}

	private:
		const bplustree_node	*fCheckedNode;
		int32					fNodeSize;
		const char				*fContext;
};
#endif
55 
56 
57 // Node Caching for the BPlusTree class
58 //
59 // With write support, there is the need for a function that allocates new
60 // nodes by either returning empty nodes, or by growing the file's data stream
61 //
62 // !! The CachedNode class assumes that you have properly locked the stream
63 // !! before asking for nodes.
64 //
65 // Note: This code will fail if the block size is smaller than the node size!
66 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
67 // is hard-coded to 1024 bytes, that's not an issue now.
68 
69 void
70 CachedNode::UnsetUnchanged(Transaction &transaction)
71 {
72 	if (fTree == NULL || fTree->fStream == NULL)
73 		return;
74 
75 	if (fNode != NULL) {
76 		void *cache = fTree->fStream->GetVolume()->BlockCache();
77 
78 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
79 		block_cache_put(cache, fBlockNumber);
80 		fNode = NULL;
81 	}
82 }
83 
84 
85 void
86 CachedNode::Unset()
87 {
88 	if (fTree == NULL || fTree->fStream == NULL)
89 		return;
90 
91 	if (fNode != NULL) {
92 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber);
93 		fNode = NULL;
94 	}
95 }
96 
97 
// Maps the node at "offset" read-only into memory and returns it, or NULL
// on failure. Valid offsets are positive, node-size aligned, and lie below
// the tree's maximum size; the header at offset 0 is explicitly excluded
// (use SetToHeader() for that). With "check" set, the node is additionally
// run through the header's CheckNode() sanity test.
98 const bplustree_node *
99 CachedNode::SetTo(off_t offset, bool check)
100 {
101 	if (fTree == NULL || fTree->fStream == NULL) {
102 		REPORT_ERROR(B_BAD_VALUE);
103 		return NULL;
104 	}
105 
	// release any block cached from a previous SetTo()
106 	Unset();
107 
108 	// You can only ask for nodes at valid positions - you can't
109 	// even access the b+tree header with this method (use SetToHeader()
110 	// instead)
111 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
112 		|| offset <= 0
113 		|| (offset % fTree->fNodeSize) != 0)
114 		return NULL;
115 
116 	if (InternalSetTo(NULL, offset) != NULL && check) {
117 		// sanity checks (links, all_key_count)
118 		if (!fTree->fHeader->CheckNode(fNode)) {
			// NOTE(review): fNode stays mapped here even though NULL is
			// returned; the block reference is released by the next Unset()
119 			FATAL(("invalid node (%p) read from offset %Ld, inode at %Ld\n",
120 				fNode, offset, fTree->fStream->ID()));
121 			return NULL;
122 		}
123 	}
124 	return fNode;
125 }
126 
127 
// Writable counterpart of SetTo(): maps the node at "offset" into memory
// as part of "transaction" so it may be modified. Same offset validation
// and optional CheckNode() sanity test as SetTo(); returns NULL on failure.
128 bplustree_node *
129 CachedNode::SetToWritable(Transaction &transaction, off_t offset, bool check)
130 {
131 	if (fTree == NULL || fTree->fStream == NULL) {
132 		REPORT_ERROR(B_BAD_VALUE);
133 		return NULL;
134 	}
135 
	// release any block cached from a previous SetTo()
136 	Unset();
137 
138 	// You can only ask for nodes at valid positions - you can't
139 	// even access the b+tree header with this method (use SetToHeader()
140 	// instead)
141 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
142 		|| offset <= 0
143 		|| (offset % fTree->fNodeSize) != 0)
144 		return NULL;
145 
146 	if (InternalSetTo(&transaction, offset) != NULL && check) {
147 		// sanity checks (links, all_key_count)
148 		if (!fTree->fHeader->CheckNode(fNode)) {
			// NOTE(review): as in SetTo(), the block reference is kept
			// until the next Unset() even though we return NULL
149 			FATAL(("invalid node [%p] read from offset %Ld, inode at %Ld\n",
150 				fNode, offset, fTree->fStream->ID()));
151 			return NULL;
152 		}
153 	}
154 	return fNode;
155 }
156 
157 
158 bplustree_node *
159 CachedNode::MakeWritable(Transaction &transaction)
160 {
161 	if (fNode == NULL)
162 		return NULL;
163 
164 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
165 			fBlockNumber, transaction.ID()) == B_OK)
166 		return fNode;
167 
168 	return NULL;
169 }
170 
171 
172 bplustree_header *
173 CachedNode::MakeWritableHeader(Transaction &transaction)
174 {
175 	return (bplustree_header *)MakeWritable(transaction);
176 }
177 
178 
179 const bplustree_header *
180 CachedNode::SetToHeader()
181 {
182 	if (fTree == NULL || fTree->fStream == NULL) {
183 		REPORT_ERROR(B_BAD_VALUE);
184 		return NULL;
185 	}
186 
187 	Unset();
188 
189 	InternalSetTo(NULL, 0LL);
190 	return (bplustree_header *)fNode;
191 }
192 
193 
194 bplustree_header *
195 CachedNode::SetToWritableHeader(Transaction &transaction)
196 {
197 	if (fTree == NULL || fTree->fStream == NULL) {
198 		REPORT_ERROR(B_BAD_VALUE);
199 		return NULL;
200 	}
201 
202 	Unset();
203 
204 	InternalSetTo(&transaction, 0LL);
205 	return (bplustree_header *)fNode;
206 }
207 
208 
// Common back end for SetTo()/SetToWritable()/SetToHeader(): translates the
// tree-internal "offset" into a disk block via the stream's block runs and
// maps that block through the volume's block cache - writable (joining the
// given transaction) when "transaction" is non-NULL, read-only otherwise.
// On success fNode points into the cached block; on failure it stays NULL.
209 bplustree_node *
210 CachedNode::InternalSetTo(Transaction *transaction, off_t offset)
211 {
212 	fNode = NULL;
213 	fOffset = offset;
214 
215 	off_t fileOffset;
216 	block_run run;
217 	if (offset < fTree->fStream->Size()
218 		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
219 		Volume *volume = fTree->fStream->GetVolume();
220 
		// number of whole blocks between the run's start and our node
221 		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
222 		fBlockNumber = volume->ToBlock(run) + blockOffset;
223 		uint8 *block;
224 
225 		if (transaction != NULL) {
226 			block = (uint8 *)block_cache_get_writable(volume->BlockCache(), fBlockNumber, transaction->ID());
227 			fWritable = true;
228 		} else {
229 			block = (uint8 *)block_cache_get(volume->BlockCache(), fBlockNumber);
230 			fWritable = false;
231 		}
232 
233 		if (block) {
			// the remainder of "offset" within the mapped block; a block
			// may hold more than one node when it is larger than fNodeSize
234 			// the node is somewhere in that block... (confusing offset calculation)
235 			fNode = (bplustree_node *)(block + offset -
236 						(fileOffset + (blockOffset << volume->BlockShift())));
237 		} else
238 			REPORT_ERROR(B_IO_ERROR);
239 	}
240 	return fNode;
241 }
242 
243 
// Removes the node at "offset" from the tree: if it is the very last node
// in the stream, tree and file size are shrunk by one node; otherwise the
// node is prepended to the header's free node list for reuse by Allocate().
// NOTE(review): writes to fNode below, so this CachedNode is assumed to be
// currently set to the (writable) node at "offset" - confirm at callers.
244 status_t
245 CachedNode::Free(Transaction &transaction, off_t offset)
246 {
247 	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
248 		RETURN_ERROR(B_BAD_VALUE);
249 
250 	// ToDo: scan the free nodes list and remove all nodes at the end
251 	// of the tree - perhaps that shouldn't be done every time that
252 	// function is called, perhaps it should be done when the directory
253 	// inode is closed or based on some calculation or whatever...
254 
255 	bplustree_header *header = fTree->fCachedHeader.MakeWritableHeader(transaction);
256 	if (header == NULL)
257 		return B_IO_ERROR;
258 
259 	// if the node is the last one in the tree, we shrink
260 	// the tree and file size by one node
261 	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
262 	if (offset == lastOffset) {
263 		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
264 		if (status < B_OK)
265 			return status;
266 
267 		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
268 		return B_OK;
269 	}
270 
	// the free list is singly linked through left_link; overflow_link is
	// set to BPLUSTREE_FREE to mark the node as free on disk
271 	// add the node to the free nodes list
272 	fNode->left_link = header->free_node_pointer;
273 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
274 
275 	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
276 	return B_OK;
277 }
278 
279 
// Returns a writable, freshly initialized node in "*_node" and its tree
// offset in "*_offset" - either recycled from the header's free node list,
// or newly appended to the stream (growing maximum_size by one node).
280 status_t
281 CachedNode::Allocate(Transaction &transaction, bplustree_node **_node, off_t *_offset)
282 {
283 	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
284 		RETURN_ERROR(B_BAD_VALUE);
285 
286 	Unset();
287 
288 	bplustree_header *header;
289 	status_t status;
290 
291 	// if there are any free nodes, recycle them
292 	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
293 		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
294 		if (header == NULL)
295 			return B_IO_ERROR;
296 
297 		*_offset = header->FreeNode();
298 		*_node = fNode;
299 
		// the free list is chained through left_link, so it must be read
		// before Initialize() wipes the node below
300 		// set new free node pointer
301 		header->free_node_pointer = fNode->left_link;
302 		fNode->Initialize();
303 		return B_OK;
304 	}
305 
306 	// allocate space for a new node
307 	Inode *stream = fTree->fStream;
308 	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
309 		return status;
310 
311 	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
312 	if (header == NULL)
313 		return B_IO_ERROR;
314 
315 	// the maximum_size has to be changed before the call to SetTo() - or
316 	// else it will fail because the requested node is out of bounds
317 	off_t offset = header->MaximumSize();
318 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() + fTree->fNodeSize);
319 
320 	if (SetToWritable(transaction, offset, false) != NULL) {
321 		fNode->Initialize();
322 
323 		*_offset = offset;
324 		*_node = fNode;
325 		return B_OK;
326 	}
327 
328 	// revert header size to old value
329 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize() - fTree->fNodeSize);
330 	RETURN_ERROR(B_ERROR);
331 }
332 
333 
334 //	#pragma mark -
335 
336 
337 BPlusTree::BPlusTree(Transaction &transaction, Inode *stream, int32 nodeSize)
338 	:
339 	fStream(NULL),
340 	fHeader(NULL),
341 	fCachedHeader(this)
342 {
343 	SetTo(transaction, stream);
344 }
345 
346 
347 BPlusTree::BPlusTree(Inode *stream)
348 	:
349 	fStream(NULL),
350 	fHeader(NULL),
351 	fCachedHeader(this)
352 {
353 	SetTo(stream);
354 }
355 
356 
357 BPlusTree::BPlusTree()
358 	:
359 	fStream(NULL),
360 	fHeader(NULL),
361 	fCachedHeader(this),
362 	fNodeSize(BPLUSTREE_NODE_SIZE),
363 	fAllowDuplicates(true),
364 	fStatus(B_NO_INIT)
365 {
366 }
367 
368 
369 BPlusTree::~BPlusTree()
370 {
371 	// if there are any TreeIterators left, we need to stop them
372 	// (can happen when the tree's inode gets deleted while
373 	// traversing the tree - a TreeIterator doesn't lock the inode)
374 	if (fIteratorLock.Lock() < B_OK)
375 		return;
376 
377 	TreeIterator *iterator = NULL;
378 	while ((iterator = fIterators.Next(iterator)) != NULL)
379 		iterator->Stop();
380 
381 	fIteratorLock.Unlock();
382 }
383 
384 
385 /** Create a new B+Tree on the specified stream */
386 
// Initializes both the on-disk and the in-memory representation: grows the
// stream to two nodes (header + empty root) if needed, fills in all header
// fields and initializes the root node. Sets and returns fStatus.
387 status_t
388 BPlusTree::SetTo(Transaction &transaction, Inode *stream, int32 nodeSize)
389 {
390 	// initializes in-memory B+Tree
391 
392 	fStream = stream;
393 
394 	bplustree_header *header = fCachedHeader.SetToWritableHeader(transaction);
395 	if (header == NULL) {
396 		// allocate space for new header + node!
397 		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
398 		if (fStatus < B_OK)
399 			RETURN_ERROR(fStatus);
400 
401 		header = fCachedHeader.SetToWritableHeader(transaction);
402 		if (header == NULL)
403 			RETURN_ERROR(fStatus = B_ERROR);
404 	}
405 
406 	fHeader = header;
	// duplicates are allowed for index directories (except one that is its
	// own parent) or when the stream carries the S_ALLOW_DUPS flag
407 	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
408 						&& stream->BlockRun() != stream->Parent())
409 						|| (stream->Mode() & S_ALLOW_DUPS) != 0;
410 
411 	fNodeSize = nodeSize;
412 
	// all on-disk fields are stored in BFS (little) endian order
413 	// initialize b+tree header
414  	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
415  	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
416  	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
417  	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
418  	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
419  	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
420  	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
421 
422 	// initialize b+tree root node
423 	CachedNode cached(this);
424 	cached.SetToWritable(transaction, header->RootNode(), false);
425 	if (cached.Node() == NULL)
426 		RETURN_ERROR(B_IO_ERROR);
427 
428 	cached.Node()->Initialize();
429 
430 	return fStatus = B_OK;
431 }
432 
433 
// Attaches this object to the existing on-disk B+tree in "stream": maps
// the header, validates it (magic, alignment, link sanity, key type vs.
// the inode's index mode) and caches node size and duplicate policy.
// Sets and returns fStatus.
434 status_t
435 BPlusTree::SetTo(Inode *stream)
436 {
437 	if (stream == NULL)
438 		RETURN_ERROR(fStatus = B_BAD_VALUE);
439 
440 	// get on-disk B+Tree header
441 
442 	fCachedHeader.Unset();
443 	fStream = stream;
444 
445 	fHeader = fCachedHeader.SetToHeader();
446 	if (fHeader == NULL)
447 		RETURN_ERROR(fStatus = B_NO_INIT);
448 
449 	// is header valid?
450 
	// size mismatch is only logged, not treated as fatal
451 	if (fHeader->MaximumSize() != stream->Size()) {
452 		FATAL(("B+tree header size %Ld doesn't fit file size %Ld!\n",
453 			fHeader->MaximumSize(), stream->Size()));
454 		// we can't change the header since we don't have a transaction
455 		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
456 	}
457 	if (fHeader->Magic() != BPLUSTREE_MAGIC
458 		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
459 		|| !fHeader->IsValidLink(fHeader->RootNode())
460 		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
461 #ifdef DEBUG
462 		dump_bplustree_header(fHeader);
463 		dump_block((const char *)fHeader, 128);
464 #endif
465 		RETURN_ERROR(fStatus = B_BAD_DATA);
466 	}
467 
468 	fNodeSize = fHeader->NodeSize();
469 
470 	{
		// toMode[] maps each BPLUSTREE_*_TYPE value to its S_*_INDEX flag
471 		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
472 						   S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
473 		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
474 						   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
475 
		// NOTE: "&&" binds tighter than "||", so the middle term reads:
		// (this is an index dir) && (its key type doesn't match the header);
		// the range check above also guards the toMode[] access
476 		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
477 			|| (stream->Mode() & S_INDEX_DIR) && toMode[fHeader->DataType()] != mode
478 			|| !stream->IsContainer()) {
479 			D(	dump_bplustree_header(fHeader);
480 				dump_inode(&stream->Node());
481 			);
482 			RETURN_ERROR(fStatus = B_BAD_TYPE);
483 		}
484 
485 		// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
486 		// in the original BFS code - we will honour it nevertheless
487 		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
488 							&& stream->BlockRun() != stream->Parent())
489 							|| (stream->Mode() & S_ALLOW_DUPS) != 0;
490 	}
491 
	// final sanity check: the root node must be readable
492 	CachedNode cached(this, fHeader->RootNode());
493 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
494 }
495 
496 
497 status_t
498 BPlusTree::InitCheck()
499 {
500 	return fStatus;
501 }
502 
503 
504 int32
505 BPlusTree::TypeCodeToKeyType(type_code code)
506 {
507 	switch (code) {
508 		case B_STRING_TYPE:
509 			return BPLUSTREE_STRING_TYPE;
510 		case B_SSIZE_T_TYPE:
511 		case B_INT32_TYPE:
512 			return BPLUSTREE_INT32_TYPE;
513 		case B_SIZE_T_TYPE:
514 		case B_UINT32_TYPE:
515 			return BPLUSTREE_UINT32_TYPE;
516 		case B_OFF_T_TYPE:
517 		case B_INT64_TYPE:
518 			return BPLUSTREE_INT64_TYPE;
519 		case B_UINT64_TYPE:
520 			return BPLUSTREE_UINT64_TYPE;
521 		case B_FLOAT_TYPE:
522 			return BPLUSTREE_FLOAT_TYPE;
523 		case B_DOUBLE_TYPE:
524 			return BPLUSTREE_DOUBLE_TYPE;
525 	}
526 	return -1;
527 }
528 
529 
530 int32
531 BPlusTree::ModeToKeyType(mode_t mode)
532 {
533 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
534 				   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
535 		case S_INT_INDEX:
536 			return BPLUSTREE_INT32_TYPE;
537 		case S_UINT_INDEX:
538 			return BPLUSTREE_UINT32_TYPE;
539 		case S_LONG_LONG_INDEX:
540 			return BPLUSTREE_INT64_TYPE;
541 		case S_ULONG_LONG_INDEX:
542 			return BPLUSTREE_UINT64_TYPE;
543 		case S_FLOAT_INDEX:
544 			return BPLUSTREE_FLOAT_TYPE;
545 		case S_DOUBLE_INDEX:
546 			return BPLUSTREE_DOUBLE_TYPE;
547 		case S_STR_INDEX:
548 		default:
549 			// default is for standard directories
550 			return BPLUSTREE_STRING_TYPE;
551 	}
552 }
553 
554 
555 //	#pragma mark -
556 
557 
558 void
559 BPlusTree::UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt,
560 	int8 change)
561 {
562 	// Although every iterator which is affected by this update currently
563 	// waits on a semaphore, other iterators could be added/removed at
564 	// any time, so we need to protect this loop
565 	if (fIteratorLock.Lock() < B_OK)
566 		return;
567 
568 	TreeIterator *iterator = NULL;
569 	while ((iterator = fIterators.Next(iterator)) != NULL)
570 		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);
571 
572 	fIteratorLock.Unlock();
573 }
574 
575 
576 void
577 BPlusTree::AddIterator(TreeIterator *iterator)
578 {
579 	if (fIteratorLock.Lock() < B_OK)
580 		return;
581 
582 	fIterators.Add(iterator);
583 
584 	fIteratorLock.Unlock();
585 }
586 
587 
588 void
589 BPlusTree::RemoveIterator(TreeIterator *iterator)
590 {
591 	if (fIteratorLock.Lock() < B_OK)
592 		return;
593 
594 	fIterators.Remove(iterator);
595 
596 	fIteratorLock.Unlock();
597 }
598 
599 
600 int32
601 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
602 {
603 	type_code type = 0;
604 	switch (fHeader->data_type) {
605 	    case BPLUSTREE_STRING_TYPE:
606 	    	type = B_STRING_TYPE;
607 	    	break;
608 		case BPLUSTREE_INT32_TYPE:
609 	    	type = B_INT32_TYPE;
610 	    	break;
611 		case BPLUSTREE_UINT32_TYPE:
612 	    	type = B_UINT32_TYPE;
613 	    	break;
614 		case BPLUSTREE_INT64_TYPE:
615 	    	type = B_INT64_TYPE;
616 	    	break;
617 		case BPLUSTREE_UINT64_TYPE:
618 	    	type = B_UINT64_TYPE;
619 	    	break;
620 		case BPLUSTREE_FLOAT_TYPE:
621 	    	type = B_FLOAT_TYPE;
622 	    	break;
623 		case BPLUSTREE_DOUBLE_TYPE:
624 	    	type = B_DOUBLE_TYPE;
625 	    	break;
626 	}
627    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
628 }
629 
630 
// Binary-searches "node" for "key". On an exact match returns B_OK, with
// the key's position in "*_index" and its value in "*_next". Otherwise
// returns B_ENTRY_NOT_FOUND with "*_index" set to the insert position and
// "*_next" set to the link to follow (the overflow link past the last
// key). Panics the volume and returns B_BAD_DATA when a key entry points
// outside the node.
631 status_t
632 BPlusTree::FindKey(const bplustree_node *node, const uint8 *key, uint16 keyLength,
633 	uint16 *_index, off_t *_next)
634 {
635 #ifdef DEBUG
636 		NodeChecker checker(node, fNodeSize, "find");
637 #endif
638 
639 	if (node->all_key_count == 0) {
640 		if (_index)
641 			*_index = 0;
642 		if (_next)
643 			*_next = node->OverflowLink();
644 		return B_ENTRY_NOT_FOUND;
645 	}
646 
647 	off_t *values = node->Values();
	// saveIndex tracks where the key would belong if it is not found
648 	int16 saveIndex = -1;
649 
650 	// binary search in the key array
651 	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
652 		uint16 i = (first + last) >> 1;
653 
654 		uint16 searchLength;
655 		uint8 *searchKey = node->KeyAt(i, &searchLength);
		// bounds check: key data, value and length entry must fit the node
656 		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
657 			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
658 			fStream->GetVolume()->Panic();
659 			RETURN_ERROR(B_BAD_DATA);
660 		}
661 
662 		int32 cmp = CompareKeys(key, keyLength, searchKey, searchLength);
663 		if (cmp < 0) {
664 			last = i - 1;
665 			saveIndex = i;
666 		} else if (cmp > 0) {
667 			saveIndex = first = i + 1;
668 		} else {
669 			if (_index)
670 				*_index = i;
671 			if (_next)
672 				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
673 			return B_OK;
674 		}
675 	}
676 
677 	if (_index)
678 		*_index = saveIndex;
679 	if (_next) {
		// past the last key the overflow link is the next node to visit
680 		if (saveIndex == node->NumKeys())
681 			*_next = node->OverflowLink();
682 		else
683 			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
684 	}
685 	return B_ENTRY_NOT_FOUND;
686 }
687 
688 
689 /**	Prepares the stack to contain all nodes that were passed while
690  *	following the key, from the root node to the leaf node that could
691  *	or should contain that key.
692  */
693 
694 status_t
695 BPlusTree::SeekDown(Stack<node_and_key> &stack, const uint8 *key, uint16 keyLength)
696 {
697 	// set the root node to begin with
698 	node_and_key nodeAndKey;
699 	nodeAndKey.nodeOffset = fHeader->RootNode();
700 
701 	CachedNode cached(this);
702 	const bplustree_node *node;
703 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// leaf nodes are the only ones without an overflow link
704 		// if we are already on leaf level, we're done
705 		if (node->OverflowLink() == BPLUSTREE_NULL) {
706 			// note that the keyIndex is not properly set here (but it's not
707 			// needed in the calling functions anyway)!
708 			nodeAndKey.keyIndex = 0;
709 			stack.Push(nodeAndKey);
710 			return B_OK;
711 		}
712 
713 		off_t nextOffset;
714 		status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex, &nextOffset);
715 
		// a node linking to itself would loop forever - treat as corruption
716 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
717 			RETURN_ERROR(B_ERROR);
718 
719 		// put the node offset & the correct keyIndex on the stack
720 		stack.Push(nodeAndKey);
721 
722 		nodeAndKey.nodeOffset = nextOffset;
723 	}
724 
725 	FATAL(("BPlusTree::SeekDown() could not open node %Ld\n", nodeAndKey.nodeOffset));
726 	return B_ERROR;
727 }
728 
729 
730 /**	This will find a free duplicate fragment in the given bplustree_node.
731  *	The CachedNode will be set to the writable fragment on success.
732  */
733 
// Walks all values of "node" looking for a duplicate-fragment node that
// still has an unused fragment slot. On success, "*_offset" is the
// fragment node's offset, "*_index" the free slot, and "*_fragment" the
// writable fragment node; returns B_ENTRY_NOT_FOUND when none is free.
734 status_t
735 BPlusTree::FindFreeDuplicateFragment(Transaction &transaction,
736 	const bplustree_node *node, CachedNode &cached,
737 	off_t *_offset, bplustree_node **_fragment, uint32 *_index)
738 {
739 	off_t *values = node->Values();
740 	for (int32 i = 0; i < node->NumKeys(); i++) {
741 		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
742 
743 		// does the value link to a duplicate fragment?
744 		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
745 			continue;
746 
747 		const bplustree_node *fragment = cached.SetTo(
748 			bplustree_node::FragmentOffset(value), false);
749 		if (fragment == NULL) {
750 			FATAL(("Could not get duplicate fragment at %Ld\n", value));
751 			continue;
752 		}
753 
754 		// see if there is some space left for us
755 		uint32 num = bplustree_node::MaxFragments(fNodeSize);
756 		for (uint32 j = 0; j < num; j++) {
757 			duplicate_array *array = fragment->FragmentAt(j);
758 
			// a count of 0 marks an unused fragment slot
759 			if (array->count == 0) {
760 				// found an unused fragment
761 				*_fragment = cached.MakeWritable(transaction);
762 				if (*_fragment == NULL)
763 					return B_IO_ERROR;
764 
765 				*_offset = bplustree_node::FragmentOffset(value);
766 				*_index = j;
767 				return B_OK;
768 			}
769 		}
770 	}
771 	return B_ENTRY_NOT_FOUND;
772 }
773 
774 
// Inserts "value" as an additional value for the key at "index" in "node".
// Depending on how many duplicates the key already has, the value ends up
// 1) in a shared duplicate-fragment node (up to NUM_FRAGMENT_VALUES per
// key), 2) in a dedicated duplicate node the fragment is promoted to, or
// 3) in a new duplicate node chained behind full ones. "cached" must hold
// "node"; it is made writable whenever the key's main value link changes.
775 status_t
776 BPlusTree::InsertDuplicate(Transaction &transaction, CachedNode &cached,
777 	const bplustree_node *node, uint16 index, off_t value)
778 {
779 	CachedNode cachedDuplicate(this);
780 	off_t *values = node->Values();
781 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
782 	status_t status;
783 	off_t offset;
784 
785 	if (bplustree_node::IsDuplicate(oldValue)) {
786 		// If it's a duplicate fragment, try to insert it into that, or if it
787 		// doesn't fit anymore, create a new duplicate node
788 
789 		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
790 			bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
791 				bplustree_node::FragmentOffset(oldValue), false);
792 			if (duplicate == NULL)
793 				return B_IO_ERROR;
794 
795 			duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));
			// an existing fragment must hold 1..NUM_FRAGMENT_VALUES entries
796 			if (array->count > NUM_FRAGMENT_VALUES
797 				|| array->count < 1) {
798 				FATAL(("insertDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",
799 					bplustree_node::FragmentIndex(oldValue),
800 					bplustree_node::FragmentOffset(oldValue),
801 					array->count));
802 				return B_BAD_DATA;
803 			}
804 
805 			if (array->count < NUM_FRAGMENT_VALUES) {
806 				array->Insert(value);
807 			} else {
808 				// test if the fragment will be empty if we remove this key's values
809 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
810 					// the node will be empty without our values, so let us
811 					// reuse it as a duplicate node
812 					offset = bplustree_node::FragmentOffset(oldValue);
813 
814 					memmove(duplicate->DuplicateArray(), array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
815 					duplicate->left_link = duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(
816 						(uint64)BPLUSTREE_NULL);
817 
818 					array = duplicate->DuplicateArray();
819 					array->Insert(value);
820 				} else {
821 					// create a new duplicate node
822 
823 					cachedDuplicate.UnsetUnchanged(transaction);
824 						// the old duplicate has not been touched, so we can reuse it
825 
826 					bplustree_node *newDuplicate;
827 					status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
828 					if (status < B_OK)
829 						RETURN_ERROR(status);
830 
831 					// copy the array from the fragment node to the duplicate node
832 					// and free the old entry (by zero'ing all values)
833 					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(array->count);
834 					memcpy(&newDuplicate->all_key_count, &array->values[0],
835 						array->count * sizeof(off_t));
836 					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
837 
838 					array = newDuplicate->DuplicateArray();
839 					array->Insert(value);
840 				}
841 
842 				// update the main pointer to link to a duplicate node
843 				if (cached.MakeWritable(transaction) == NULL)
844 					return B_IO_ERROR;
845 
846 				values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
847 					BPLUSTREE_DUPLICATE_NODE, offset));
848 			}
849 
850 			return B_OK;
851 		}
852 
853 		// Put the value into a dedicated duplicate node
854 
855 		// search for free space in the duplicate nodes of that key
856 		duplicate_array *array;
857 		const bplustree_node *duplicate;
858 		off_t duplicateOffset;
859 		do {
			// NOTE(review): after the first iteration "oldValue" is a plain
			// right-link offset, not a fragment link; FragmentOffset() only
			// masks the low bits, which are zero for node-aligned offsets -
			// confirm this invariant holds for all node sizes
860 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
861 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
862 			if (duplicate == NULL)
863 				return B_IO_ERROR;
864 
865 			array = duplicate->DuplicateArray();
866 			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				// NOTE(review): message says "removeDuplicate" but this is
				// the insert path
867 				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
868 					duplicateOffset, array->count));
869 				return B_BAD_DATA;
870 			}
871 		} while (array->count >= NUM_DUPLICATE_VALUES
872 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
873 
874 		bplustree_node *writableDuplicate = cachedDuplicate.MakeWritable(transaction);
875 		if (writableDuplicate == NULL)
876 			return B_IO_ERROR;
877 
878 		if (array->count < NUM_DUPLICATE_VALUES) {
879 			array = writableDuplicate->DuplicateArray();
880 			array->Insert(value);
881 		} else {
882 			// no space left - add a new duplicate node
883 
884 			bplustree_node *newDuplicate;
885 			status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
886 			if (status < B_OK)
887 				RETURN_ERROR(status);
888 
889 			// link the two nodes together
890 			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
891 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
892 
893 			array = newDuplicate->DuplicateArray();
894 			array->count = 0;
895 			array->Insert(value);
896 		}
897 		return B_OK;
898 	}
899 
	// The key currently has a single value: turn it into a duplicate
	// fragment holding the old value and the new one.
900 	// Search for a free duplicate fragment or create a new one
901 	// to insert the duplicate value into
902 
903 	uint32 fragmentIndex = 0;
904 	bplustree_node *fragment;
905 	if (FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
906 			&offset, &fragment, &fragmentIndex) != B_OK) {
907 		// allocate a new duplicate fragment node
908 		if ((status = cachedDuplicate.Allocate(transaction, &fragment, &offset)) < B_OK)
909 			RETURN_ERROR(status);
910 
911 		memset(fragment, 0, fNodeSize);
912 	}
913 
914 	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
915 	array->Insert(oldValue);
916 	array->Insert(value);
917 
918 	if (cached.MakeWritable(transaction) == NULL)
919 		return B_IO_ERROR;
920 
921 	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
922 		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
923 
924 	return B_OK;
925 }
926 
927 
// Inserts the "key"/"value" pair at position "index" into "node", shifting
// the key data, the key-length index and the value array in place. The
// caller must have made sure that the node is writable and has enough free
// space; an out-of-range index is silently ignored.
928 void
929 BPlusTree::InsertKey(bplustree_node *node, uint16 index, uint8 *key, uint16 keyLength,
930 	off_t value)
931 {
932 	// should never happen, but who knows?
933 	if (index > node->NumKeys())
934 		return;
935 
936 	off_t *values = node->Values();
937 	uint16 *keyLengths = node->KeyLengths();
938 	uint8 *keys = node->Keys();
939 
	// bump the counts first: Values()/KeyLengths() positions depend on
	// all_key_count/all_key_length, so old and new pointers differ
940 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
941 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() + keyLength);
942 
943 	off_t *newValues = node->Values();
944 	uint16 *newKeyLengths = node->KeyLengths();
945 
946 	// move values and copy new value into them
947 	memmove(newValues + index + 1, values + index, sizeof(off_t) * (node->NumKeys() - 1 - index));
948 	memmove(newValues, values, sizeof(off_t) * index);
949 
950 	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
951 
	// the key length index holds cumulative end offsets, so every entry
	// after the insertion point grows by keyLength
952 	// move and update key length index
953 	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
954 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
955 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
956 	}
957 	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
958 
959 	int32 keyStart;
960 	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
961 		+ (keyStart = index > 0 ? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
962 
963 	// move keys and copy new key into them
964 	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
965 	int32 size = node->AllKeyLength() - length;
966 	if (size > 0)
967 		memmove(keys + length, keys + length - keyLength, size);
968 
969 	memcpy(keys + keyStart, key, keyLength);
970 }
971 
972 
973 /**	Splits the \a node into two halves - the other half will be put into \a other.
974  *	It also takes care to create a new overflow link if the node to split is an
975  *	index node.
976  */
977 
978 status_t
979 BPlusTree::SplitNode(bplustree_node *node, off_t nodeOffset, bplustree_node *other,
980 	off_t otherOffset, uint16 *_keyIndex, uint8 *key, uint16 *_keyLength, off_t *_value)
981 {
982 	if (*_keyIndex > node->NumKeys() + 1)
983 		return B_BAD_VALUE;
984 
985 	uint16 *inKeyLengths = node->KeyLengths();
986 	off_t *inKeyValues = node->Values();
987 	uint8 *inKeys = node->Keys();
988 	uint8 *outKeys = other->Keys();
989 	int32 keyIndex = *_keyIndex;	// can become less than zero!
990 
991 	// how many keys will fit in one (half) page?
992 	// that loop will find the answer to this question and
993 	// change the key lengths indices for their new home
994 
995 	// "bytes" is the number of bytes written for the new key,
996 	// "bytesBefore" are the bytes before that key
997 	// "bytesAfter" are the bytes after the new key, if any
998 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
999 
1000 	size_t size = fNodeSize >> 1;
1001 	int32 out, in;
1002 	for (in = out = 0; in < node->NumKeys() + 1;) {
1003 		if (!bytes)
1004 			bytesBefore = in > 0 ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1005 
1006 		if (in == keyIndex && !bytes) {
1007 			bytes = *_keyLength;
1008 		} else {
1009 			if (keyIndex < out)
1010 				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - bytesBefore;
1011 
1012 			in++;
1013 		}
1014 		out++;
1015 
1016 		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
1017 				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1018 			// we have found the number of keys in the new node!
1019 			break;
1020 		}
1021 	}
1022 
1023 	// if the new key was not inserted, set the length of the keys
1024 	// that can be copied directly
1025 	if (keyIndex >= out && in > 0)
1026 		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1027 
1028 	if (bytesBefore < 0 || bytesAfter < 0)
1029 		return B_BAD_DATA;
1030 
1031 	other->left_link = node->left_link;
1032 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1033 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter);
1034 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1035 
1036 	uint16 *outKeyLengths = other->KeyLengths();
1037 	off_t *outKeyValues = other->Values();
1038 	int32 keys = out > keyIndex ? keyIndex : out;
1039 
1040 	if (bytesBefore) {
1041 		// copy the keys
1042 		memcpy(outKeys, inKeys, bytesBefore);
1043 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1044 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1045 	}
1046 	if (bytes) {
1047 		// copy the newly inserted key
1048 		memcpy(outKeys + bytesBefore, key, bytes);
1049 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1050 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1051 
1052 		if (bytesAfter) {
1053 			// copy the keys after the new key
1054 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, bytesAfter);
1055 			keys = out - keyIndex - 1;
1056 			for (int32 i = 0;i < keys;i++) {
1057 				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1058 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) + bytes);
1059 			}
1060 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, keys * sizeof(off_t));
1061 		}
1062 	}
1063 
1064 	// if the new key was already inserted, we shouldn't use it again
1065 	if (in != out)
1066 		keyIndex--;
1067 
1068 	int32 total = bytesBefore + bytesAfter;
1069 
1070 	// these variables are for the key that will be returned
1071 	// to the parent node
1072 	uint8 *newKey = NULL;
1073 	uint16 newLength;
1074 	bool newAllocated = false;
1075 
1076 	// If we have split an index node, we have to drop the first key
1077 	// of the next node (which can also be the new key to insert).
1078 	// The dropped key is also the one which has to be inserted in
1079 	// the parent node, so we will set the "newKey" already here.
1080 	if (node->OverflowLink() != BPLUSTREE_NULL) {
1081 		if (in == keyIndex) {
1082 			newKey = key;
1083 			newLength = *_keyLength;
1084 
1085 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1086 			keyIndex--;
1087 		} else {
1088 			// If a key is dropped (is not the new key), we have to copy
1089 			// it, because it would be lost if not.
1090 			uint8 *droppedKey = node->KeyAt(in, &newLength);
1091 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
1092 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1093 				fStream->GetVolume()->Panic();
1094 				RETURN_ERROR(B_BAD_DATA);
1095 			}
1096 			newKey = (uint8 *)malloc(newLength);
1097 			if (newKey == NULL)
1098 				return B_NO_MEMORY;
1099 			memcpy(newKey, droppedKey, newLength);
1100 
1101 			other->overflow_link = inKeyValues[in];
1102 			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1103 		}
1104 	}
1105 
1106 	// and now the same game for the other page and the rest of the keys
1107 	// (but with memmove() instead of memcpy(), because they may overlap)
1108 
1109 	bytesBefore = bytesAfter = bytes = 0;
1110 	out = 0;
1111 	int32 skip = in;
1112 	while (in < node->NumKeys() + 1) {
1113 		if (in == keyIndex && !bytes) {
1114 			// it's enough to set bytesBefore once here, because we do
1115 			// not need to know the exact length of all keys in this
1116 			// loop
1117 			bytesBefore = in > skip ? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1118 			bytes = *_keyLength;
1119 		} else {
1120 			if (in < node->NumKeys()) {
1121 				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1122 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1123 
1124 				if (bytes) {
1125 					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1126 						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1127 
1128 					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1129 						- bytesBefore - bytes;
1130 				}
1131 			}
1132 			in++;
1133 		}
1134 
1135 		out++;
1136 
1137 		// break out when all keys are done
1138 		if (in > node->NumKeys() && keyIndex < in)
1139 			break;
1140 	}
1141 
1142 	// adjust the byte counts (since we were a bit lazy in the loop)
1143 	if (keyIndex >= in && keyIndex - skip < out)
1144 		bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - bytesBefore - total;
1145 	else if (keyIndex < skip)
1146 		bytesBefore = node->AllKeyLength() - total;
1147 
1148 	if (bytesBefore < 0 || bytesAfter < 0)
1149 		return B_BAD_DATA;
1150 
1151 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1152 		// right link, and overflow link can stay the same
1153 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter);
1154 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out - 1);
1155 
1156 	// array positions have changed
1157 	outKeyLengths = node->KeyLengths();
1158 	outKeyValues = node->Values();
1159 
1160 	// move the keys in the old node: the order is important here,
1161 	// because we don't want to overwrite any contents
1162 
1163 	keys = keyIndex <= skip ? out : keyIndex - skip;
1164 	keyIndex -= skip;
1165 
1166 	if (bytesBefore)
1167 		memmove(inKeys, inKeys + total, bytesBefore);
1168 	if (bytesAfter)
1169 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, bytesAfter);
1170 
1171 	if (bytesBefore)
1172 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1173 	in = out - keyIndex - 1;
1174 	if (bytesAfter)
1175 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, in * sizeof(uint16));
1176 
1177 	if (bytesBefore)
1178 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1179 	if (bytesAfter)
1180 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, in * sizeof(off_t));
1181 
1182 	if (bytes) {
1183 		// finally, copy the newly inserted key (don't overwrite anything)
1184 		memcpy(inKeys + bytesBefore, key, bytes);
1185 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1186 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1187 	}
1188 
1189 	// Prepare the key that will be inserted in the parent node which
1190 	// is either the dropped key or the last of the other node.
1191 	// If it's the dropped key, "newKey" was already set earlier.
1192 
1193 	if (newKey == NULL)
1194 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1195 
1196 	memcpy(key, newKey, newLength);
1197 	*_keyLength = newLength;
1198 	*_value = otherOffset;
1199 
1200 	if (newAllocated)
1201 		free(newKey);
1202 
1203 	return B_OK;
1204 }
1205 
1206 
1207 /**	This inserts a key into the tree. The changes made to the tree will
1208  *	all be part of the \a transaction.
1209  */
1210 
status_t
BPlusTree::Insert(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %Ld!\n", value);
#endif

	// lock access to stream
	WriteLocked locked(fStream->Lock());

	// SeekDown() records the path from the root down to the leaf on the
	// stack, so we can walk it back up when nodes have to be split
	Stack<node_and_key> stack;
	if (SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	// local, NUL-terminated copy of the key; SplitNode() overwrites this
	// buffer (and "keyLength"/"value") with the separator key that has to
	// be inserted into the parent node on the next loop iteration
	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node *node;

	// walk from the leaf up towards the root, splitting nodes as needed
	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = InsertDuplicate(transaction, cached, node, nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				RETURN_ERROR(B_NAME_IN_USE);
			}
		}

		bplustree_node *writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
		if (int32(round_up(sizeof(bplustree_node) + writableNode->AllKeyLength() + keyLength)
			+ (writableNode->NumKeys() + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
		{
			// it fits - insert here and we're done, no split needed
			InsertKey(writableNode, nodeAndKey.keyIndex, keyBuffer, keyLength, value);
			UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_node *root;
				status_t status = cachedNewRoot.Allocate(transaction, &root, &newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuild next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node *other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other, &otherOffset);
			if (status < B_OK) {
				// NOTE(review): "newRoot" may still be BPLUSTREE_NULL here -
				// presumably Free() tolerates that; verify
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			// SplitNode() also updates keyBuffer/keyLength/value to the
			// pair that has to be inserted into the parent node
			if (SplitNode(writableNode, nodeAndKey.nodeOffset, other, otherOffset,
					&nodeAndKey.keyIndex, keyBuffer, &keyLength, &value) < B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			UpdateIterators(nodeAndKey.nodeOffset, otherOffset, nodeAndKey.keyIndex,
				writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction, other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node *root = cachedNewRoot.Node();

				InsertKey(root, 0, keyBuffer, keyLength, writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(nodeAndKey.nodeOffset);

				bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
		}
	}
	RETURN_ERROR(B_ERROR);
}
1342 
1343 
1344 /**	Removes the duplicate index/value pair from the tree.
1345  *	It's part of the private tree interface.
1346  */
1347 
status_t
BPlusTree::RemoveDuplicate(Transaction &transaction, const bplustree_node *node,
	CachedNode &cached, uint16 index, off_t value)
{
	// the value stored in the leaf is an encoded link to either a duplicate
	// fragment or a duplicate node (see bplustree_node::LinkType())
	off_t *values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);

	CachedNode cachedDuplicate(this);
	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
		duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));

		// sanity check the on-disk array size before touching it
		if (array->count > NUM_FRAGMENT_VALUES
			|| array->count < 1) {
			FATAL(("removeDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",
				bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count));
			return B_BAD_DATA;
		}
		if (!array->Remove(value)) {
			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
				value, duplicateOffset));
			return B_ENTRY_NOT_FOUND;
		}

		// remove the array from the fragment node if it is empty
		if (array->count == 1) {
			// set the link to the remaining value
			// NOTE(review): assumes MakeWritable() keeps the block at the
			// same address, since the old "values" pointer is reused below -
			// verify against the block cache semantics
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free just the array
			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
				status_t status = cachedDuplicate.Free(transaction, duplicateOffset);
				if (status < B_OK)
					return status;
			} else
				array->count = 0;
		}
		return B_OK;
	}

	// Remove value from a duplicate node!

	duplicate_array *array = NULL;

	// the leaf always links to the first node of a duplicate chain
	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %Ld!\n",
			duplicate->LeftLink()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		if (array->count > NUM_DUPLICATE_VALUES
			|| array->count < 0) {
			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
				duplicateOffset, array->count));
			return B_BAD_DATA;
		}

		if (array->Remove(value))
			break;

		// not in this node - follow the right link to the next one
		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);

		cachedDuplicate.UnsetUnchanged(transaction);
		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	// The entry got removed from the duplicate node, but we might want to free
	// it now in case it's empty

	// this loop only repeats via "continue" when a neighbouring node may
	// have to be converted back to a normal (non-duplicate) entry as well
	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

		if ((isLast && array->count == 1) || array->count == 0) {
			// Free empty duplicate page, link their siblings together, and
			// update the duplicate link if needed (ie. when we either remove
			// the last duplicate node or have a new first one)

			if (left == BPLUSTREE_NULL) {
				// the duplicate link points to us
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				if (array->count == 1) {
					// this is the last node, and there is only one value left;
					// replace the duplicate link with that value, it's no duplicate
					// anymore
					values[index] = array->values[0];
				} else {
					// move the duplicate link to the next node
					values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
						BPLUSTREE_DUPLICATE_NODE, right));
				}
			}

			status_t status;
			if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK)
				return status;

			// unlink the freed node from its left sibling
			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}
			}
			// unlink the freed node from its right sibling
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
					continue;
				}
			}
			return B_OK;
		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);

			bplustree_node *fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (FindFreeDuplicateFragment(transaction, node, cachedOther,
					&offset, &fragment, &fragmentIndex) == B_OK) {
				// move to other node
				duplicate_array *target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			} else {
				// convert node
				memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			}

			// re-point the leaf entry at the (possibly moved) fragment
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
		}
		return B_OK;
	}
}
1523 
1524 
/** Removes the key with the given index from the specified node.
 *	Since it has to get the key from the node anyway (to obtain its
 *	pointer), there is no need to pass the key & its length, although
 *	the calling method (BPlusTree::Remove()) has this data.
 */
1530 
1531 void
1532 BPlusTree::RemoveKey(bplustree_node *node, uint16 index)
1533 {
1534 	// should never happen, but who knows?
1535 	if (index > node->NumKeys() && node->NumKeys() > 0) {
1536 		FATAL(("Asked me to remove key outer limits: %u\n", index));
1537 		return;
1538 	}
1539 
1540 	off_t *values = node->Values();
1541 
1542 	// if we would have to drop the overflow link, drop
1543 	// the last key instead and update the overflow link
1544 	// to the value of that one
1545 	if (!node->IsLeaf() && index == node->NumKeys())
1546 		node->overflow_link = values[--index];
1547 
1548 	uint16 length;
1549 	uint8 *key = node->KeyAt(index, &length);
1550 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
1551 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1552 		FATAL(("Key length to long: %s, %u (inode at %ld,%u)\n", key, length,
1553 			fStream->BlockRun().allocation_group, fStream->BlockRun().start));
1554 		fStream->GetVolume()->Panic();
1555 		return;
1556 	}
1557 
1558 	uint16 *keyLengths = node->KeyLengths();
1559 	uint8 *keys = node->Keys();
1560 
1561 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
1562 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length);
1563 
1564 	off_t *newValues = node->Values();
1565 	uint16 *newKeyLengths = node->KeyLengths();
1566 
1567 	// move key data
1568 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
1569 
1570 	// move and update key lengths
1571 	if (index > 0 && newKeyLengths != keyLengths)
1572 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
1573 	for (uint16 i = index; i < node->NumKeys(); i++) {
1574 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1575 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
1576 	}
1577 
1578 	// move values
1579 	if (index > 0)
1580 		memmove(newValues, values, index * sizeof(off_t));
1581 	if (node->NumKeys() > index)
1582 		memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t));
1583 }
1584 
1585 
1586 /**	Removes the specified key from the tree. The "value" parameter is only used
1587  *	for trees which allow duplicates, so you may safely ignore it.
1588  *	It's not an optional parameter, so at least you have to think about it.
1589  */
1590 
1591 status_t
1592 BPlusTree::Remove(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value)
1593 {
1594 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1595 		RETURN_ERROR(B_BAD_VALUE);
1596 
1597 	// lock access to stream
1598 	WriteLocked locked(fStream->Lock());
1599 
1600 	Stack<node_and_key> stack;
1601 	if (SeekDown(stack, key, keyLength) != B_OK)
1602 		RETURN_ERROR(B_ERROR);
1603 
1604 	node_and_key nodeAndKey;
1605 	const bplustree_node *node;
1606 
1607 	CachedNode cached(this);
1608 	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1609 #ifdef DEBUG
1610 		NodeChecker checker(node, fNodeSize, "remove");
1611 #endif
1612 		if (node->IsLeaf()) {
1613 			// first round, check for duplicate entries
1614 			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);
1615 			if (status < B_OK)
1616 				RETURN_ERROR(status);
1617 
1618 			// If we will remove the last key, the iterator will be set
1619 			// to the next node after the current - if there aren't any
1620 			// more nodes, we need a way to prevent the TreeIterators to
1621 			// touch the old node again, we use BPLUSTREE_FREE for this
1622 			off_t next = node->RightLink() == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->RightLink();
1623 			UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 ?
1624 								next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
1625 
1626 			// is this a duplicate entry?
1627 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
1628 					node->Values()[nodeAndKey.keyIndex]))) {
1629 				if (fAllowDuplicates) {
1630 					return RemoveDuplicate(transaction, node, cached,
1631 						nodeAndKey.keyIndex, value);
1632 				} else {
1633 					FATAL(("dupliate node found where no duplicates are allowed!\n"));
1634 					RETURN_ERROR(B_ERROR);
1635 				}
1636 			}
1637 		}
1638 
1639 		bplustree_node *writableNode = cached.MakeWritable(transaction);
1640 		if (writableNode == NULL)
1641 			return B_IO_ERROR;
1642 
1643 		// if it's an empty root node, we have to convert it
1644 		// to a leaf node by dropping the overflow link, or,
1645 		// if it's a leaf node, just empty it
1646 		if (nodeAndKey.nodeOffset == fHeader->RootNode()
1647 			&& node->NumKeys() == 0
1648 			|| node->NumKeys() == 1 && node->IsLeaf()) {
1649 			writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1650 			writableNode->all_key_count = 0;
1651 			writableNode->all_key_length = 0;
1652 
1653 			// if we've cleared the root node, reset the maximum
1654 			// number of levels in the header
1655 			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
1656 				bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction);
1657 				if (header == NULL)
1658 					return B_IO_ERROR;
1659 
1660 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
1661 			}
1662 			return B_OK;
1663 		}
1664 
1665 		// if there is only one key left, we don't have to remove
1666 		// it, we can just dump the node (index nodes still have
1667 		// the overflow link, so we have to drop the last key)
1668 		if (writableNode->NumKeys() > 1
1669 			|| !writableNode->IsLeaf() && writableNode->NumKeys() == 1) {
1670 			RemoveKey(writableNode, nodeAndKey.keyIndex);
1671 			return B_OK;
1672 		}
1673 
1674 		// when we are here, we can just free the node, but
1675 		// we have to update the right/left link of the
1676 		// siblings first
1677 		CachedNode otherCached(this);
1678 		bplustree_node *other = otherCached.SetToWritable(transaction,
1679 			writableNode->LeftLink());
1680 		if (other != NULL)
1681 			other->right_link = writableNode->right_link;
1682 
1683 		if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL)
1684 			other->left_link = writableNode->left_link;
1685 
1686 		cached.Free(transaction, nodeAndKey.nodeOffset);
1687 	}
1688 	RETURN_ERROR(B_ERROR);
1689 }
1690 
1691 
1692 /**	Replaces the value for the key in the tree.
1693  *	Returns B_OK if the key could be found and its value replaced,
1694  *	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
1695  *	to indicate that something went terribly wrong.
1696  *	Note that this doesn't work with duplicates - it will just
1697  *	return B_BAD_TYPE if you call this function on a tree where
1698  *	duplicates are allowed.
1699  */
1700 
1701 status_t
1702 BPlusTree::Replace(Transaction &transaction, const uint8 *key, uint16 keyLength, off_t value)
1703 {
1704 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
1705 		|| key == NULL)
1706 		RETURN_ERROR(B_BAD_VALUE);
1707 
1708 	if (fAllowDuplicates)
1709 		RETURN_ERROR(B_BAD_TYPE);
1710 
1711 	// lock access to stream (a read lock is okay for this purpose)
1712 	ReadLocked locked(fStream->Lock());
1713 
1714 	off_t nodeOffset = fHeader->RootNode();
1715 	CachedNode cached(this);
1716 	const bplustree_node *node;
1717 
1718 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1719 		uint16 keyIndex = 0;
1720 		off_t nextOffset;
1721 		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1722 
1723 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1724 			if (status == B_OK) {
1725 				bplustree_node *writableNode = cached.MakeWritable(transaction);
1726 				if (writableNode != NULL)
1727 					writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value);
1728 				else
1729 					status = B_IO_ERROR;
1730 			}
1731 
1732 			return status;
1733 		} else if (nextOffset == nodeOffset)
1734 			RETURN_ERROR(B_ERROR);
1735 
1736 		nodeOffset = nextOffset;
1737 	}
1738 	RETURN_ERROR(B_ERROR);
1739 }
1740 
1741 
1742 /**	Searches the key in the tree, and stores the offset found in
1743  *	_value, if successful.
1744  *	It's very similar to BPlusTree::SeekDown(), but doesn't fill
1745  *	a stack while it descends the tree.
1746  *	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND
1747  *	if not. It can also return other errors to indicate that
1748  *	something went wrong.
1749  *	Note that this doesn't work with duplicates - it will just
1750  *	return B_BAD_TYPE if you call this function on a tree where
1751  *	duplicates are allowed.
1752  */
1753 
status_t
BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
		|| key == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// this lookup doesn't resolve duplicates
	if (fAllowDuplicates)
		RETURN_ERROR(B_BAD_TYPE);

	// lock access to stream
	ReadLocked locked(fStream->Lock());

	off_t nodeOffset = fHeader->RootNode();
	CachedNode cached(this);
	const bplustree_node *node;

#ifdef DEBUG
	// count the visited levels to cross-check against the header
	int32 levels = 0;
#endif

	// descend from the root towards the leaf level
	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		uint16 keyIndex = 0;
		off_t nextOffset;
		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);

#ifdef DEBUG
		levels++;
#endif
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// a leaf node was reached; report the value if the key was found
			if (status == B_OK && _value != NULL)
				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);

#ifdef DEBUG
			if (levels != (int32)fHeader->MaxNumberOfLevels())
				DEBUGGER(("levels don't match"));
#endif
			return status;
		} else if (nextOffset == nodeOffset)
			// self-referencing node - the tree must be corrupted
			RETURN_ERROR(B_ERROR);

		nodeOffset = nextOffset;
	}
	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
	RETURN_ERROR(B_ERROR);
}
1800 
1801 
1802 //	#pragma mark -
1803 
1804 
1805 TreeIterator::TreeIterator(BPlusTree *tree)
1806 	:
1807 	fTree(tree),
1808 	fCurrentNodeOffset(BPLUSTREE_NULL),
1809 	fNext(NULL)
1810 {
1811 	tree->AddIterator(this);
1812 }
1813 
1814 
1815 TreeIterator::~TreeIterator()
1816 {
1817 	if (fTree)
1818 		fTree->RemoveIterator(this);
1819 }
1820 
1821 
/**	Positions the iterator at the very beginning or end of the tree
 *	(\a to is either BPLUSTREE_BEGIN or BPLUSTREE_END), by descending
 *	to the leftmost resp. rightmost leaf node.
 */
status_t
TreeIterator::Goto(int8 to)
{
	if (fTree == NULL || fTree->fHeader == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	off_t nodeOffset = fTree->fHeader->RootNode();
	CachedNode cached(fTree);
	const bplustree_node *node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		// is the node a leaf node?
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// position just before the first resp. just after the last key,
			// so that the first Traverse() step lands on a real key
			fCurrentNodeOffset = nodeOffset;
			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
			fDuplicateNode = BPLUSTREE_NULL;

			return B_OK;
		}

		// get the next node offset depending on the direction (and if there
		// are any keys in that node at all)
		off_t nextOffset;
		if (to == BPLUSTREE_END || node->all_key_count == 0)
			nextOffset = node->OverflowLink();
		else {
			// sanity check the node before dereferencing its values array
			if (node->AllKeyLength() > fTree->fNodeSize
				|| (uint32)node->Values() > (uint32)node + fTree->fNodeSize - 8 * node->NumKeys())
				RETURN_ERROR(B_ERROR);

			// descend via the leftmost child
			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
		}
		// a self-referencing node would loop forever - bail out
		if (nextOffset == nodeOffset)
			break;

		nodeOffset = nextOffset;
	}
	FATAL(("%s fails\n", __FUNCTION__));

	RETURN_ERROR(B_ERROR);
}
1866 
1867 
1868 /**	Iterates through the tree in the specified direction.
1869  *	When it iterates through duplicates, the "key" is only updated for the
1870  *	first entry - if you need to know when this happens, use the "duplicate"
1871  *	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
1872  *	the other duplicates.
1873  *	That's not too nice, but saves the 256 bytes that would be needed to
1874  *	store the last key - if this will ever become an issue, it will be
1875  *	easy to change.
1876  *	The other advantage of this is, that the queries can skip all duplicates
1877  *	at once when they are not relevant to them.
1878  */
1879 
status_t
TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, uint16 maxLength,
	off_t *value, uint16 *duplicate)
{
	// Stop() detaches the iterator from its tree by clearing fTree
	if (fTree == NULL)
		return B_INTERRUPTED;
	// not positioned yet - start at the tree's begin/end depending on the direction
	if (fCurrentNodeOffset == BPLUSTREE_NULL
		&& Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
		RETURN_ERROR(B_ERROR);

	// if the tree was emptied since the last call
	if (fCurrentNodeOffset == BPLUSTREE_FREE)
		return B_ENTRY_NOT_FOUND;

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	CachedNode cached(fTree);
	const bplustree_node *node;

	if (fDuplicateNode != BPLUSTREE_NULL) {
		// regardless of traverse direction the duplicates are always presented in
		// the same order; since they are all considered as equal, this shouldn't
		// cause any problems

		// a fragment has no follow-up node, so stop as soon as its duplicates
		// are used up; a full duplicate node may chain to another one
		if (!fIsFragment || fDuplicate < fNumDuplicates)
			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
		else
			node = NULL;

		if (node != NULL) {
			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
				// if the node is out of duplicates, we go directly to the next one
				fDuplicateNode = node->RightLink();
				if (fDuplicateNode != BPLUSTREE_NULL
					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
					fNumDuplicates = node->CountDuplicates(fDuplicateNode, false);
					fDuplicate = 0;
				}
			}
			if (fDuplicate < fNumDuplicates) {
				// hand out the next duplicate; the key itself stays the one
				// that was reported together with *duplicate == 1
				*value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++);
				if (duplicate)
					*duplicate = 2;
				return B_OK;
			}
		}
		// duplicates are exhausted (or the duplicate node went away) -
		// fall through to the regular key traversal below
		fDuplicateNode = BPLUSTREE_NULL;
	}

	// remember where we were, in case we run off the end of the tree
	off_t savedNodeOffset = fCurrentNodeOffset;
	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
		RETURN_ERROR(B_ERROR);

	if (duplicate)
		*duplicate = 0;

	// direction is +1/-1, so this advances or rewinds the key index
	fCurrentKey += direction;

	// is the current key in the current node?
	while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->NumKeys())
			|| (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) {
		fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->RightLink() : node->LeftLink();

		// are there any more nodes?
		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
			node = cached.SetTo(fCurrentNodeOffset);
			if (!node)
				RETURN_ERROR(B_ERROR);

			// reset current key
			// NOTE(review): for BPLUSTREE_BACKWARD this resets to NumKeys(),
			// which is one past the last valid key index (KeyAt() expects
			// 0..NumKeys()-1) - looks off by one compared to the forward
			// case; confirm against the backward-iteration callers
			fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->NumKeys();
		} else {
			// there are no nodes left, so turn back to the last key
			// NOTE(review): "node" is the last node visited, which is only
			// the saved node if this loop didn't cross an empty node first -
			// the restored fCurrentKey may then belong to the wrong node
			fCurrentNodeOffset = savedNodeOffset;
			fCurrentKey = direction == BPLUSTREE_FORWARD ? node->NumKeys() : -1;

			return B_ENTRY_NOT_FOUND;
		}
	}

	// raw access to the on-disk field, but a comparison against zero is
	// endian-safe; an empty node here means the tree is corrupt
	if (node->all_key_count == 0)
		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?

	uint16 length;
	uint8 *keyStart = node->KeyAt(fCurrentKey, &length);
	// make sure the key plus its value and length slots lie within the node
	// before touching them - panic the volume on a corrupted tree
	if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		fTree->fStream->GetVolume()->Panic();
		RETURN_ERROR(B_BAD_DATA);
	}

	// hand out at most maxLength bytes of the key
	length = min_c(length, maxLength);
	memcpy(key, keyStart, length);

	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE)	{
		// terminate string type
		if (length == maxLength)
			length--;
		((char *)key)[length] = '\0';
	}
	*keyLength = length;

	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);

	// duplicate fragments?
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) {
		// the value is not a real offset but refers to a duplicate
		// fragment/node; set up the duplicate iteration state and report
		// the first duplicate value right away (*duplicate == 1)
		fDuplicateNode = offset;

		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// shouldn't happen, but we're dealing here with potentially corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}
2012 
2013 
2014 /**	This is more or less a copy of BPlusTree::Find() - but it just
2015  *	sets the current position in the iterator, regardless of if the
2016  *	key could be found or not.
2017  */
2018 
2019 status_t
2020 TreeIterator::Find(const uint8 *key, uint16 keyLength)
2021 {
2022 	if (fTree == NULL)
2023 		return B_INTERRUPTED;
2024 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
2025 		|| key == NULL)
2026 		RETURN_ERROR(B_BAD_VALUE);
2027 
2028 	// lock access to stream
2029 	ReadLocked locked(fTree->fStream->Lock());
2030 
2031 	off_t nodeOffset = fTree->fHeader->RootNode();
2032 
2033 	CachedNode cached(fTree);
2034 	const bplustree_node *node;
2035 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2036 		uint16 keyIndex = 0;
2037 		off_t nextOffset;
2038 		status_t status = fTree->FindKey(node, key, keyLength, &keyIndex, &nextOffset);
2039 
2040 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2041 			fCurrentNodeOffset = nodeOffset;
2042 			fCurrentKey = keyIndex - 1;
2043 			fDuplicateNode = BPLUSTREE_NULL;
2044 
2045 			return status;
2046 		} else if (nextOffset == nodeOffset)
2047 			RETURN_ERROR(B_ERROR);
2048 
2049 		nodeOffset = nextOffset;
2050 	}
2051 	RETURN_ERROR(B_ERROR);
2052 }
2053 
2054 
/**	Aborts the traversal of any remaining duplicates for the current key;
 *	the next Traverse() call will continue with the next/previous key
 *	instead (Traverse() only walks duplicates while fDuplicateNode is
 *	valid).
 */

void
TreeIterator::SkipDuplicates()
{
	// BPLUSTREE_NULL marks "not currently within a duplicate chain"
	fDuplicateNode = BPLUSTREE_NULL;
}
2060 
2061 
/**	Keeps the iterator's position in sync when the tree modifies the node
 *	the iterator currently points to: "change" keys were apparently
 *	inserted (change > 0) or removed (change < 0) at "keyIndex", and if
 *	the node was split, "nextOffset" names the node that now holds the
 *	keys from index "splitAt" on (BPLUSTREE_NULL otherwise) - confirm
 *	these parameter semantics at the call sites.
 */

void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change)
{
	// only react if the modified node is the one we are sitting on
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// NOTE(review): the iterator switches to the new node even when
		// fCurrentKey < splitAt (i.e. its key stayed in the old node) -
		// presumably the callers only pass nextOffset when the current
		// position actually moved; confirm at the call sites
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// ToDo: duplicate handling!
}
2084 
2085 
/**	Detaches the iterator from its tree: subsequent Traverse()/Find()
 *	calls will return B_INTERRUPTED. Presumably called when the tree goes
 *	away while iterators are still alive - confirm with the callers.
 */

void
TreeIterator::Stop()
{
	fTree = NULL;
}
2091 
2092 
2093 #ifdef DEBUG
/**	Dumps the iterator's complete state to the debug output.
 *	Compiled in DEBUG builds only.
 */

void
TreeIterator::Dump()
{
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
2106 #endif
2107 
2108 
2109 //	#pragma mark -
2110 
2111 
2112 void
2113 bplustree_node::Initialize()
2114 {
2115 	left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2116 	all_key_count = 0;
2117 	all_key_length = 0;
2118 }
2119 
2120 
2121 uint8 *
2122 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const
2123 {
2124 	if (index < 0 || index > NumKeys())
2125 		return NULL;
2126 
2127 	uint8 *keyStart = Keys();
2128 	uint16 *keyLengths = KeyLengths();
2129 
2130 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2131 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2132 	if (index > 0)
2133 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2134 
2135 	return keyStart;
2136 }
2137 
2138 
/**	Returns the number of duplicate values stored at the given duplicate
 *	fragment/node offset.
 *	For a fragment, the low 10 bits of "offset" select the fragment
 *	within this node; each fragment occupies NUM_FRAGMENT_VALUES + 1
 *	off_t slots, the first of which holds the count (this matches the
 *	hard-coded factor 8 in DuplicateAt() only if NUM_FRAGMENT_VALUES + 1
 *	== 8 - confirm).
 *	For a full duplicate node, the count lives in the overflow_link slot.
 *	NOTE(review): the fragment count is read as a raw off_t without a
 *	BFS_ENDIAN_TO_HOST conversion (unlike the Values() accesses in
 *	TreeIterator::Traverse()) and the result is truncated to uint8 -
 *	verify both against the code that writes duplicate arrays.
 */

uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem

	if (isFragment) {
		// index of the fragment's count slot within this node, in off_t units
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		return ((off_t *)this)[fragment];
	}
	return OverflowLink();
}
2152 
2153 
/**	Returns the "index"th duplicate value stored at the given duplicate
 *	fragment/node offset.
 *	For a fragment, the low 10 bits of "offset" select the fragment; its
 *	first off_t slot holds the count (see CountDuplicates()), the values
 *	follow it. For a full duplicate node, the values start after the
 *	left/right link slots and the count kept in the overflow_link slot.
 *	NOTE(review): hard-coded to 8 off_t slots per fragment (i.e. a 1024
 *	byte node), and the value is returned without an endian conversion -
 *	verify against the code that writes duplicate arrays.
 */

off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	uint32 start;
	if (isFragment)
		start = 8 * ((uint64)offset & 0x3ff);
	else
		start = 2;

	// skip the count slot at "start", then pick the requested value
	return ((off_t *)this)[start + 1 + index];
}
2165 
2166 
2167 /**	Although the name suggests it, this function doesn't return the real
2168  *	used fragment count; at least, it can only count to two: it returns
2169  *	0, if there is no fragment used, 1 if there is only one fragment
2170  *	used, and 2 if there are at least 2 fragments used.
2171  */
2172 
2173 uint32
2174 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2175 {
2176 	uint32 used = 0;
2177 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2178 		duplicate_array *array = FragmentAt(i);
2179 		if (array->count > 0 && ++used > 1)
2180 			return used;
2181 	}
2182 	return used;
2183 }
2184 
2185 
2186 #ifdef DEBUG
/**	Sanity-checks this node's on-disk structure (DEBUG builds only):
 *	the key count/length fields must fit the node, every key (plus its
 *	value and length slots) must lie within the node, and no value may
 *	be -1. Returns B_BAD_DATA for a corrupted node, B_OK otherwise.
 */
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
	// the counts can never exceed the node size; note that DEBUGGER()
	// only reports - execution deliberately falls through here
	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
		DEBUGGER(("invalid node: key/length count"));

	for (int32 i = 0; i < NumKeys(); i++) {
		uint16 length;
		uint8 *key = KeyAt(i, &length);
		// same bounds check as in TreeIterator::Traverse(): key data plus
		// its value and length slots must lie within the node
		if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize
			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
			dprintf("node %p, key %ld\n", this, i);
			DEBUGGER(("invalid node: keys corrupted"));
			return B_BAD_DATA;
		}
		// a raw value of -1 (all bits set) is treated as a corruption
		// marker here - presumably never a valid value; confirm
		if (Values()[i] == -1) {
			dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]);
			DEBUGGER(("invalid node: values corrupted"));
			return B_BAD_DATA;
		}
	}
	return B_OK;
}
2210 #endif
2211 
2212 
2213 //	#pragma mark -
2214 
2215 
2216 int32
2217 compareKeys(type_code type, const void *key1, int keyLength1, const void *key2, int keyLength2)
2218 {
2219 	// if one of the keys is NULL, bail out gracefully
2220 	if (key1 == NULL || key2 == NULL) {
2221 //#if USER
2222 //		// that's here to see if it's reasonable to handle this case nicely at all
2223 //		DEBUGGER(("compareKeys() got NULL key!"));
2224 //#endif
2225 		// even if it's most probably a bug in the calling code, we try to
2226 		// give a meaningful result
2227 		if (key1 == NULL && key2 != NULL)
2228 			return 1;
2229 		else if (key2 == NULL && key1 != NULL)
2230 			return -1;
2231 
2232 		return 0;
2233 	}
2234 
2235 	switch (type) {
2236 	    case B_STRING_TYPE:
2237     	{
2238 			int len = min_c(keyLength1, keyLength2);
2239 			int result = strncmp((const char *)key1, (const char *)key2, len);
2240 
2241 			if (result == 0
2242 				&& !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0'))
2243 				result = keyLength1 - keyLength2;
2244 
2245 			return result;
2246 		}
2247 
2248 		case B_SSIZE_T_TYPE:
2249 		case B_INT32_TYPE:
2250 			return *(int32 *)key1 - *(int32 *)key2;
2251 
2252 		case B_SIZE_T_TYPE:
2253 		case B_UINT32_TYPE:
2254 			if (*(uint32 *)key1 == *(uint32 *)key2)
2255 				return 0;
2256 			else if (*(uint32 *)key1 > *(uint32 *)key2)
2257 				return 1;
2258 
2259 			return -1;
2260 
2261 		case B_OFF_T_TYPE:
2262 		case B_INT64_TYPE:
2263 			if (*(int64 *)key1 == *(int64 *)key2)
2264 				return 0;
2265 			else if (*(int64 *)key1 > *(int64 *)key2)
2266 				return 1;
2267 
2268 			return -1;
2269 
2270 		case B_UINT64_TYPE:
2271 			if (*(uint64 *)key1 == *(uint64 *)key2)
2272 				return 0;
2273 			else if (*(uint64 *)key1 > *(uint64 *)key2)
2274 				return 1;
2275 
2276 			return -1;
2277 
2278 		case B_FLOAT_TYPE:
2279 		{
2280 			float result = *(float *)key1 - *(float *)key2;
2281 			if (result == 0.0f)
2282 				return 0;
2283 
2284 			return (result < 0.0f) ? -1 : 1;
2285 		}
2286 
2287 		case B_DOUBLE_TYPE:
2288 		{
2289 			double result = *(double *)key1 - *(double *)key2;
2290 			if (result == 0.0)
2291 				return 0;
2292 
2293 			return (result < 0.0) ? -1 : 1;
2294 		}
2295 	}
2296 
2297 	// if the type is unknown, the entries don't match...
2298 	return -1;
2299 }
2300 
2301