xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 93a78ecaa45114d68952d08c4778f073515102f2)
1 /*
2  * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 //! B+Tree implementation
10 
11 
12 #include "Debug.h"
13 #include "BPlusTree.h"
14 #include "Inode.h"
15 #include "Utility.h"
16 
17 
18 #ifdef DEBUG
19 class NodeChecker {
20 	public:
21 		NodeChecker(const bplustree_node *node, int32 nodeSize, const char *text)
22 			:
23 			fNode(node),
24 			fSize(nodeSize),
25 			fText(text)
26 		{
27 			Check("integrity check failed on construction.");
28 		}
29 
30 		~NodeChecker()
31 		{
32 			Check("integrity check failed on destruction.");
33 		}
34 
35 		void
36 		Check(const char *message)
37 		{
38 			if (fNode->CheckIntegrity(fSize) < B_OK)
39 				dprintf("%s: %s\n", fText, message);
40 		}
41 
42 	private:
43 		const bplustree_node	*fNode;
44 		int32					fSize;
45 		const char				*fText;
46 };
47 #endif
48 
49 
50 // Node Caching for the BPlusTree class
51 //
// With write support comes the need for a function that allocates new
// nodes - either by recycling free nodes, or by growing the file's data
// stream.
54 //
55 // !! The CachedNode class assumes that you have properly locked the stream
56 // !! before asking for nodes.
57 //
58 // Note: This code will fail if the block size is smaller than the node size!
59 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
60 // is hard-coded to 1024 bytes, that's not an issue now.
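//
// A typical read-only access pattern, as used throughout this file
// (SetToWritable()/MakeWritable() work analogously within a transaction):
//
//	CachedNode cached(this);
//	const bplustree_node *node = cached.SetTo(offset);
//	if (node != NULL) {
//		// ... use the node; the block reference is released again by
//		// Unset() or the CachedNode destructor
//	}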
61 
62 void
63 CachedNode::UnsetUnchanged(Transaction &transaction)
64 {
65 	if (fTree == NULL || fTree->fStream == NULL)
66 		return;
67 
68 	if (fNode != NULL) {
69 		void *cache = fTree->fStream->GetVolume()->BlockCache();
70 
71 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
72 		block_cache_put(cache, fBlockNumber);
73 		fNode = NULL;
74 	}
75 }
76 
77 
78 void
79 CachedNode::Unset()
80 {
81 	if (fTree == NULL || fTree->fStream == NULL)
82 		return;
83 
84 	if (fNode != NULL) {
85 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber);
86 		fNode = NULL;
87 	}
88 }
89 
90 
91 const bplustree_node *
92 CachedNode::SetTo(off_t offset, bool check)
93 {
94 	if (fTree == NULL || fTree->fStream == NULL) {
95 		REPORT_ERROR(B_BAD_VALUE);
96 		return NULL;
97 	}
98 
99 	Unset();
100 
101 	// You can only ask for nodes at valid positions - you can't
102 	// even access the b+tree header with this method (use SetToHeader()
103 	// instead)
104 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
105 		|| offset <= 0
106 		|| (offset % fTree->fNodeSize) != 0)
107 		return NULL;
108 
109 	if (InternalSetTo(NULL, offset) != NULL && check) {
110 		// sanity checks (links, all_key_count)
111 		if (!fTree->fHeader->CheckNode(fNode)) {
112 			FATAL(("invalid node (%p) read from offset %Ld, inode at %Ld\n",
113 				fNode, offset, fTree->fStream->ID()));
114 			return NULL;
115 		}
116 	}
117 	return fNode;
118 }
119 
120 
121 bplustree_node *
122 CachedNode::SetToWritable(Transaction &transaction, off_t offset, bool check)
123 {
124 	if (fTree == NULL || fTree->fStream == NULL) {
125 		REPORT_ERROR(B_BAD_VALUE);
126 		return NULL;
127 	}
128 
129 	Unset();
130 
131 	// You can only ask for nodes at valid positions - you can't
132 	// even access the b+tree header with this method (use SetToHeader()
133 	// instead)
134 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
135 		|| offset <= 0
136 		|| (offset % fTree->fNodeSize) != 0)
137 		return NULL;
138 
139 	if (InternalSetTo(&transaction, offset) != NULL && check) {
140 		// sanity checks (links, all_key_count)
141 		if (!fTree->fHeader->CheckNode(fNode)) {
142 			FATAL(("invalid node [%p] read from offset %Ld, inode at %Ld\n",
143 				fNode, offset, fTree->fStream->ID()));
144 			return NULL;
145 		}
146 	}
147 	return fNode;
148 }
149 
150 
151 bplustree_node *
152 CachedNode::MakeWritable(Transaction &transaction)
153 {
154 	if (fNode == NULL)
155 		return NULL;
156 
157 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
158 			fBlockNumber, transaction.ID()) == B_OK)
159 		return fNode;
160 
161 	return NULL;
162 }
163 
164 
165 bplustree_header *
166 CachedNode::MakeWritableHeader(Transaction &transaction)
167 {
168 	return (bplustree_header *)MakeWritable(transaction);
169 }
170 
171 
172 const bplustree_header *
173 CachedNode::SetToHeader()
174 {
175 	if (fTree == NULL || fTree->fStream == NULL) {
176 		REPORT_ERROR(B_BAD_VALUE);
177 		return NULL;
178 	}
179 
180 	Unset();
181 
182 	InternalSetTo(NULL, 0LL);
183 	return (bplustree_header *)fNode;
184 }
185 
186 
187 bplustree_header *
188 CachedNode::SetToWritableHeader(Transaction &transaction)
189 {
190 	if (fTree == NULL || fTree->fStream == NULL) {
191 		REPORT_ERROR(B_BAD_VALUE);
192 		return NULL;
193 	}
194 
195 	Unset();
196 
197 	InternalSetTo(&transaction, 0LL);
198 	return (bplustree_header *)fNode;
199 }
200 
201 
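/*!
	Maps the node at the given B+tree \a offset to its position in the
	underlying data stream, and retrieves the disk block containing it from
	the block cache - writable if a \a transaction is given, read-only
	otherwise.
*/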
202 bplustree_node *
203 CachedNode::InternalSetTo(Transaction *transaction, off_t offset)
204 {
205 	fNode = NULL;
206 	fOffset = offset;
207 
208 	off_t fileOffset;
209 	block_run run;
210 	if (offset < fTree->fStream->Size()
211 		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
212 		Volume *volume = fTree->fStream->GetVolume();
213 
214 		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
215 		fBlockNumber = volume->ToBlock(run) + blockOffset;
216 		uint8 *block;
217 
218 		if (transaction != NULL) {
219 			block = (uint8 *)block_cache_get_writable(volume->BlockCache(),
220 				fBlockNumber, transaction->ID());
221 			fWritable = true;
222 		} else {
223 			block = (uint8 *)block_cache_get(volume->BlockCache(), fBlockNumber);
224 			fWritable = false;
225 		}
226 
227 		if (block) {
			// The node lies at stream offset "offset", while the block we
			// just got starts at stream offset
			// fileOffset + (blockOffset << BlockShift()); the difference is
			// the node's position within the block.
229 			fNode = (bplustree_node *)(block + offset -
230 						(fileOffset + (blockOffset << volume->BlockShift())));
231 		} else
232 			REPORT_ERROR(B_IO_ERROR);
233 	}
234 	return fNode;
235 }
236 
237 
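/*!
	Frees the node at \a offset: if it's the last node of the tree, the
	stream (and the header's maximum_size) is shrunk by one node; otherwise,
	the node is prepended to the free nodes list kept in the header.
*/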
238 status_t
239 CachedNode::Free(Transaction &transaction, off_t offset)
240 {
241 	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
242 		RETURN_ERROR(B_BAD_VALUE);
243 
244 	// ToDo: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done every time this
	// function is called, perhaps it should be done when the directory
247 	// inode is closed or based on some calculation or whatever...
248 
249 	bplustree_header *header = fTree->fCachedHeader.MakeWritableHeader(transaction);
250 	if (header == NULL)
251 		return B_IO_ERROR;
252 
253 	// if the node is the last one in the tree, we shrink
254 	// the tree and file size by one node
255 	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
256 	if (offset == lastOffset) {
257 		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
258 		if (status < B_OK)
259 			return status;
260 
261 		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
262 		return B_OK;
263 	}
264 
265 	// add the node to the free nodes list
266 	fNode->left_link = header->free_node_pointer;
267 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
268 
269 	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
270 	return B_OK;
271 }
272 
273 
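/*!
	Provides a writable, initialized node to the caller: if there are any
	free nodes, one of them is recycled; otherwise the stream is grown by
	one node and the header's maximum_size is updated accordingly.
*/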
274 status_t
275 CachedNode::Allocate(Transaction &transaction, bplustree_node **_node,
276 	off_t *_offset)
277 {
278 	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
279 		RETURN_ERROR(B_BAD_VALUE);
280 
281 	Unset();
282 
283 	bplustree_header *header;
284 	status_t status;
285 
286 	// if there are any free nodes, recycle them
287 	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
288 		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
289 		if (header == NULL)
290 			return B_IO_ERROR;
291 
292 		*_offset = header->FreeNode();
293 		*_node = fNode;
294 
295 		// set new free node pointer
296 		header->free_node_pointer = fNode->left_link;
297 		fNode->Initialize();
298 		return B_OK;
299 	}
300 
301 	// allocate space for a new node
302 	Inode *stream = fTree->fStream;
303 	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
304 		return status;
305 
306 	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
307 	if (header == NULL)
308 		return B_IO_ERROR;
309 
310 	// the maximum_size has to be changed before the call to SetTo() - or
311 	// else it will fail because the requested node is out of bounds
312 	off_t offset = header->MaximumSize();
313 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
314 		+ fTree->fNodeSize);
315 
316 	if (SetToWritable(transaction, offset, false) != NULL) {
317 		fNode->Initialize();
318 
319 		*_offset = offset;
320 		*_node = fNode;
321 		return B_OK;
322 	}
323 
324 	// revert header size to old value
325 	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
326 		- fTree->fNodeSize);
327 	RETURN_ERROR(B_ERROR);
328 }
329 
330 
331 //	#pragma mark -
332 
333 
334 BPlusTree::BPlusTree(Transaction &transaction, Inode *stream, int32 nodeSize)
335 	:
336 	fStream(NULL),
337 	fHeader(NULL),
338 	fCachedHeader(this)
339 {
340 	SetTo(transaction, stream);
341 }
342 
343 
344 BPlusTree::BPlusTree(Inode *stream)
345 	:
346 	fStream(NULL),
347 	fHeader(NULL),
348 	fCachedHeader(this)
349 {
350 	SetTo(stream);
351 }
352 
353 
354 BPlusTree::BPlusTree()
355 	:
356 	fStream(NULL),
357 	fHeader(NULL),
358 	fCachedHeader(this),
359 	fNodeSize(BPLUSTREE_NODE_SIZE),
360 	fAllowDuplicates(true),
361 	fStatus(B_NO_INIT)
362 {
363 }
364 
365 
366 BPlusTree::~BPlusTree()
367 {
368 	// if there are any TreeIterators left, we need to stop them
369 	// (can happen when the tree's inode gets deleted while
370 	// traversing the tree - a TreeIterator doesn't lock the inode)
371 	if (fIteratorLock.Lock() < B_OK)
372 		return;
373 
374 	TreeIterator *iterator = NULL;
375 	while ((iterator = fIterators.Next(iterator)) != NULL)
376 		iterator->Stop();
377 
378 	fIteratorLock.Unlock();
379 }
380 
381 
/*! Creates a new B+tree on the specified stream. */
383 status_t
384 BPlusTree::SetTo(Transaction &transaction, Inode *stream, int32 nodeSize)
385 {
386 	// initializes in-memory B+Tree
387 
388 	fStream = stream;
389 
390 	bplustree_header *header = fCachedHeader.SetToWritableHeader(transaction);
391 	if (header == NULL) {
392 		// allocate space for new header + node!
393 		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
394 		if (fStatus < B_OK)
395 			RETURN_ERROR(fStatus);
396 
397 		header = fCachedHeader.SetToWritableHeader(transaction);
398 		if (header == NULL)
399 			RETURN_ERROR(fStatus = B_ERROR);
400 	}
401 
402 	fHeader = header;
403 	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
404 			&& stream->BlockRun() != stream->Parent())
405 		|| (stream->Mode() & S_ALLOW_DUPS) != 0;
406 
407 	fNodeSize = nodeSize;
408 
409 	// initialize b+tree header
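	// (the header occupies the first node-sized chunk of the stream at
	// offset 0, and the initial root node directly follows it at offset
	// "nodeSize" - hence the file size of nodeSize * 2 set above)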
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
417 
418 	// initialize b+tree root node
419 	CachedNode cached(this);
420 	cached.SetToWritable(transaction, header->RootNode(), false);
421 	if (cached.Node() == NULL)
422 		RETURN_ERROR(B_IO_ERROR);
423 
424 	cached.Node()->Initialize();
425 
426 	return fStatus = B_OK;
427 }
428 
429 
430 status_t
431 BPlusTree::SetTo(Inode *stream)
432 {
433 	if (stream == NULL)
434 		RETURN_ERROR(fStatus = B_BAD_VALUE);
435 
436 	// get on-disk B+Tree header
437 
438 	fCachedHeader.Unset();
439 	fStream = stream;
440 
441 	fHeader = fCachedHeader.SetToHeader();
442 	if (fHeader == NULL)
443 		RETURN_ERROR(fStatus = B_NO_INIT);
444 
445 	// is header valid?
446 
447 	if (fHeader->MaximumSize() != stream->Size()) {
448 		FATAL(("B+tree header size %Ld doesn't fit file size %Ld!\n",
449 			fHeader->MaximumSize(), stream->Size()));
450 		// we can't change the header since we don't have a transaction
451 		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
452 	}
453 	if (fHeader->Magic() != BPLUSTREE_MAGIC
454 		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
455 		|| !fHeader->IsValidLink(fHeader->RootNode())
456 		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
457 #ifdef DEBUG
458 		dump_bplustree_header(fHeader);
459 		dump_block((const char *)fHeader, 128);
460 #endif
461 		RETURN_ERROR(fStatus = B_BAD_DATA);
462 	}
463 
464 	fNodeSize = fHeader->NodeSize();
465 
466 	{
467 		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
468 			S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
469 			S_DOUBLE_INDEX};
470 		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
471 			| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
472 			| S_FLOAT_INDEX | S_DOUBLE_INDEX);
473 
474 		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
			|| ((stream->Mode() & S_INDEX_DIR) != 0
				&& toMode[fHeader->DataType()] != mode)
477 			|| !stream->IsContainer()) {
478 			D(	dump_bplustree_header(fHeader);
479 				dump_inode(&stream->Node());
480 			);
481 			RETURN_ERROR(fStatus = B_BAD_TYPE);
482 		}
483 
484 		// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
485 		// in the original BFS code - we will honour it nevertheless
486 		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
487 			&& stream->BlockRun() != stream->Parent())
488 			|| (stream->Mode() & S_ALLOW_DUPS) != 0;
489 	}
490 
491 	CachedNode cached(this, fHeader->RootNode());
492 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
493 }
494 
495 
496 status_t
497 BPlusTree::InitCheck()
498 {
499 	return fStatus;
500 }
501 
502 
503 int32
504 BPlusTree::TypeCodeToKeyType(type_code code)
505 {
506 	switch (code) {
507 		case B_STRING_TYPE:
508 			return BPLUSTREE_STRING_TYPE;
509 		case B_SSIZE_T_TYPE:
510 		case B_INT32_TYPE:
511 			return BPLUSTREE_INT32_TYPE;
512 		case B_SIZE_T_TYPE:
513 		case B_UINT32_TYPE:
514 			return BPLUSTREE_UINT32_TYPE;
515 		case B_OFF_T_TYPE:
516 		case B_INT64_TYPE:
517 			return BPLUSTREE_INT64_TYPE;
518 		case B_UINT64_TYPE:
519 			return BPLUSTREE_UINT64_TYPE;
520 		case B_FLOAT_TYPE:
521 			return BPLUSTREE_FLOAT_TYPE;
522 		case B_DOUBLE_TYPE:
523 			return BPLUSTREE_DOUBLE_TYPE;
524 	}
525 	return -1;
526 }
527 
528 
529 int32
530 BPlusTree::ModeToKeyType(mode_t mode)
531 {
532 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
533 				   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
534 		case S_INT_INDEX:
535 			return BPLUSTREE_INT32_TYPE;
536 		case S_UINT_INDEX:
537 			return BPLUSTREE_UINT32_TYPE;
538 		case S_LONG_LONG_INDEX:
539 			return BPLUSTREE_INT64_TYPE;
540 		case S_ULONG_LONG_INDEX:
541 			return BPLUSTREE_UINT64_TYPE;
542 		case S_FLOAT_INDEX:
543 			return BPLUSTREE_FLOAT_TYPE;
544 		case S_DOUBLE_INDEX:
545 			return BPLUSTREE_DOUBLE_TYPE;
546 		case S_STR_INDEX:
547 		default:
548 			// default is for standard directories
549 			return BPLUSTREE_STRING_TYPE;
550 	}
551 }
552 
553 
554 //	#pragma mark -
555 
556 
557 void
558 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
559 	uint16 splitAt, int8 change)
560 {
561 	// Although every iterator which is affected by this update currently
562 	// waits on a semaphore, other iterators could be added/removed at
563 	// any time, so we need to protect this loop
564 	if (fIteratorLock.Lock() < B_OK)
565 		return;
566 
567 	TreeIterator *iterator = NULL;
568 	while ((iterator = fIterators.Next(iterator)) != NULL)
569 		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);
570 
571 	fIteratorLock.Unlock();
572 }
573 
574 
575 void
576 BPlusTree::_AddIterator(TreeIterator *iterator)
577 {
578 	if (fIteratorLock.Lock() < B_OK)
579 		return;
580 
581 	fIterators.Add(iterator);
582 
583 	fIteratorLock.Unlock();
584 }
585 
586 
587 void
588 BPlusTree::_RemoveIterator(TreeIterator *iterator)
589 {
590 	if (fIteratorLock.Lock() < B_OK)
591 		return;
592 
593 	fIterators.Remove(iterator);
594 
595 	fIteratorLock.Unlock();
596 }
597 
598 
599 int32
600 BPlusTree::_CompareKeys(const void *key1, int keyLength1, const void *key2,
601 	int keyLength2)
602 {
603 	type_code type = 0;
	switch (fHeader->DataType()) {
		case BPLUSTREE_STRING_TYPE:
			type = B_STRING_TYPE;
			break;
		case BPLUSTREE_INT32_TYPE:
			type = B_INT32_TYPE;
			break;
		case BPLUSTREE_UINT32_TYPE:
			type = B_UINT32_TYPE;
			break;
		case BPLUSTREE_INT64_TYPE:
			type = B_INT64_TYPE;
			break;
		case BPLUSTREE_UINT64_TYPE:
			type = B_UINT64_TYPE;
			break;
		case BPLUSTREE_FLOAT_TYPE:
			type = B_FLOAT_TYPE;
			break;
		case BPLUSTREE_DOUBLE_TYPE:
			type = B_DOUBLE_TYPE;
			break;
	}
	return compareKeys(type, key1, keyLength1, key2, keyLength2);
628 }
629 
630 
631 status_t
632 BPlusTree::_FindKey(const bplustree_node *node, const uint8 *key,
633 	uint16 keyLength, uint16 *_index, off_t *_next)
634 {
635 #ifdef DEBUG
636 		NodeChecker checker(node, fNodeSize, "find");
637 #endif
638 
639 	if (node->all_key_count == 0) {
640 		if (_index)
641 			*_index = 0;
642 		if (_next)
643 			*_next = node->OverflowLink();
644 		return B_ENTRY_NOT_FOUND;
645 	}
646 
647 	off_t *values = node->Values();
648 	int16 saveIndex = -1;
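	// saveIndex ends up as the position at which the key would have to be
	// inserted, i.e. the index of the first key greater than the search key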
649 
650 	// binary search in the key array
651 	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
652 		uint16 i = (first + last) >> 1;
653 
654 		uint16 searchLength;
655 		uint8 *searchKey = node->KeyAt(i, &searchLength);
656 		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
657 				> (uint8 *)node + fNodeSize
658 			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
659 			fStream->GetVolume()->Panic();
660 			RETURN_ERROR(B_BAD_DATA);
661 		}
662 
663 		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
664 		if (cmp < 0) {
665 			last = i - 1;
666 			saveIndex = i;
667 		} else if (cmp > 0) {
668 			saveIndex = first = i + 1;
669 		} else {
670 			if (_index)
671 				*_index = i;
672 			if (_next)
673 				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
674 			return B_OK;
675 		}
676 	}
677 
678 	if (_index)
679 		*_index = saveIndex;
680 	if (_next) {
681 		if (saveIndex == node->NumKeys())
682 			*_next = node->OverflowLink();
683 		else
684 			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
685 	}
686 	return B_ENTRY_NOT_FOUND;
687 }
688 
689 
690 /*!
691 	Prepares the stack to contain all nodes that were passed while
692 	following the key, from the root node to the leaf node that could
693 	or should contain that key.
694 */
695 status_t
696 BPlusTree::_SeekDown(Stack<node_and_key> &stack, const uint8 *key,
697 	uint16 keyLength)
698 {
699 	// set the root node to begin with
700 	node_and_key nodeAndKey;
701 	nodeAndKey.nodeOffset = fHeader->RootNode();
702 
703 	CachedNode cached(this);
704 	const bplustree_node *node;
705 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
706 		// if we are already on leaf level, we're done
707 		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed by the calling functions anyway)!
710 			nodeAndKey.keyIndex = 0;
711 			stack.Push(nodeAndKey);
712 			return B_OK;
713 		}
714 
715 		off_t nextOffset;
716 		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
717 			&nextOffset);
718 
719 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
720 			RETURN_ERROR(B_ERROR);
721 
722 		// put the node offset & the correct keyIndex on the stack
723 		stack.Push(nodeAndKey);
724 
725 		nodeAndKey.nodeOffset = nextOffset;
726 	}
727 
728 	FATAL(("BPlusTree::SeekDown() could not open node %Ld\n",
729 		nodeAndKey.nodeOffset));
730 	return B_ERROR;
731 }
732 
733 
734 /*!
735 	This will find a free duplicate fragment in the given bplustree_node.
736 	The CachedNode will be set to the writable fragment on success.
737 */
738 status_t
739 BPlusTree::_FindFreeDuplicateFragment(Transaction &transaction,
740 	const bplustree_node *node, CachedNode &cached,
741 	off_t *_offset, bplustree_node **_fragment, uint32 *_index)
742 {
743 	off_t *values = node->Values();
744 	for (int32 i = 0; i < node->NumKeys(); i++) {
745 		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
746 
747 		// does the value link to a duplicate fragment?
748 		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
749 			continue;
750 
751 		const bplustree_node *fragment = cached.SetTo(
752 			bplustree_node::FragmentOffset(value), false);
753 		if (fragment == NULL) {
754 			FATAL(("Could not get duplicate fragment at %Ld\n", value));
755 			continue;
756 		}
757 
758 		// see if there is some space left for us
759 		uint32 num = bplustree_node::MaxFragments(fNodeSize);
760 		for (uint32 j = 0; j < num; j++) {
761 			duplicate_array *array = fragment->FragmentAt(j);
762 
763 			if (array->count == 0) {
764 				// found an unused fragment
765 				*_fragment = cached.MakeWritable(transaction);
766 				if (*_fragment == NULL)
767 					return B_IO_ERROR;
768 
769 				*_offset = bplustree_node::FragmentOffset(value);
770 				*_index = j;
771 				return B_OK;
772 			}
773 		}
774 	}
775 	return B_ENTRY_NOT_FOUND;
776 }
777 
778 
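/*!
	Inserts a \a value for a key at \a index that already exists in \a node.
	Small sets of duplicates are kept in duplicate fragments (several arrays
	of up to NUM_FRAGMENT_VALUES values sharing a single node); once such an
	array overflows, its values are moved to a chain of dedicated duplicate
	nodes.
*/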
779 status_t
780 BPlusTree::_InsertDuplicate(Transaction &transaction, CachedNode &cached,
781 	const bplustree_node *node, uint16 index, off_t value)
782 {
783 	CachedNode cachedDuplicate(this);
784 	off_t *values = node->Values();
785 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
786 	status_t status;
787 	off_t offset;
788 
789 	if (bplustree_node::IsDuplicate(oldValue)) {
790 		// If it's a duplicate fragment, try to insert it into that, or if it
791 		// doesn't fit anymore, create a new duplicate node
792 
793 		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
794 			bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
795 				bplustree_node::FragmentOffset(oldValue), false);
796 			if (duplicate == NULL)
797 				return B_IO_ERROR;
798 
799 			duplicate_array *array = duplicate->FragmentAt(
800 				bplustree_node::FragmentIndex(oldValue));
801 			if (array->count > NUM_FRAGMENT_VALUES
802 				|| array->count < 1) {
803 				FATAL(("insertDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n",
804 					(int)bplustree_node::FragmentIndex(oldValue),
805 					bplustree_node::FragmentOffset(oldValue),
806 					array->count));
807 				return B_BAD_DATA;
808 			}
809 
810 			if (array->count < NUM_FRAGMENT_VALUES) {
811 				array->Insert(value);
812 			} else {
813 				// test if the fragment will be empty if we remove this key's values
814 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
815 					// the node will be empty without our values, so let us
816 					// reuse it as a duplicate node
817 					offset = bplustree_node::FragmentOffset(oldValue);
818 
819 					memmove(duplicate->DuplicateArray(), array,
820 						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
821 					duplicate->left_link = duplicate->right_link
822 						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
823 
824 					array = duplicate->DuplicateArray();
825 					array->Insert(value);
826 				} else {
827 					// create a new duplicate node
828 
829 					cachedDuplicate.UnsetUnchanged(transaction);
830 						// the old duplicate has not been touched, so we can reuse it
831 
832 					bplustree_node *newDuplicate;
833 					status = cachedDuplicate.Allocate(transaction,
834 						&newDuplicate, &offset);
835 					if (status < B_OK)
836 						RETURN_ERROR(status);
837 
838 					// copy the array from the fragment node to the duplicate node
					// and free the old entry (by zeroing all its values)
840 					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
841 						array->count);
842 					memcpy(&newDuplicate->all_key_count, &array->values[0],
843 						array->count * sizeof(off_t));
844 					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
845 
846 					array = newDuplicate->DuplicateArray();
847 					array->Insert(value);
848 				}
849 
850 				// update the main pointer to link to a duplicate node
851 				if (cached.MakeWritable(transaction) == NULL)
852 					return B_IO_ERROR;
853 
854 				values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
855 					BPLUSTREE_DUPLICATE_NODE, offset));
856 			}
857 
858 			return B_OK;
859 		}
860 
861 		// Put the value into a dedicated duplicate node
862 
863 		// search for free space in the duplicate nodes of that key
864 		duplicate_array *array;
865 		const bplustree_node *duplicate;
866 		off_t duplicateOffset;
867 		do {
868 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
869 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
870 			if (duplicate == NULL)
871 				return B_IO_ERROR;
872 
873 			array = duplicate->DuplicateArray();
874 			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("insertDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
876 					duplicateOffset, array->count));
877 				return B_BAD_DATA;
878 			}
879 		} while (array->count >= NUM_DUPLICATE_VALUES
880 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
881 
882 		bplustree_node *writableDuplicate = cachedDuplicate.MakeWritable(transaction);
883 		if (writableDuplicate == NULL)
884 			return B_IO_ERROR;
885 
886 		if (array->count < NUM_DUPLICATE_VALUES) {
887 			array = writableDuplicate->DuplicateArray();
888 			array->Insert(value);
889 		} else {
890 			// no space left - add a new duplicate node
891 
892 			bplustree_node *newDuplicate;
893 			status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
894 			if (status < B_OK)
895 				RETURN_ERROR(status);
896 
897 			// link the two nodes together
898 			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
899 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
900 
901 			array = newDuplicate->DuplicateArray();
902 			array->count = 0;
903 			array->Insert(value);
904 		}
905 		return B_OK;
906 	}
907 
908 	// Search for a free duplicate fragment or create a new one
909 	// to insert the duplicate value into
910 
911 	uint32 fragmentIndex = 0;
912 	bplustree_node *fragment;
913 	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
914 			&offset, &fragment, &fragmentIndex) != B_OK) {
915 		// allocate a new duplicate fragment node
916 		if ((status = cachedDuplicate.Allocate(transaction, &fragment,
917 				&offset)) < B_OK)
918 			RETURN_ERROR(status);
919 
920 		memset(fragment, 0, fNodeSize);
921 	}
922 
923 	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
924 	array->Insert(oldValue);
925 	array->Insert(value);
926 
927 	if (cached.MakeWritable(transaction) == NULL)
928 		return B_IO_ERROR;
929 
930 	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
931 		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
932 
933 	return B_OK;
934 }
935 
936 
937 void
938 BPlusTree::_InsertKey(bplustree_node *node, uint16 index, uint8 *key,
939 	uint16 keyLength, off_t value)
940 {
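	// In a node, the keys are stored packed right after the node header,
	// followed by the key length index (which holds the cumulative end
	// offset of each key) and the value array - that's why inserting a key
	// also shifts all following key length entries by "keyLength".
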
941 	// should never happen, but who knows?
942 	if (index > node->NumKeys())
943 		return;
944 
945 	off_t *values = node->Values();
946 	uint16 *keyLengths = node->KeyLengths();
947 	uint8 *keys = node->Keys();
948 
949 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
950 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
951 		+ keyLength);
952 
953 	off_t *newValues = node->Values();
954 	uint16 *newKeyLengths = node->KeyLengths();
955 
956 	// move values and copy new value into them
957 	memmove(newValues + index + 1, values + index,
958 		sizeof(off_t) * (node->NumKeys() - 1 - index));
959 	memmove(newValues, values, sizeof(off_t) * index);
960 
961 	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);
962 
963 	// move and update key length index
964 	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
965 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
966 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
967 	}
968 	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
969 
970 	int32 keyStart;
971 	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
972 		+ (keyStart = index > 0
973 			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));
974 
975 	// move keys and copy new key into them
976 	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
977 	int32 size = node->AllKeyLength() - length;
978 	if (size > 0)
979 		memmove(keys + length, keys + length - keyLength, size);
980 
981 	memcpy(keys + keyStart, key, keyLength);
982 }
983 
984 
985 /*!
986 	Splits the \a node into two halves - the other half will be put into \a other.
987 	It also takes care to create a new overflow link if the node to split is an
988 	index node.
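	The new node \a other becomes the left sibling of \a node; the key that
	has to be inserted into the parent node (together with its length and
	the offset of \a other) is passed back via \a key, \a _keyLength, and
	\a _value.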
989 */
990 status_t
991 BPlusTree::_SplitNode(bplustree_node *node, off_t nodeOffset,
992 	bplustree_node *other, off_t otherOffset, uint16 *_keyIndex, uint8 *key,
993 	uint16 *_keyLength, off_t *_value)
994 {
995 	if (*_keyIndex > node->NumKeys() + 1)
996 		return B_BAD_VALUE;
997 
998 	uint16 *inKeyLengths = node->KeyLengths();
999 	off_t *inKeyValues = node->Values();
1000 	uint8 *inKeys = node->Keys();
1001 	uint8 *outKeys = other->Keys();
1002 	int32 keyIndex = *_keyIndex;	// can become less than zero!
1003 
	// how many keys will fit in one (half) page?
	// the following loop will find the answer to this question and
	// adjust the key length index entries for their new home

	// "bytes" is the number of bytes written for the new key,
	// "bytesBefore" is the number of bytes before that key,
	// "bytesAfter" is the number of bytes after the new key, if any
1011 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1012 
1013 	size_t size = fNodeSize >> 1;
1014 	int32 out, in;
1015 	for (in = out = 0; in < node->NumKeys() + 1;) {
1016 		if (!bytes) {
1017 			bytesBefore = in > 0
1018 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1019 		}
1020 
1021 		if (in == keyIndex && !bytes) {
1022 			bytes = *_keyLength;
1023 		} else {
1024 			if (keyIndex < out) {
1025 				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1026 					- bytesBefore;
1027 			}
1028 
1029 			in++;
1030 		}
1031 		out++;
1032 
1033 		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
1034 				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1035 			// we have found the number of keys in the new node!
1036 			break;
1037 		}
1038 	}
1039 
1040 	// if the new key was not inserted, set the length of the keys
1041 	// that can be copied directly
1042 	if (keyIndex >= out && in > 0)
1043 		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1044 
1045 	if (bytesBefore < 0 || bytesAfter < 0)
1046 		return B_BAD_DATA;
1047 
1048 	other->left_link = node->left_link;
1049 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1050 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1051 		+ bytesAfter);
1052 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1053 
1054 	uint16 *outKeyLengths = other->KeyLengths();
1055 	off_t *outKeyValues = other->Values();
1056 	int32 keys = out > keyIndex ? keyIndex : out;
1057 
1058 	if (bytesBefore) {
1059 		// copy the keys
1060 		memcpy(outKeys, inKeys, bytesBefore);
1061 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1062 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1063 	}
1064 	if (bytes) {
1065 		// copy the newly inserted key
1066 		memcpy(outKeys + bytesBefore, key, bytes);
1067 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1068 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1069 
1070 		if (bytesAfter) {
1071 			// copy the keys after the new key
1072 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1073 				bytesAfter);
1074 			keys = out - keyIndex - 1;
1075 			for (int32 i = 0;i < keys;i++) {
1076 				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1077 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) + bytes);
1078 			}
1079 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1080 				keys * sizeof(off_t));
1081 		}
1082 	}
1083 
1084 	// if the new key was already inserted, we shouldn't use it again
1085 	if (in != out)
1086 		keyIndex--;
1087 
1088 	int32 total = bytesBefore + bytesAfter;
1089 
1090 	// these variables are for the key that will be returned
1091 	// to the parent node
1092 	uint8 *newKey = NULL;
1093 	uint16 newLength;
1094 	bool newAllocated = false;
1095 
1096 	// If we have split an index node, we have to drop the first key
1097 	// of the next node (which can also be the new key to insert).
1098 	// The dropped key is also the one which has to be inserted in
1099 	// the parent node, so we will set the "newKey" already here.
1100 	if (node->OverflowLink() != BPLUSTREE_NULL) {
1101 		if (in == keyIndex) {
1102 			newKey = key;
1103 			newLength = *_keyLength;
1104 
1105 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1106 			keyIndex--;
1107 		} else {
			// If a key is dropped (i.e. it is not the new key), we have to
			// copy it, because it would otherwise be lost.
1110 			uint8 *droppedKey = node->KeyAt(in, &newLength);
1111 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1112 					> (uint8 *)node + fNodeSize
1113 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1114 				fStream->GetVolume()->Panic();
1115 				RETURN_ERROR(B_BAD_DATA);
1116 			}
1117 			newKey = (uint8 *)malloc(newLength);
1118 			if (newKey == NULL)
1119 				return B_NO_MEMORY;
1120 
1121 			newAllocated = true;
1122 			memcpy(newKey, droppedKey, newLength);
1123 
1124 			other->overflow_link = inKeyValues[in];
1125 			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1126 		}
1127 	}
1128 
1129 	// and now the same game for the other page and the rest of the keys
1130 	// (but with memmove() instead of memcpy(), because they may overlap)
1131 
1132 	bytesBefore = bytesAfter = bytes = 0;
1133 	out = 0;
1134 	int32 skip = in;
1135 	while (in < node->NumKeys() + 1) {
1136 		if (in == keyIndex && !bytes) {
1137 			// it's enough to set bytesBefore once here, because we do
1138 			// not need to know the exact length of all keys in this
1139 			// loop
1140 			bytesBefore = in > skip
1141 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1142 			bytes = *_keyLength;
1143 		} else {
1144 			if (in < node->NumKeys()) {
1145 				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1146 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1147 
1148 				if (bytes) {
1149 					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1150 						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1151 
1152 					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1153 						- bytesBefore - bytes;
1154 				}
1155 			}
1156 			in++;
1157 		}
1158 
1159 		out++;
1160 
1161 		// break out when all keys are done
1162 		if (in > node->NumKeys() && keyIndex < in)
1163 			break;
1164 	}
1165 
1166 	// adjust the byte counts (since we were a bit lazy in the loop)
1167 	if (keyIndex >= in && keyIndex - skip < out) {
1168 		bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1169 			- bytesBefore - total;
1170 	} else if (keyIndex < skip)
1171 		bytesBefore = node->AllKeyLength() - total;
1172 
1173 	if (bytesBefore < 0 || bytesAfter < 0) {
1174 		if (newAllocated)
1175 			free(newKey);
1176 		return B_BAD_DATA;
1177 	}
1178 
1179 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1180 		// right link, and overflow link can stay the same
1181 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1182 		+ bytesAfter);
1183 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out - 1);
1184 
1185 	// array positions have changed
1186 	outKeyLengths = node->KeyLengths();
1187 	outKeyValues = node->Values();
1188 
1189 	// move the keys in the old node: the order is important here,
1190 	// because we don't want to overwrite any contents
1191 
1192 	keys = keyIndex <= skip ? out : keyIndex - skip;
1193 	keyIndex -= skip;
1194 
1195 	if (bytesBefore)
1196 		memmove(inKeys, inKeys + total, bytesBefore);
1197 	if (bytesAfter) {
1198 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1199 			bytesAfter);
1200 	}
1201 
1202 	if (bytesBefore)
1203 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1204 	in = out - keyIndex - 1;
1205 	if (bytesAfter) {
1206 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1207 			in * sizeof(uint16));
1208 	}
1209 
1210 	if (bytesBefore)
1211 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1212 	if (bytesAfter) {
1213 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1214 			in * sizeof(off_t));
1215 	}
1216 
1217 	if (bytes) {
1218 		// finally, copy the newly inserted key (don't overwrite anything)
1219 		memcpy(inKeys + bytesBefore, key, bytes);
1220 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1221 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1222 	}
1223 
1224 	// Prepare the key that will be inserted in the parent node which
	// is either the dropped key or the last key of the other node.
1226 	// If it's the dropped key, "newKey" was already set earlier.
1227 
1228 	if (newKey == NULL)
1229 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1230 
1231 	memcpy(key, newKey, newLength);
1232 	*_keyLength = newLength;
1233 	*_value = otherOffset;
1234 
1235 	if (newAllocated)
1236 		free(newKey);
1237 
1238 	return B_OK;
1239 }
1240 
1241 
1242 /*!
1243 	This inserts a key into the tree. The changes made to the tree will
1244 	all be part of the \a transaction.
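	If the key already exists and the tree does not allow duplicate keys,
	B_NAME_IN_USE is returned.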
1245 */
1246 status_t
1247 BPlusTree::Insert(Transaction &transaction, const uint8 *key, uint16 keyLength,
1248 	off_t value)
1249 {
1250 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1251 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1252 		RETURN_ERROR(B_BAD_VALUE);
1253 #ifdef DEBUG
1254 	if (value < 0)
1255 		panic("tried to insert invalid value %Ld!\n", value);
1256 #endif
1257 
1258 	// lock access to stream
1259 	WriteLocked locked(fStream->Lock());
1260 
1261 	Stack<node_and_key> stack;
1262 	if (_SeekDown(stack, key, keyLength) != B_OK)
1263 		RETURN_ERROR(B_ERROR);
1264 
1265 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
1266 
1267 	memcpy(keyBuffer, key, keyLength);
1268 	keyBuffer[keyLength] = 0;
1269 
1270 	node_and_key nodeAndKey;
1271 	const bplustree_node *node;
1272 
1273 	CachedNode cached(this);
1274 	while (stack.Pop(&nodeAndKey)
1275 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1276 #ifdef DEBUG
1277 		NodeChecker checker(node, fNodeSize, "insert");
1278 #endif
1279 		if (node->IsLeaf()) {
1280 			// first round, check for duplicate entries
1281 			status_t status = _FindKey(node, key, keyLength,
1282 				&nodeAndKey.keyIndex);
1283 
1284 			// is this a duplicate entry?
1285 			if (status == B_OK) {
1286 				if (fAllowDuplicates) {
1287 					status = _InsertDuplicate(transaction, cached, node,
1288 						nodeAndKey.keyIndex, value);
1289 					if (status != B_OK)
1290 						RETURN_ERROR(status);
1291 					return B_OK;
1292 				}
1293 
1294 				RETURN_ERROR(B_NAME_IN_USE);
1295 			}
1296 		}
1297 
1298 		bplustree_node *writableNode = cached.MakeWritable(transaction);
1299 		if (writableNode == NULL)
1300 			return B_IO_ERROR;
1301 
1302 		// is the node big enough to hold the pair?
1303 		if (int32(round_up(sizeof(bplustree_node)
1304 				+ writableNode->AllKeyLength() + keyLength)
1305 				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
1306 				+ sizeof(off_t))) < fNodeSize) {
1307 			_InsertKey(writableNode, nodeAndKey.keyIndex,
1308 				keyBuffer, keyLength, value);
1309 			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1310 				nodeAndKey.keyIndex, 0, 1);
1311 
1312 			return B_OK;
1313 		} else {
1314 			CachedNode cachedNewRoot(this);
1315 			CachedNode cachedOther(this);
1316 
1317 			// do we need to allocate a new root node? if so, then do
1318 			// it now
1319 			off_t newRoot = BPLUSTREE_NULL;
1320 			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
1321 				bplustree_node *root;
1322 				status_t status = cachedNewRoot.Allocate(transaction, &root,
1323 					&newRoot);
1324 				if (status < B_OK) {
1325 					// The tree is most likely corrupted!
1326 					// But it's still sane at leaf level - we could set
1327 					// a flag in the header that forces the tree to be
					// rebuilt next time...
1329 					// But since we will have journaling, that's not a big
1330 					// problem anyway.
1331 					RETURN_ERROR(status);
1332 				}
1333 			}
1334 
1335 			// reserve space for the other node
1336 			bplustree_node *other;
1337 			off_t otherOffset;
1338 			status_t status = cachedOther.Allocate(transaction, &other,
1339 				&otherOffset);
1340 			if (status < B_OK) {
1341 				cachedNewRoot.Free(transaction, newRoot);
1342 				RETURN_ERROR(status);
1343 			}
1344 
1345 			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1346 					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1347 					&value) < B_OK) {
1348 				// free root node & other node here
1349 				cachedOther.Free(transaction, otherOffset);
1350 				cachedNewRoot.Free(transaction, newRoot);
1351 
1352 				RETURN_ERROR(B_ERROR);
1353 			}
1354 #ifdef DEBUG
1355 			checker.Check("insert split");
1356 			NodeChecker otherChecker(other, fNodeSize, "insert split other");
1357 #endif
1358 
1359 			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1360 				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1361 
			// update the right link of the node to the left of the new node
1363 			if ((other = cachedOther.SetToWritable(transaction,
1364 					other->LeftLink())) != NULL) {
1365 				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1366 			}
1367 
1368 			// create a new root if necessary
1369 			if (newRoot != BPLUSTREE_NULL) {
1370 				bplustree_node *root = cachedNewRoot.Node();
1371 
1372 				_InsertKey(root, 0, keyBuffer, keyLength,
1373 					writableNode->LeftLink());
1374 				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1375 					nodeAndKey.nodeOffset);
1376 
1377 				bplustree_header *header
1378 					= fCachedHeader.MakeWritableHeader(transaction);
1379 				if (header == NULL)
1380 					return B_IO_ERROR;
1381 
1382 				// finally, update header to point to the new root
1383 				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1384 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1385 					header->MaxNumberOfLevels() + 1);
1386 
1387 				return B_OK;
1388 			}
1389 		}
1390 	}
1391 	RETURN_ERROR(B_ERROR);
1392 }
1393 
1394 
1395 /*!
1396 	Removes the duplicate index/value pair from the tree.
1397 	It's part of the private tree interface.
1398 */
1399 status_t
1400 BPlusTree::_RemoveDuplicate(Transaction &transaction,
1401 	const bplustree_node *node, CachedNode &cached, uint16 index,
1402 	off_t value)
1403 {
1404 	off_t *values = node->Values();
1405 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1406 
1407 	CachedNode cachedDuplicate(this);
1408 	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1409 	bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
1410 		duplicateOffset, false);
1411 	if (duplicate == NULL)
1412 		return B_IO_ERROR;
1413 
1414 	// if it's a duplicate fragment, remove the entry from there
1415 	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1416 		duplicate_array *array = duplicate->FragmentAt(
1417 			bplustree_node::FragmentIndex(oldValue));
1418 
1419 		if (array->count > NUM_FRAGMENT_VALUES
1420 			|| array->count < 1) {
1421 			FATAL(("removeDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n",
1422 				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count));
1423 			return B_BAD_DATA;
1424 		}
1425 		if (!array->Remove(value)) {
1426 			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
1427 				value, duplicateOffset));
1428 			return B_ENTRY_NOT_FOUND;
1429 		}
1430 
1431 		// remove the array from the fragment node if it is empty
1432 		if (array->count == 1) {
1433 			// set the link to the remaining value
1434 			if (cached.MakeWritable(transaction) == NULL)
1435 				return B_IO_ERROR;
1436 
1437 			values[index] = array->values[0];
1438 
1439 			// Remove the whole fragment node, if this was the only array,
1440 			// otherwise free just the array
1441 			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1442 				status_t status = cachedDuplicate.Free(transaction, duplicateOffset);
1443 				if (status < B_OK)
1444 					return status;
1445 			} else
1446 				array->count = 0;
1447 		}
1448 		return B_OK;
1449 	}
1450 
1451 	// Remove value from a duplicate node!
1452 
1453 	duplicate_array *array = NULL;
1454 
1455 	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1456 		FATAL(("invalid duplicate node: first left link points to %Ld!\n",
1457 			duplicate->LeftLink()));
1458 		return B_BAD_DATA;
1459 	}
1460 
1461 	// Search the duplicate nodes until the entry could be found (and removed)
1462 	while (duplicate != NULL) {
1463 		array = duplicate->DuplicateArray();
1464 		if (array->count > NUM_DUPLICATE_VALUES
1465 			|| array->count < 0) {
1466 			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
1467 				duplicateOffset, array->count));
1468 			return B_BAD_DATA;
1469 		}
1470 
1471 		if (array->Remove(value))
1472 			break;
1473 
1474 		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1475 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1476 
1477 		cachedDuplicate.UnsetUnchanged(transaction);
1478 		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false);
1479 	}
1480 	if (duplicate == NULL)
1481 		RETURN_ERROR(B_IO_ERROR);
1482 
1483 	// The entry got removed from the duplicate node, but we might want to free
1484 	// it now in case it's empty
1485 
1486 	while (true) {
1487 		off_t left = duplicate->LeftLink();
1488 		off_t right = duplicate->RightLink();
1489 		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1490 
1491 		if ((isLast && array->count == 1) || array->count == 0) {
			// Free the empty duplicate page, link its siblings together, and
			// update the duplicate link if needed (i.e. when we either remove
			// the last duplicate node or have a new first one)
1495 
1496 			if (left == BPLUSTREE_NULL) {
1497 				// the duplicate link points to us
1498 				if (cached.MakeWritable(transaction) == NULL)
1499 					return B_IO_ERROR;
1500 
1501 				if (array->count == 1) {
1502 					// this is the last node, and there is only one value left;
1503 					// replace the duplicate link with that value, it's no duplicate
1504 					// anymore
1505 					values[index] = array->values[0];
1506 				} else {
1507 					// move the duplicate link to the next node
1508 					values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1509 						BPLUSTREE_DUPLICATE_NODE, right));
1510 				}
1511 			}
1512 
1513 			status_t status;
1514 			if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK)
1515 				return status;
1516 
1517 			if (left != BPLUSTREE_NULL
1518 				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) {
1519 				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1520 
1521 				// If the next node is the last node, we need to free that node
1522 				// and convert the duplicate entry back into a normal entry
1523 				array = duplicate->DuplicateArray();
1524 				if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL
1525 					&& array->count <= NUM_FRAGMENT_VALUES) {
1526 					duplicateOffset = left;
1527 					continue;
1528 				}
1529 			}
1530 			if (right != BPLUSTREE_NULL
1531 				&& (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) {
1532 				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1533 
1534 				// Again, we may need to turn the duplicate entry back into a normal entry
1535 				array = duplicate->DuplicateArray();
1536 				if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL
1537 					&& array->count <= NUM_FRAGMENT_VALUES) {
1538 					duplicateOffset = right;
1539 					continue;
1540 				}
1541 			}
1542 			return B_OK;
1543 		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
1544 			// If the number of entries fits in a duplicate fragment, then
1545 			// either find a free fragment node, or convert this node to a
1546 			// fragment node.
1547 			CachedNode cachedOther(this);
1548 
1549 			bplustree_node *fragment = NULL;
1550 			uint32 fragmentIndex = 0;
1551 			off_t offset;
1552 			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
1553 					&offset, &fragment, &fragmentIndex) == B_OK) {
1554 				// move to other node
1555 				duplicate_array *target = fragment->FragmentAt(fragmentIndex);
1556 				memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1557 
1558 				cachedDuplicate.Free(transaction, duplicateOffset);
1559 				duplicateOffset = offset;
1560 			} else {
1561 				// convert node
1562 				memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1563 				memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
1564 					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1565 			}
1566 
1567 			if (cached.MakeWritable(transaction) == NULL)
1568 				return B_IO_ERROR;
1569 
1570 			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1571 				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
1572 		}
1573 		return B_OK;
1574 	}
1575 }
1576 
1577 
1578 /*!
1579 	Removes the key with the given index from the specified node.
	Since it has to get the key from the node anyway (to obtain its
	pointer), there is no need to pass the key and its length, even though
	the calling method (BPlusTree::Remove()) has this data.
1583 */
1584 void
1585 BPlusTree::_RemoveKey(bplustree_node *node, uint16 index)
1586 {
1587 	// should never happen, but who knows?
1588 	if (index > node->NumKeys() && node->NumKeys() > 0) {
		FATAL(("Asked to remove a key outside the limits: %u\n", index));
1590 		return;
1591 	}
1592 
1593 	off_t *values = node->Values();
1594 
1595 	// if we would have to drop the overflow link, drop
1596 	// the last key instead and update the overflow link
1597 	// to the value of that one
1598 	if (!node->IsLeaf() && index == node->NumKeys())
1599 		node->overflow_link = values[--index];
1600 
1601 	uint16 length;
1602 	uint8 *key = node->KeyAt(index, &length);
1603 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
1604 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		FATAL(("Key length too long: %s, %u (inode at %d,%u)\n", key, length,
1606 			(int)fStream->BlockRun().allocation_group, fStream->BlockRun().start));
1607 		fStream->GetVolume()->Panic();
1608 		return;
1609 	}
1610 
1611 	uint16 *keyLengths = node->KeyLengths();
1612 	uint8 *keys = node->Keys();
1613 
1614 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		- length);
1616 
1617 	off_t *newValues = node->Values();
1618 	uint16 *newKeyLengths = node->KeyLengths();
1619 
1620 	// move key data
1621 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
1622 
1623 	// move and update key lengths
1624 	if (index > 0 && newKeyLengths != keyLengths)
1625 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
1626 	for (uint16 i = index; i < node->NumKeys(); i++) {
1627 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1628 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
1629 	}
1630 
1631 	// move values
1632 	if (index > 0)
1633 		memmove(newValues, values, index * sizeof(off_t));
1634 	if (node->NumKeys() > index)
1635 		memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t));
1636 }
1637 
1638 
1639 /*!
1640 	Removes the specified key from the tree. The "value" parameter is only used
1641 	for trees which allow duplicates, so you may safely ignore it.
1642 	It's not an optional parameter, so at least you have to think about it.
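	Returns B_ENTRY_NOT_FOUND if the key does not exist in the tree.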
1643 */
1644 status_t
1645 BPlusTree::Remove(Transaction &transaction, const uint8 *key, uint16 keyLength,
1646 	off_t value)
1647 {
1648 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1649 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1650 		RETURN_ERROR(B_BAD_VALUE);
1651 
1652 	// lock access to stream
1653 	WriteLocked locked(fStream->Lock());
1654 
1655 	Stack<node_and_key> stack;
1656 	if (_SeekDown(stack, key, keyLength) != B_OK)
1657 		RETURN_ERROR(B_ERROR);
1658 
1659 	node_and_key nodeAndKey;
1660 	const bplustree_node *node;
1661 
1662 	CachedNode cached(this);
1663 	while (stack.Pop(&nodeAndKey)
1664 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1665 #ifdef DEBUG
1666 		NodeChecker checker(node, fNodeSize, "remove");
1667 #endif
1668 		if (node->IsLeaf()) {
1669 			// first round, check for duplicate entries
1670 			status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex);
1671 			if (status < B_OK)
1672 				RETURN_ERROR(status);
1673 
			// If we remove the last key, the iterator will be set to
			// the node after the current one - if there aren't any
			// more nodes, we need a way to prevent the TreeIterators
			// from touching the old node again; we use BPLUSTREE_FREE
			// for this
			off_t next = node->RightLink() == BPLUSTREE_NULL
				? BPLUSTREE_FREE : node->RightLink();
			_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
				? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, -1);
1682 
1683 			// is this a duplicate entry?
1684 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
1685 					node->Values()[nodeAndKey.keyIndex]))) {
1686 				if (fAllowDuplicates) {
1687 					return _RemoveDuplicate(transaction, node, cached,
1688 						nodeAndKey.keyIndex, value);
1689 				} else {
					FATAL(("duplicate node found where no duplicates are allowed!\n"));
1691 					RETURN_ERROR(B_ERROR);
1692 				}
1693 			}
1694 		}
1695 
1696 		bplustree_node *writableNode = cached.MakeWritable(transaction);
1697 		if (writableNode == NULL)
1698 			return B_IO_ERROR;
1699 
1700 		// if it's an empty root node, we have to convert it
1701 		// to a leaf node by dropping the overflow link, or,
1702 		// if it's already a leaf node, just empty it
1703 		if (nodeAndKey.nodeOffset == fHeader->RootNode()
			&& (node->NumKeys() == 0
				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
1705 			writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1706 			writableNode->all_key_count = 0;
1707 			writableNode->all_key_length = 0;
1708 
1709 			// if we've made a leaf node out of the root node, we need
1710 			// to reset the maximum number of levels in the header
1711 			if (fHeader->MaxNumberOfLevels() != 1) {
1712 				bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction);
1713 				if (header == NULL)
1714 					return B_IO_ERROR;
1715 
1716 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
1717 			}
1718 			return B_OK;
1719 		}
1720 
1721 		// if there is only one key left, we don't have to remove
1722 		// it, we can just dump the node (index nodes still have
1723 		// the overflow link, so we have to drop the last key)
1724 		if (writableNode->NumKeys() > 1
			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
1726 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
1727 			return B_OK;
1728 		}
1729 
		// when we get here, we can just free the node - we only
		// have to update the right/left links of its
		// siblings first
1733 		CachedNode otherCached(this);
1734 		bplustree_node *other = otherCached.SetToWritable(transaction,
1735 			writableNode->LeftLink());
1736 		if (other != NULL)
1737 			other->right_link = writableNode->right_link;
1738 
1739 		if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL)
1740 			other->left_link = writableNode->left_link;
1741 
1742 		cached.Free(transaction, nodeAndKey.nodeOffset);
1743 	}
1744 	RETURN_ERROR(B_ERROR);
1745 }
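
/*
	Editor's illustration - not part of the original source. A minimal sketch
	of a Remove() call; "tree", "transaction", and "id" (the value the name
	was inserted with, typically the inode's block number) are assumed to
	exist in the calling code.

		const char *name = "file.txt";
		status_t status = tree->Remove(transaction, (const uint8 *)name,
			(uint16)strlen(name), id);
		if (status == B_ENTRY_NOT_FOUND)
			; // the key was not found in the tree
*/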
1746 
1747 
1748 /*!
1749 	Replaces the value for the key in the tree.
1750 	Returns B_OK if the key could be found and its value replaced,
1751 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
1752 	to indicate that something went terribly wrong.
1753 	Note that this doesn't work with duplicates - it will just
1754 	return B_BAD_TYPE if you call this function on a tree where
1755 	duplicates are allowed.
1756 */
1757 status_t
1758 BPlusTree::Replace(Transaction &transaction, const uint8 *key,
1759 	uint16 keyLength, off_t value)
1760 {
1761 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1762 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1763 		|| key == NULL)
1764 		RETURN_ERROR(B_BAD_VALUE);
1765 
1766 	if (fAllowDuplicates)
1767 		RETURN_ERROR(B_BAD_TYPE);
1768 
1769 	// lock access to stream (a read lock is okay for this purpose)
1770 	ReadLocked locked(fStream->Lock());
1771 
1772 	off_t nodeOffset = fHeader->RootNode();
1773 	CachedNode cached(this);
1774 	const bplustree_node *node;
1775 
1776 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1777 		uint16 keyIndex = 0;
1778 		off_t nextOffset;
1779 		status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1780 
1781 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1782 			if (status == B_OK) {
1783 				bplustree_node *writableNode = cached.MakeWritable(transaction);
1784 				if (writableNode != NULL)
1785 					writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value);
1786 				else
1787 					status = B_IO_ERROR;
1788 			}
1789 
1790 			return status;
1791 		} else if (nextOffset == nodeOffset)
1792 			RETURN_ERROR(B_ERROR);
1793 
1794 		nodeOffset = nextOffset;
1795 	}
1796 	RETURN_ERROR(B_ERROR);
1797 }
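
/*
	Editor's illustration - not part of the original source. A sketch of
	updating the value stored for an existing key via Replace(); "tree",
	"transaction", and "newBlockNumber" are assumed to exist. As noted above,
	this must not be used on trees that allow duplicates (B_BAD_TYPE).

		const char *name = "file.txt";
		status_t status = tree->Replace(transaction, (const uint8 *)name,
			(uint16)strlen(name), newBlockNumber);
		if (status == B_ENTRY_NOT_FOUND)
			; // the key is not in the tree, nothing was changed
*/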
1798 
1799 
1800 /*!
1801 	Searches the key in the tree, and stores the offset found in
1802 	_value, if successful.
1803 	It's very similar to BPlusTree::SeekDown(), but doesn't fill
1804 	a stack while it descends the tree.
1805 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND
1806 	if not. It can also return other errors to indicate that
1807 	something went wrong.
1808 	Note that this doesn't work with duplicates - it will just
1809 	return B_BAD_TYPE if you call this function on a tree where
1810 	duplicates are allowed.
1811 */
1812 status_t
1813 BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value)
1814 {
1815 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1816 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1817 		|| key == NULL)
1818 		RETURN_ERROR(B_BAD_VALUE);
1819 
1820 	if (fAllowDuplicates)
1821 		RETURN_ERROR(B_BAD_TYPE);
1822 
1823 	// lock access to stream
1824 	ReadLocked locked(fStream->Lock());
1825 
1826 	off_t nodeOffset = fHeader->RootNode();
1827 	CachedNode cached(this);
1828 	const bplustree_node *node;
1829 
1830 #ifdef DEBUG
1831 	int32 levels = 0;
1832 #endif
1833 
1834 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1835 		uint16 keyIndex = 0;
1836 		off_t nextOffset;
1837 		status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1838 
1839 #ifdef DEBUG
1840 		levels++;
1841 #endif
1842 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1843 			if (status == B_OK && _value != NULL)
1844 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
1845 
1846 #ifdef DEBUG
1847 			if (levels != (int32)fHeader->MaxNumberOfLevels())
1848 				DEBUGGER(("levels don't match"));
1849 #endif
1850 			return status;
1851 		} else if (nextOffset == nodeOffset)
1852 			RETURN_ERROR(B_ERROR);
1853 
1854 		nodeOffset = nextOffset;
1855 	}
1856 	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
1857 	RETURN_ERROR(B_ERROR);
1858 }
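
/*
	Editor's illustration - not part of the original source. A sketch of a
	simple lookup with Find(); "tree" is assumed to be a BPlusTree without
	duplicates (for example, the name tree of a directory).

		const char *name = "file.txt";
		off_t value;
		if (tree->Find((const uint8 *)name, (uint16)strlen(name), &value) == B_OK)
			; // "value" now holds the offset stored for that key
*/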
1859 
1860 
1861 //	#pragma mark -
1862 
1863 
1864 TreeIterator::TreeIterator(BPlusTree *tree)
1865 	:
1866 	fTree(tree),
1867 	fCurrentNodeOffset(BPLUSTREE_NULL),
1868 	fNext(NULL)
1869 {
1870 	tree->_AddIterator(this);
1871 }
1872 
1873 
1874 TreeIterator::~TreeIterator()
1875 {
1876 	if (fTree)
1877 		fTree->_RemoveIterator(this);
1878 }
1879 
1880 
1881 status_t
1882 TreeIterator::Goto(int8 to)
1883 {
1884 	if (fTree == NULL || fTree->fHeader == NULL)
1885 		RETURN_ERROR(B_BAD_VALUE);
1886 
1887 	// lock access to stream
1888 	ReadLocked locked(fTree->fStream->Lock());
1889 
1890 	off_t nodeOffset = fTree->fHeader->RootNode();
1891 	CachedNode cached(fTree);
1892 	const bplustree_node *node;
1893 
1894 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1895 		// is the node a leaf node?
1896 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1897 			fCurrentNodeOffset = nodeOffset;
1898 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
1899 			fDuplicateNode = BPLUSTREE_NULL;
1900 
1901 			return B_OK;
1902 		}
1903 
1904 		// get the next node offset depending on the direction (and on
1905 		// whether there are any keys in that node at all)
1906 		off_t nextOffset;
1907 		if (to == BPLUSTREE_END || node->all_key_count == 0)
1908 			nextOffset = node->OverflowLink();
1909 		else {
1910 			if (node->AllKeyLength() > fTree->fNodeSize
1911 				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
1912 					- 8 * node->NumKeys())
1913 				RETURN_ERROR(B_ERROR);
1914 
1915 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
1916 		}
1917 		if (nextOffset == nodeOffset)
1918 			break;
1919 
1920 		nodeOffset = nextOffset;
1921 	}
1922 	FATAL(("%s fails\n", __FUNCTION__));
1923 
1924 	RETURN_ERROR(B_ERROR);
1925 }
1926 
1927 
1928 /*!
1929 	Iterates through the tree in the specified direction.
1930 	When it iterates through duplicates, the "key" is only updated for the
1931 	first entry - if you need to know when this happens, use the "duplicate"
1932 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
1933 	the other duplicates.
1934 	That's not too nice, but it saves the 256 bytes that would be needed to
1935 	store the last key - if this ever becomes an issue, it will be easy
1936 	to change.
1937 	Another advantage of this is that queries can skip all duplicates at
1938 	once when they are not relevant to them.
1939 */
1940 status_t
1941 TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength,
1942 	uint16 maxLength, off_t *value, uint16 *duplicate)
1943 {
1944 	if (fTree == NULL)
1945 		return B_INTERRUPTED;
1946 
1947 	bool forward = direction == BPLUSTREE_FORWARD;
1948 
1949 	if (fCurrentNodeOffset == BPLUSTREE_NULL
1950 		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
1951 		RETURN_ERROR(B_ERROR);
1952 
1953 	// if the tree was emptied since the last call
1954 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
1955 		return B_ENTRY_NOT_FOUND;
1956 
1957 	// lock access to stream
1958 	ReadLocked locked(fTree->fStream->Lock());
1959 
1960 	CachedNode cached(fTree);
1961 	const bplustree_node *node;
1962 
1963 	if (fDuplicateNode != BPLUSTREE_NULL) {
1964 		// Regardless of the traverse direction, duplicates are always presented
1965 		// in the same order; since they are all considered equal, this shouldn't
1966 		// cause any problems
1967 
1968 		if (!fIsFragment || fDuplicate < fNumDuplicates)
1969 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
1970 		else
1971 			node = NULL;
1972 
1973 		if (node != NULL) {
1974 			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
1975 				// if the node is out of duplicates, we go directly to the next one
1976 				fDuplicateNode = node->RightLink();
1977 				if (fDuplicateNode != BPLUSTREE_NULL
1978 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
1979 					fNumDuplicates = node->CountDuplicates(fDuplicateNode, false);
1980 					fDuplicate = 0;
1981 				}
1982 			}
1983 			if (fDuplicate < fNumDuplicates) {
1984 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++);
1985 				if (duplicate)
1986 					*duplicate = 2;
1987 				return B_OK;
1988 			}
1989 		}
1990 		fDuplicateNode = BPLUSTREE_NULL;
1991 	}
1992 
1993 	off_t savedNodeOffset = fCurrentNodeOffset;
1994 	int32 savedKey = fCurrentKey;
1995 
1996 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
1997 		RETURN_ERROR(B_ERROR);
1998 
1999 	if (duplicate)
2000 		*duplicate = 0;
2001 
2002 	fCurrentKey += direction;
2003 
2004 	// is the current key in the current node?
2005 	while ((forward && fCurrentKey >= node->NumKeys())
2006 			|| (!forward && fCurrentKey < 0)) {
2007 		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();
2008 
2009 		// are there any more nodes?
2010 		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
2011 			node = cached.SetTo(fCurrentNodeOffset);
2012 			if (!node)
2013 				RETURN_ERROR(B_ERROR);
2014 
2015 			// reset current key
2016 			fCurrentKey = forward ? 0 : node->NumKeys();
2017 		} else {
2018 			// there are no nodes left, so turn back to the last key
2019 			fCurrentNodeOffset = savedNodeOffset;
2020 			fCurrentKey = savedKey;
2021 
2022 			return B_ENTRY_NOT_FOUND;
2023 		}
2024 	}
2025 
2026 	if (node->all_key_count == 0)
2027 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
2028 
2029 	uint16 length;
2030 	uint8 *keyStart = node->KeyAt(fCurrentKey, &length);
2031 	if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize
2032 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2033 		fTree->fStream->GetVolume()->Panic();
2034 		RETURN_ERROR(B_BAD_DATA);
2035 	}
2036 
2037 	length = min_c(length, maxLength);
2038 	memcpy(key, keyStart, length);
2039 
2040 	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE) {
2041 		// terminate string type
2042 		if (length == maxLength)
2043 			length--;
2044 		((char *)key)[length] = '\0';
2045 	}
2046 	*keyLength = length;
2047 
2048 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
2049 
2050 	// duplicate fragments?
2051 	uint8 type = bplustree_node::LinkType(offset);
2052 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) {
2053 		fDuplicateNode = offset;
2054 
2055 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
2056 		if (node == NULL)
2057 			RETURN_ERROR(B_ERROR);
2058 
2059 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
2060 
2061 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
2062 		if (fNumDuplicates) {
2063 			offset = node->DuplicateAt(offset, fIsFragment, 0);
2064 			fDuplicate = 1;
2065 			if (duplicate)
2066 				*duplicate = 1;
2067 		} else {
2068 			// shouldn't happen, but we're dealing here with potentially corrupt disks...
2069 			fDuplicateNode = BPLUSTREE_NULL;
2070 			offset = 0;
2071 		}
2072 	}
2073 	*value = offset;
2074 
2075 	return B_OK;
2076 }
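
/*
	Editor's illustration - not part of the original source. A sketch of
	walking an entire tree forward with Traverse(), using the "duplicate"
	parameter described above to recognize duplicate runs; "tree" is assumed
	to be an already constructed BPlusTree.

		TreeIterator iterator(tree);
		char key[BPLUSTREE_MAX_KEY_LENGTH + 1];
		uint16 keyLength, duplicate;
		off_t value;
		while (iterator.Traverse(BPLUSTREE_FORWARD, key, &keyLength,
				sizeof(key), &value, &duplicate) == B_OK) {
			// "key" is only updated when "duplicate" is 0 or 1; when it is 2,
			// it still holds the key of the first entry of the duplicate run
		}
*/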
2077 
2078 
2079 /*!
2080 	This is more or less a copy of BPlusTree::Find() - but it just
2081 	sets the current position in the iterator, regardless of whether
2082 	the key could be found or not.
2083 */
2084 status_t
2085 TreeIterator::Find(const uint8 *key, uint16 keyLength)
2086 {
2087 	if (fTree == NULL)
2088 		return B_INTERRUPTED;
2089 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
2090 		|| key == NULL)
2091 		RETURN_ERROR(B_BAD_VALUE);
2092 
2093 	// lock access to stream
2094 	ReadLocked locked(fTree->fStream->Lock());
2095 
2096 	off_t nodeOffset = fTree->fHeader->RootNode();
2097 
2098 	CachedNode cached(fTree);
2099 	const bplustree_node *node;
2100 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2101 		uint16 keyIndex = 0;
2102 		off_t nextOffset;
2103 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2104 			&nextOffset);
2105 
2106 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2107 			fCurrentNodeOffset = nodeOffset;
2108 			fCurrentKey = keyIndex - 1;
2109 			fDuplicateNode = BPLUSTREE_NULL;
2110 
2111 			return status;
2112 		} else if (nextOffset == nodeOffset)
2113 			RETURN_ERROR(B_ERROR);
2114 
2115 		nodeOffset = nextOffset;
2116 	}
2117 	RETURN_ERROR(B_ERROR);
2118 }
2119 
2120 
2121 void
2122 TreeIterator::SkipDuplicates()
2123 {
2124 	fDuplicateNode = BPLUSTREE_NULL;
2125 }
2126 
2127 
2128 void
2129 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change)
2130 {
2131 	if (offset != fCurrentNodeOffset)
2132 		return;
2133 
2134 	if (nextOffset != BPLUSTREE_NULL) {
2135 		fCurrentNodeOffset = nextOffset;
2136 		if (splitAt <= fCurrentKey) {
2137 			fCurrentKey -= splitAt;
2138 			keyIndex -= splitAt;
2139 		}
2140 	}
2141 
2142 	// Adjust fCurrentKey to point to the same key as before.
2143 	// Note that if a key is inserted at the current position,
2144 	// it won't be included in this tree transition.
2145 	if (keyIndex <= fCurrentKey)
2146 		fCurrentKey += change;
2147 
2148 	// ToDo: duplicate handling!
2149 }
2150 
2151 
2152 void
2153 TreeIterator::Stop()
2154 {
2155 	fTree = NULL;
2156 }
2157 
2158 
2159 #ifdef DEBUG
2160 void
2161 TreeIterator::Dump()
2162 {
2163 	__out("TreeIterator at %p:\n", this);
2164 	__out("\tfTree = %p\n", fTree);
2165 	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
2166 	__out("\tfCurrentKey = %ld\n", fCurrentKey);
2167 	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode);
2168 	__out("\tfDuplicate = %u\n", fDuplicate);
2169 	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
2170 	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
2171 }
2172 #endif
2173 
2174 
2175 //	#pragma mark -
2176 
2177 
2178 void
2179 bplustree_node::Initialize()
2180 {
2181 	left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2182 	all_key_count = 0;
2183 	all_key_length = 0;
2184 }
2185 
2186 
2187 uint8 *
2188 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const
2189 {
2190 	if (index < 0 || index > NumKeys())
2191 		return NULL;
2192 
2193 	uint8 *keyStart = Keys();
2194 	uint16 *keyLengths = KeyLengths();
2195 
2196 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2197 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2198 	if (index > 0)
2199 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2200 
2201 	return keyStart;
2202 }
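
/*
	Editor's note - not part of the original source. The key_lengths array
	stores cumulative end offsets into the packed key area rather than the
	individual lengths, which is what the subtraction above implements.
	A worked example for the keys "a", "bc", "def" stored back to back:

		Keys()       = "abcdef"
		KeyLengths() = { 1, 3, 6 }

	so KeyAt(1, &length) returns Keys() + 1 (pointing at "bc...") and sets
	length to 3 - 1 = 2.
*/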
2203 
2204 
2205 uint8
2206 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2207 {
2208 	// the duplicate fragment handling is currently hard-coded to a node size
2209 	// of 1024 bytes - with future versions of BFS, this may be a problem
2210 
2211 	if (isFragment) {
2212 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2213 
2214 		return ((off_t *)this)[fragment];
2215 	}
2216 	return OverflowLink();
2217 }
2218 
2219 
2220 off_t
2221 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2222 {
2223 	uint32 start;
2224 	if (isFragment)
2225 		start = 8 * ((uint64)offset & 0x3ff);
2226 	else
2227 		start = 2;
2228 
2229 	return ((off_t *)this)[start + 1 + index];
2230 }
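
/*
	Editor's note - not part of the original source. As the two functions
	above imply, a duplicate fragment link encodes the fragment index in its
	lowest ten bits (the 0x3ff mask), while FragmentOffset() yields the node
	containing the fragment. Within that node, fragment i occupies eight
	off_t values starting at index 8 * i: a count followed by up to seven
	duplicate values. A full duplicate node instead keeps its count in the
	overflow link (off_t index 2) and its values from index 3 onwards.
*/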
2231 
2232 
2233 /*!	Despite what its name suggests, this function doesn't return the real
2234 	number of used fragments; it only counts up to two: it returns 0 if
2235 	no fragment is used, 1 if exactly one fragment is used, and 2 if at
2236 	least two fragments are used.
2237 */
2238 uint32
2239 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2240 {
2241 	uint32 used = 0;
2242 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2243 		duplicate_array *array = FragmentAt(i);
2244 		if (array->count > 0 && ++used > 1)
2245 			return used;
2246 	}
2247 	return used;
2248 }
2249 
2250 
2251 #ifdef DEBUG
2252 status_t
2253 bplustree_node::CheckIntegrity(uint32 nodeSize) const
2254 {
2255 	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2256 		DEBUGGER(("invalid node: key/length count"));
2257 
2258 	for (int32 i = 0; i < NumKeys(); i++) {
2259 		uint16 length;
2260 		uint8 *key = KeyAt(i, &length);
2261 		if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize
2262 			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
2263 			dprintf("node %p, key %ld\n", this, i);
2264 			DEBUGGER(("invalid node: keys corrupted"));
2265 			return B_BAD_DATA;
2266 		}
2267 		if (Values()[i] == -1) {
2268 			dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]);
2269 			DEBUGGER(("invalid node: values corrupted"));
2270 			return B_BAD_DATA;
2271 		}
2272 	}
2273 	return B_OK;
2274 }
2275 #endif
2276 
2277 
2278 //	#pragma mark -
2279 
2280 
2281 int32
2282 compareKeys(type_code type, const void *key1, int keyLength1,
2283 	const void *key2, int keyLength2)
2284 {
2285 	// if one of the keys is NULL, bail out gracefully
2286 	if (key1 == NULL || key2 == NULL) {
2287 		// even if it's most probably a bug in the calling code, we try to
2288 		// give a meaningful result
2289 		if (key1 == NULL && key2 != NULL)
2290 			return 1;
2291 		else if (key2 == NULL && key1 != NULL)
2292 			return -1;
2293 
2294 		return 0;
2295 	}
2296 
2297 	switch (type) {
2298 		case B_STRING_TYPE:
2299 		{
2300 			int len = min_c(keyLength1, keyLength2);
2301 			int result = strncmp((const char *)key1, (const char *)key2, len);
2302 
2303 			if (result == 0
2304 				&& !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0'))
2305 				result = keyLength1 - keyLength2;
2306 
2307 			return result;
2308 		}
2309 
2310 		case B_SSIZE_T_TYPE:
2311 		case B_INT32_TYPE:
2312 			return *(int32 *)key1 - *(int32 *)key2;
2313 
2314 		case B_SIZE_T_TYPE:
2315 		case B_UINT32_TYPE:
2316 			if (*(uint32 *)key1 == *(uint32 *)key2)
2317 				return 0;
2318 			else if (*(uint32 *)key1 > *(uint32 *)key2)
2319 				return 1;
2320 
2321 			return -1;
2322 
2323 		case B_OFF_T_TYPE:
2324 		case B_INT64_TYPE:
2325 			if (*(int64 *)key1 == *(int64 *)key2)
2326 				return 0;
2327 			else if (*(int64 *)key1 > *(int64 *)key2)
2328 				return 1;
2329 
2330 			return -1;
2331 
2332 		case B_UINT64_TYPE:
2333 			if (*(uint64 *)key1 == *(uint64 *)key2)
2334 				return 0;
2335 			else if (*(uint64 *)key1 > *(uint64 *)key2)
2336 				return 1;
2337 
2338 			return -1;
2339 
2340 		case B_FLOAT_TYPE:
2341 		{
2342 			float result = *(float *)key1 - *(float *)key2;
2343 			if (result == 0.0f)
2344 				return 0;
2345 
2346 			return (result < 0.0f) ? -1 : 1;
2347 		}
2348 
2349 		case B_DOUBLE_TYPE:
2350 		{
2351 			double result = *(double *)key1 - *(double *)key2;
2352 			if (result == 0.0)
2353 				return 0;
2354 
2355 			return (result < 0.0) ? -1 : 1;
2356 		}
2357 	}
2358 
2359 	// if the type is unknown, the entries don't match...
2360 	return -1;
2361 }
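
/*
	Editor's illustration - not part of the original source. compareKeys()
	follows the usual strcmp() convention: the result is negative, zero, or
	positive depending on how the first key sorts relative to the second.

		int32 order = compareKeys(B_STRING_TYPE, "abc", 3, "abd", 3);
		// order < 0, since "abc" sorts before "abd"
*/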
2362 
2363