xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 9eb55bc1d104b8fda80898f8b25c94d8000c8255)
1 /* BPlusTree - BFS B+Tree implementation
2 **
3 ** Initial version by Axel Dörfler, axeld@pinc-software.de
4 ** Roughly based on 'btlib' written by Marcus J. Ranum
5 **
6 ** Copyright (c) 2001-2004 pinc Software. All Rights Reserved.
7 ** This file may be used under the terms of the OpenBeOS License.
8 */
9 
10 
11 #include "Debug.h"
12 #include "BPlusTree.h"
13 #include "Inode.h"
14 #include "Utility.h"
15 #include "Stack.h"
16 
17 #include <kernel_cpp.h>
18 #include <TypeConstants.h>
19 
20 #include <string.h>
21 #include <stdlib.h>
22 #include <stdio.h>
23 
24 
25 #ifdef DEBUG
26 class NodeChecker {
27 	public:
28 		NodeChecker(bplustree_node *node, int32 nodeSize)
29 			:
30 			fNode(node),
31 			fSize(nodeSize)
32 		{
33 		}
34 
35 		~NodeChecker()
36 		{
37 			fNode->CheckIntegrity(fSize);
38 		}
39 
40 	private:
41 		bplustree_node *fNode;
42 		int32	fSize;
43 };
44 #endif
45 
46 // Node Caching for the BPlusTree class
47 //
48 // With write support, there is the need for a function that allocates new
49 // nodes by either returning empty nodes, or by growing the file's data stream
50 //
51 // !! The CachedNode class assumes that you have properly locked the stream
52 // !! before asking for nodes.
53 //
54 // Note: This code will fail if the block size is smaller than the node size!
55 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
56 // is hard-coded to 1024 bytes, that's not an issue now.
57 
58 void
59 CachedNode::Unset()
60 {
61 	if (fTree == NULL || fTree->fStream == NULL)
62 		return;
63 
64 	if (fBlock != NULL) {
65 		release_block(fTree->fStream->GetVolume()->Device(), fBlockNumber);
66 
67 		fBlock = NULL;
68 		fNode = NULL;
69 	}
70 }
71 
72 
73 bplustree_node *
74 CachedNode::SetTo(off_t offset, bool check)
75 {
76 	if (fTree == NULL || fTree->fStream == NULL) {
77 		REPORT_ERROR(B_BAD_VALUE);
78 		return NULL;
79 	}
80 
81 	Unset();
82 
83 	// You can only ask for nodes at valid positions - you can't
84 	// even access the b+tree header with this method (use SetToHeader()
85 	// instead)
86 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
87 		|| offset <= 0
88 		|| (offset % fTree->fNodeSize) != 0)
89 		return NULL;
90 
91 	if (InternalSetTo(offset) != NULL && check) {
92 		// sanity checks (links, all_key_count)
93 		bplustree_header *header = fTree->fHeader;
94 		if (!header->IsValidLink(fNode->LeftLink())
95 			|| !header->IsValidLink(fNode->RightLink())
96 			|| !header->IsValidLink(fNode->OverflowLink())
97 			|| (int8 *)fNode->Values() + fNode->NumKeys() * sizeof(off_t) >
98 					(int8 *)fNode + fTree->fNodeSize) {
99 			FATAL(("invalid node read from offset %Ld, inode at %Ld\n",
100 					offset, fTree->fStream->ID()));
101 			return NULL;
102 		}
103 	}
104 	return fNode;
105 }
106 
107 
108 bplustree_header *
109 CachedNode::SetToHeader()
110 {
111 	if (fTree == NULL || fTree->fStream == NULL) {
112 		REPORT_ERROR(B_BAD_VALUE);
113 		return NULL;
114 	}
115 
116 	Unset();
117 
118 	InternalSetTo(0LL);
119 	return (bplustree_header *)fNode;
120 }
121 
122 
123 bplustree_node *
124 CachedNode::InternalSetTo(off_t offset)
125 {
126 	fNode = NULL;
127 
128 	off_t fileOffset;
129 	block_run run;
130 	if (offset < fTree->fStream->Size()
131 		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
132 		Volume *volume = fTree->fStream->GetVolume();
133 
134 		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
135 		fBlockNumber = volume->ToBlock(run) + blockOffset;
136 
137 		fBlock = (uint8 *)get_block(volume->Device(), fBlockNumber, volume->BlockSize());
138 		if (fBlock) {
139 			// the node is somewhere in that block... (confusing offset calculation)
140 			fNode = (bplustree_node *)(fBlock + offset -
141 						(fileOffset + blockOffset * volume->BlockSize()));
142 		} else
143 			REPORT_ERROR(B_IO_ERROR);
144 	}
145 	return fNode;
146 }
147 
148 
149 status_t
150 CachedNode::Free(Transaction *transaction, off_t offset)
151 {
152 	if (transaction == NULL || fTree == NULL || fTree->fStream == NULL
153 		|| offset == BPLUSTREE_NULL)
154 		RETURN_ERROR(B_BAD_VALUE);
155 
156 	// ToDo: scan the free nodes list and remove all nodes at the end
157 	// of the tree - perhaps that shouldn't be done everytime that
158 	// function is called, perhaps it should be done when the directory
159 	// inode is closed or based on some calculation or whatever...
160 
161 	// if the node is the last one in the tree, we shrink
162 	// the tree and file size by one node
163 	off_t lastOffset = fTree->fHeader->MaximumSize() - fTree->fNodeSize;
164 	if (offset == lastOffset) {
165 		fTree->fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
166 
167 		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
168 		if (status < B_OK)
169 			return status;
170 
171 		return fTree->fCachedHeader.WriteBack(transaction);
172 	}
173 
174 	// add the node to the free nodes list
175 	fNode->left_link = fTree->fHeader->free_node_pointer;
176 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
177 
178 	if (WriteBack(transaction) == B_OK) {
179 		fTree->fHeader->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
180 		return fTree->fCachedHeader.WriteBack(transaction);
181 	}
182 	return B_ERROR;
183 }
184 
185 
186 status_t
187 CachedNode::Allocate(Transaction *transaction, bplustree_node **_node, off_t *_offset)
188 {
189 	if (transaction == NULL || fTree == NULL || fTree->fHeader == NULL
190 		|| fTree->fStream == NULL) {
191 		RETURN_ERROR(B_BAD_VALUE);
192 	}
193 
194 	status_t status;
195 
196 	// if there are any free nodes, recycle them
197 	if (SetTo(fTree->fHeader->FreeNode(), false) != NULL) {
198 		*_offset = fTree->fHeader->FreeNode();
199 
200 		// set new free node pointer
201 		fTree->fHeader->free_node_pointer = fNode->left_link;
202 		if ((status = fTree->fCachedHeader.WriteBack(transaction)) == B_OK) {
203 			fNode->Initialize();
204 			*_node = fNode;
205 			return B_OK;
206 		}
207 		return status;
208 	}
209 	// allocate space for a new node
210 	Inode *stream = fTree->fStream;
211 	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
212 		return status;
213 
214 	// the maximum_size has to be changed before the call to SetTo() - or
215 	// else it will fail because the requested node is out of bounds
216 	off_t offset = fTree->fHeader->MaximumSize();
217 	fTree->fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(fTree->fHeader->MaximumSize() + fTree->fNodeSize);
218 
219 	if (SetTo(offset, false) != NULL) {
220 		*_offset = offset;
221 
222 		if (fTree->fCachedHeader.WriteBack(transaction) >= B_OK) {
223 			fNode->Initialize();
224 			*_node = fNode;
225 			return B_OK;
226 		}
227 	}
228 	RETURN_ERROR(B_ERROR);
229 }
230 
231 
232 status_t
233 CachedNode::WriteBack(Transaction *transaction)
234 {
235 	if (transaction == NULL || fTree == NULL || fTree->fStream == NULL || fNode == NULL)
236 		RETURN_ERROR(B_BAD_VALUE);
237 
238 	return transaction->WriteBlocks(fBlockNumber, fBlock);
239 }
240 
241 
242 //	#pragma mark -
243 
244 
245 BPlusTree::BPlusTree(Transaction *transaction, Inode *stream, int32 nodeSize)
246 	:
247 	fStream(NULL),
248 	fHeader(NULL),
249 	fCachedHeader(this)
250 {
251 	SetTo(transaction, stream);
252 }
253 
254 
255 BPlusTree::BPlusTree(Inode *stream)
256 	:
257 	fStream(NULL),
258 	fHeader(NULL),
259 	fCachedHeader(this)
260 {
261 	SetTo(stream);
262 }
263 
264 
265 BPlusTree::BPlusTree()
266 	:
267 	fStream(NULL),
268 	fHeader(NULL),
269 	fCachedHeader(this),
270 	fNodeSize(BPLUSTREE_NODE_SIZE),
271 	fAllowDuplicates(true),
272 	fStatus(B_NO_INIT)
273 {
274 }
275 
276 
277 BPlusTree::~BPlusTree()
278 {
279 	// if there are any TreeIterators left, we need to stop them
280 	// (can happen when the tree's inode gets deleted while
281 	// traversing the tree - a TreeIterator doesn't lock the inode)
282 	if (fIteratorLock.Lock() < B_OK)
283 		return;
284 
285 	TreeIterator *iterator = NULL;
286 	while ((iterator = fIterators.Next(iterator)) != NULL)
287 		iterator->Stop();
288 
289 	fIteratorLock.Unlock();
290 }
291 
292 
293 /** Create a new B+Tree on the specified stream */
294 
295 status_t
296 BPlusTree::SetTo(Transaction *transaction, Inode *stream, int32 nodeSize)
297 {
298 	// initializes in-memory B+Tree
299 
300 	fCachedHeader.Unset();
301 	fStream = stream;
302 
303 	fHeader = fCachedHeader.SetToHeader();
304 	if (fHeader == NULL) {
305 		// allocate space for new header + node!
306 		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
307 		if (fStatus < B_OK)
308 			RETURN_ERROR(fStatus);
309 
310 		fHeader = fCachedHeader.SetToHeader();
311 		if (fHeader == NULL)
312 			RETURN_ERROR(fStatus = B_ERROR);
313 	}
314 
315 	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
316 						&& stream->BlockRun() != stream->Parent())
317 						|| (stream->Mode() & S_ALLOW_DUPS) != 0;
318 
319 	fNodeSize = nodeSize;
320 
321 	// initialize b+tree header
322  	fHeader->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
323  	fHeader->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
324  	fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
325  	fHeader->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
326  	fHeader->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
327  	fHeader->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
328  	fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
329 
330 	if (fCachedHeader.WriteBack(transaction) < B_OK)
331 		RETURN_ERROR(fStatus = B_ERROR);
332 
333 	// initialize b+tree root node
334 	CachedNode cached(this, fHeader->RootNode(), false);
335 	if (cached.Node() == NULL)
336 		RETURN_ERROR(B_ERROR);
337 
338 	cached.Node()->Initialize();
339 	return fStatus = cached.WriteBack(transaction);
340 }
341 
342 
343 status_t
344 BPlusTree::SetTo(Inode *stream)
345 {
346 	if (stream == NULL || stream->Node() == NULL)
347 		RETURN_ERROR(fStatus = B_BAD_VALUE);
348 
349 	// get on-disk B+Tree header
350 
351 	fCachedHeader.Unset();
352 	fStream = stream;
353 
354 	fHeader = fCachedHeader.SetToHeader();
355 	if (fHeader == NULL)
356 		RETURN_ERROR(fStatus = B_NO_INIT);
357 
358 	// is header valid?
359 
360 	if (fHeader->Magic() != BPLUSTREE_MAGIC
361 		|| fHeader->MaximumSize() != stream->Size()
362 		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
363 		|| !fHeader->IsValidLink(fHeader->RootNode())
364 		|| !fHeader->IsValidLink(fHeader->FreeNode()))
365 		RETURN_ERROR(fStatus = B_BAD_DATA);
366 
367 	fNodeSize = fHeader->NodeSize();
368 
369 	{
370 		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
371 						   S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
372 		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
373 						   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
374 
375 		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
376 			|| (stream->Mode() & S_INDEX_DIR) && toMode[fHeader->DataType()] != mode
377 			|| !stream->IsContainer()) {
378 			D(	dump_bplustree_header(fHeader);
379 				dump_inode(stream->Node());
380 			);
381 			RETURN_ERROR(fStatus = B_BAD_TYPE);
382 		}
383 
384 		 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
385 		 // in the original BFS code - we will honour it nevertheless
386 		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
387 							&& stream->BlockRun() != stream->Parent())
388 							|| (stream->Mode() & S_ALLOW_DUPS) != 0;
389 	}
390 
391 	CachedNode cached(this, fHeader->RootNode());
392 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
393 }
394 
395 
396 status_t
397 BPlusTree::InitCheck()
398 {
399 	return fStatus;
400 }
401 
402 
403 int32
404 BPlusTree::TypeCodeToKeyType(type_code code)
405 {
406 	switch (code) {
407 		case B_STRING_TYPE:
408 			return BPLUSTREE_STRING_TYPE;
409 		case B_SSIZE_T_TYPE:
410 		case B_INT32_TYPE:
411 			return BPLUSTREE_INT32_TYPE;
412 		case B_SIZE_T_TYPE:
413 		case B_UINT32_TYPE:
414 			return BPLUSTREE_UINT32_TYPE;
415 		case B_OFF_T_TYPE:
416 		case B_INT64_TYPE:
417 			return BPLUSTREE_INT64_TYPE;
418 		case B_UINT64_TYPE:
419 			return BPLUSTREE_UINT64_TYPE;
420 		case B_FLOAT_TYPE:
421 			return BPLUSTREE_FLOAT_TYPE;
422 		case B_DOUBLE_TYPE:
423 			return BPLUSTREE_DOUBLE_TYPE;
424 	}
425 	return -1;
426 }
427 
428 
429 int32
430 BPlusTree::ModeToKeyType(mode_t mode)
431 {
432 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
433 				   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
434 		case S_INT_INDEX:
435 			return BPLUSTREE_INT32_TYPE;
436 		case S_UINT_INDEX:
437 			return BPLUSTREE_UINT32_TYPE;
438 		case S_LONG_LONG_INDEX:
439 			return BPLUSTREE_INT64_TYPE;
440 		case S_ULONG_LONG_INDEX:
441 			return BPLUSTREE_UINT64_TYPE;
442 		case S_FLOAT_INDEX:
443 			return BPLUSTREE_FLOAT_TYPE;
444 		case S_DOUBLE_INDEX:
445 			return BPLUSTREE_DOUBLE_TYPE;
446 		case S_STR_INDEX:
447 		default:
448 			// default is for standard directories
449 			return BPLUSTREE_STRING_TYPE;
450 	}
451 }
452 
453 
454 //	#pragma mark -
455 
456 
457 void
458 BPlusTree::UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt,
459 	int8 change)
460 {
461 	// Although every iterator which is affected by this update currently
462 	// waits on a semaphore, other iterators could be added/removed at
463 	// any time, so we need to protect this loop
464 	if (fIteratorLock.Lock() < B_OK)
465 		return;
466 
467 	TreeIterator *iterator = NULL;
468 	while ((iterator = fIterators.Next(iterator)) != NULL)
469 		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);
470 
471 	fIteratorLock.Unlock();
472 }
473 
474 
475 void
476 BPlusTree::AddIterator(TreeIterator *iterator)
477 {
478 	if (fIteratorLock.Lock() < B_OK)
479 		return;
480 
481 	fIterators.Add(iterator);
482 
483 	fIteratorLock.Unlock();
484 }
485 
486 
487 void
488 BPlusTree::RemoveIterator(TreeIterator *iterator)
489 {
490 	if (fIteratorLock.Lock() < B_OK)
491 		return;
492 
493 	fIterators.Remove(iterator);
494 
495 	fIteratorLock.Unlock();
496 }
497 
498 
499 int32
500 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
501 {
502 	type_code type = 0;
503 	switch (fHeader->data_type) {
504 	    case BPLUSTREE_STRING_TYPE:
505 	    	type = B_STRING_TYPE;
506 	    	break;
507 		case BPLUSTREE_INT32_TYPE:
508 	    	type = B_INT32_TYPE;
509 	    	break;
510 		case BPLUSTREE_UINT32_TYPE:
511 	    	type = B_UINT32_TYPE;
512 	    	break;
513 		case BPLUSTREE_INT64_TYPE:
514 	    	type = B_INT64_TYPE;
515 	    	break;
516 		case BPLUSTREE_UINT64_TYPE:
517 	    	type = B_UINT64_TYPE;
518 	    	break;
519 		case BPLUSTREE_FLOAT_TYPE:
520 	    	type = B_FLOAT_TYPE;
521 	    	break;
522 		case BPLUSTREE_DOUBLE_TYPE:
523 	    	type = B_DOUBLE_TYPE;
524 	    	break;
525 	}
526    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
527 }
528 
529 
530 status_t
531 BPlusTree::FindKey(bplustree_node *node, const uint8 *key, uint16 keyLength, uint16 *index,
532 	off_t *next)
533 {
534 	if (node->all_key_count == 0) {
535 		if (index)
536 			*index = 0;
537 		if (next)
538 			*next = node->OverflowLink();
539 		return B_ENTRY_NOT_FOUND;
540 	}
541 
542 	off_t *values = node->Values();
543 	int16 saveIndex = -1;
544 
545 	// binary search in the key array
546 	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
547 		uint16 i = (first + last) >> 1;
548 
549 		uint16 searchLength;
550 		uint8 *searchKey = node->KeyAt(i, &searchLength);
551 		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
552 			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
553 			fStream->GetVolume()->Panic();
554 			RETURN_ERROR(B_BAD_DATA);
555 		}
556 
557 		int32 cmp = CompareKeys(key, keyLength, searchKey, searchLength);
558 		if (cmp < 0) {
559 			last = i - 1;
560 			saveIndex = i;
561 		} else if (cmp > 0) {
562 			saveIndex = first = i + 1;
563 		} else {
564 			if (index)
565 				*index = i;
566 			if (next)
567 				*next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
568 			return B_OK;
569 		}
570 	}
571 
572 	if (index)
573 		*index = saveIndex;
574 	if (next) {
575 		if (saveIndex == node->NumKeys())
576 			*next = node->OverflowLink();
577 		else
578 			*next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
579 	}
580 	return B_ENTRY_NOT_FOUND;
581 }
582 
583 
584 /**	Prepares the stack to contain all nodes that were passed while
585  *	following the key, from the root node to the leaf node that could
586  *	or should contain that key.
587  */
588 
589 status_t
590 BPlusTree::SeekDown(Stack<node_and_key> &stack, const uint8 *key, uint16 keyLength)
591 {
592 	// set the root node to begin with
593 	node_and_key nodeAndKey;
594 	nodeAndKey.nodeOffset = fHeader->RootNode();
595 
596 	CachedNode cached(this);
597 	bplustree_node *node;
598 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
599 		// if we are already on leaf level, we're done
600 		if (node->OverflowLink() == BPLUSTREE_NULL) {
601 			// node that the keyIndex is not properly set here (but it's not
602 			// needed in the calling functions anyway)!
603 			nodeAndKey.keyIndex = 0;
604 			stack.Push(nodeAndKey);
605 			return B_OK;
606 		}
607 
608 		off_t nextOffset;
609 		status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex, &nextOffset);
610 
611 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
612 			RETURN_ERROR(B_ERROR);
613 
614 		// put the node offset & the correct keyIndex on the stack
615 		stack.Push(nodeAndKey);
616 
617 		nodeAndKey.nodeOffset = nextOffset;
618 	}
619 	RETURN_ERROR(B_ERROR);
620 }
621 
622 
623 status_t
624 BPlusTree::FindFreeDuplicateFragment(bplustree_node *node, CachedNode *cached, off_t *_offset,
625 	bplustree_node **_fragment, uint32 *_index)
626 {
627 	off_t *values = node->Values();
628 	for (int32 i = 0; i < node->NumKeys(); i++) {
629 		// does the value link to a duplicate fragment?
630 		if (bplustree_node::LinkType(values[i]) != BPLUSTREE_DUPLICATE_FRAGMENT)
631 			continue;
632 
633 		bplustree_node *fragment = cached->SetTo(bplustree_node::FragmentOffset(values[i]), false);
634 		if (fragment == NULL) {
635 			FATAL(("Could not get duplicate fragment at %Ld\n", values[i]));
636 			continue;
637 		}
638 
639 		// see if there is some space left for us
640 		int32 num = (fNodeSize >> 3) / (NUM_FRAGMENT_VALUES + 1);
641 		for (int32 j = 0;j < num;j++) {
642 			duplicate_array *array = fragment->FragmentAt(j);
643 
644 			if (array->count == 0) {
645 				*_offset = bplustree_node::FragmentOffset(values[i]);
646 				*_fragment = fragment;
647 				*_index = j;
648 				return B_OK;
649 			}
650 		}
651 	}
652 	return B_ENTRY_NOT_FOUND;
653 }
654 
655 
656 status_t
657 BPlusTree::InsertDuplicate(Transaction *transaction, CachedNode *cached, bplustree_node *node,
658 	uint16 index, off_t value)
659 {
660 	CachedNode cachedDuplicate(this);
661 	off_t *values = node->Values();
662 	off_t oldValue = values[index];
663 	status_t status;
664 	off_t offset;
665 
666 	if (bplustree_node::IsDuplicate(oldValue)) {
667 		//
668 		// If it's a duplicate fragment, try to insert it into that, or if it
669 		// doesn't fit anymore, create a new duplicate node
670 		//
671 		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
672 			bplustree_node *duplicate = cachedDuplicate.SetTo(bplustree_node::FragmentOffset(oldValue), false);
673 			if (duplicate == NULL)
674 				return B_IO_ERROR;
675 
676 			duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));
677 			if (array->count > NUM_FRAGMENT_VALUES
678 				|| array->count < 1) {
679 				FATAL(("insertDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",
680 					bplustree_node::FragmentIndex(oldValue),
681 					bplustree_node::FragmentOffset(oldValue),
682 					array->count));
683 				return B_BAD_DATA;
684 			}
685 
686 			if (array->count < NUM_FRAGMENT_VALUES) {
687 				array->Insert(value);
688 			} else {
689 				// test if the fragment will be empty if we remove this key's values
690 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
691 					// the node will be empty without our values, so let us
692 					// reuse it as a duplicate node
693 					offset = bplustree_node::FragmentOffset(oldValue);
694 
695 					memmove(duplicate->DuplicateArray(), array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
696 					duplicate->left_link = duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
697 
698 					array = duplicate->DuplicateArray();
699 					array->Insert(value);
700 				} else {
701 					// create a new duplicate node
702 					CachedNode cachedNewDuplicate(this);
703 					bplustree_node *newDuplicate;
704 					status = cachedNewDuplicate.Allocate(transaction, &newDuplicate, &offset);
705 					if (status < B_OK)
706 						return status;
707 
708 					// copy the array from the fragment node to the duplicate node
709 					// and free the old entry (by zero'ing all values)
710 					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(array->count);
711 					memcpy(&newDuplicate->all_key_count, &array->values[0],
712 						array->count * sizeof(off_t));
713 					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
714 
715 					array = newDuplicate->DuplicateArray();
716 					array->Insert(value);
717 
718 					// if this fails, the old fragments node will contain wrong
719 					// data... (but since it couldn't be written, it shouldn't
720 					// be fatal)
721 					if ((status = cachedNewDuplicate.WriteBack(transaction)) < B_OK)
722 						return status;
723 				}
724 
725 				// update the main pointer to link to a duplicate node
726 				values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE, offset);
727 				if ((status = cached->WriteBack(transaction)) < B_OK)
728 					return status;
729 			}
730 
731 			return cachedDuplicate.WriteBack(transaction);
732 		}
733 
734 		//
735 		// Put the value into a dedicated duplicate node
736 		//
737 
738 		// search for free space in the duplicate nodes of that key
739 		duplicate_array *array;
740 		bplustree_node *duplicate;
741 		off_t duplicateOffset;
742 		do {
743 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
744 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
745 			if (duplicate == NULL)
746 				return B_IO_ERROR;
747 
748 			array = duplicate->DuplicateArray();
749 			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
750 				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
751 					duplicateOffset, array->count));
752 				return B_BAD_DATA;
753 			}
754 		} while (array->count >= NUM_DUPLICATE_VALUES
755 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
756 
757 		if (array->count < NUM_DUPLICATE_VALUES) {
758 			array->Insert(value);
759 		} else {
760 			// no space left - add a new duplicate node
761 
762 			CachedNode cachedNewDuplicate(this);
763 			bplustree_node *newDuplicate;
764 			status = cachedNewDuplicate.Allocate(transaction, &newDuplicate, &offset);
765 			if (status < B_OK)
766 				return status;
767 
768 			// link the two nodes together
769 			duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
770 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
771 
772 			array = newDuplicate->DuplicateArray();
773 			array->count = 0;
774 			array->Insert(value);
775 
776 			status = cachedNewDuplicate.WriteBack(transaction);
777 			if (status < B_OK)
778 				return status;
779 		}
780 		return cachedDuplicate.WriteBack(transaction);
781 	}
782 
783 	//
784 	// Search for a free duplicate fragment or create a new one
785 	// to insert the duplicate value into
786 	//
787 
788 	uint32 fragmentIndex = 0;
789 	bplustree_node *fragment;
790 	if (FindFreeDuplicateFragment(node, &cachedDuplicate, &offset, &fragment, &fragmentIndex) < B_OK) {
791 		// allocate a new duplicate fragment node
792 		if ((status = cachedDuplicate.Allocate(transaction, &fragment, &offset)) < B_OK)
793 			return status;
794 
795 		memset(fragment, 0, fNodeSize);
796 	}
797 	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
798 	array->Insert(oldValue);
799 	array->Insert(value);
800 
801 	if ((status = cachedDuplicate.WriteBack(transaction)) < B_OK)
802 		return status;
803 
804 	values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex);
805 
806 	return cached->WriteBack(transaction);
807 }
808 
809 
810 void
811 BPlusTree::InsertKey(bplustree_node *node, uint16 index, uint8 *key, uint16 keyLength,
812 	off_t value)
813 {
814 	// should never happen, but who knows?
815 	if (index > node->NumKeys())
816 		return;
817 
818 	off_t *values = node->Values();
819 	uint16 *keyLengths = node->KeyLengths();
820 	uint8 *keys = node->Keys();
821 
822 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
823 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() + keyLength);
824 
825 	off_t *newValues = node->Values();
826 	uint16 *newKeyLengths = node->KeyLengths();
827 
828 	// move values and copy new value into them
829 	memmove(newValues + index + 1, values + index, sizeof(off_t) * (node->NumKeys() - 1 - index));
830 	memmove(newValues, values, sizeof(off_t) * index);
831 
832 	newValues[index] = value;
833 
834 	// move and update key length index
835 	for (uint16 i = node->NumKeys(); i-- > index + 1;)
836 		newKeyLengths[i] = keyLengths[i - 1] + keyLength;
837 	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);
838 
839 	int32 keyStart;
840 	newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0);
841 
842 	// move keys and copy new key into them
843 	int32 size = node->AllKeyLength() - newKeyLengths[index];
844 	if (size > 0)
845 		memmove(keys + newKeyLengths[index], keys + newKeyLengths[index] - keyLength, size);
846 
847 	memcpy(keys + keyStart, key, keyLength);
848 }
849 
850 
851 status_t
852 BPlusTree::SplitNode(bplustree_node *node, off_t nodeOffset, bplustree_node *other,
853 	off_t otherOffset, uint16 *_keyIndex, uint8 *key, uint16 *_keyLength, off_t *_value)
854 {
855 	if (*_keyIndex > node->NumKeys() + 1)
856 		return B_BAD_VALUE;
857 
858 	uint16 *inKeyLengths = node->KeyLengths();
859 	off_t *inKeyValues = node->Values();
860 	uint8 *inKeys = node->Keys();
861 	uint8 *outKeys = other->Keys();
862 	int32 keyIndex = *_keyIndex;	// can become less than zero!
863 
864 	// how many keys will fit in one (half) page?
865 	// that loop will find the answer to this question and
866 	// change the key lengths indices for their new home
867 
868 	// "bytes" is the number of bytes written for the new key,
869 	// "bytesBefore" are the bytes before that key
870 	// "bytesAfter" are the bytes after the new key, if any
871 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
872 
873 	size_t size = fNodeSize >> 1;
874 	int32 out, in;
875 	for (in = out = 0; in < node->NumKeys() + 1;) {
876 		if (!bytes)
877 			bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0;
878 
879 		if (in == keyIndex && !bytes) {
880 			bytes = *_keyLength;
881 		} else {
882 			if (keyIndex < out)
883 				bytesAfter = inKeyLengths[in] - bytesBefore;
884 
885 			in++;
886 		}
887 		out++;
888 
889 		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
890 						out * (sizeof(uint16) + sizeof(off_t)) >= size) {
891 			// we have found the number of keys in the new node!
892 			break;
893 		}
894 	}
895 
896 	// if the new key was not inserted, set the length of the keys
897 	// that can be copied directly
898 	if (keyIndex >= out && in > 0)
899 		bytesBefore = inKeyLengths[in - 1];
900 
901 	if (bytesBefore < 0 || bytesAfter < 0)
902 		return B_BAD_DATA;
903 
904 	other->left_link = node->left_link;
905 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
906 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter);
907 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
908 
909 	uint16 *outKeyLengths = other->KeyLengths();
910 	off_t *outKeyValues = other->Values();
911 	int32 keys = out > keyIndex ? keyIndex : out;
912 
913 	if (bytesBefore) {
914 		// copy the keys
915 		memcpy(outKeys, inKeys, bytesBefore);
916 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
917 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
918 	}
919 	if (bytes) {
920 		// copy the newly inserted key
921 		memcpy(outKeys + bytesBefore, key, bytes);
922 		outKeyLengths[keyIndex] = bytes + bytesBefore;
923 		outKeyValues[keyIndex] = *_value;
924 
925 		if (bytesAfter) {
926 			// copy the keys after the new key
927 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, bytesAfter);
928 			keys = out - keyIndex - 1;
929 			for (int32 i = 0;i < keys;i++)
930 				outKeyLengths[keyIndex + i + 1] = inKeyLengths[keyIndex + i] + bytes;
931 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, keys * sizeof(off_t));
932 		}
933 	}
934 
935 	// if the new key was already inserted, we shouldn't use it again
936 	if (in != out)
937 		keyIndex--;
938 
939 	int32 total = bytesBefore + bytesAfter;
940 
941 	// these variables are for the key that will be returned
942 	// to the parent node
943 	uint8 *newKey = NULL;
944 	uint16 newLength;
945 	bool newAllocated = false;
946 
947 	// If we have split an index node, we have to drop the first key
948 	// of the next node (which can also be the new key to insert).
949 	// The dropped key is also the one which has to be inserted in
950 	// the parent node, so we will set the "newKey" already here.
951 	if (node->OverflowLink() != BPLUSTREE_NULL) {
952 		if (in == keyIndex) {
953 			newKey = key;
954 			newLength = *_keyLength;
955 
956 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
957 			keyIndex--;
958 		} else {
959 			// If a key is dropped (is not the new key), we have to copy
960 			// it, because it would be lost if not.
961 			uint8 *droppedKey = node->KeyAt(in, &newLength);
962 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
963 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
964 				fStream->GetVolume()->Panic();
965 				RETURN_ERROR(B_BAD_DATA);
966 			}
967 			newKey = (uint8 *)malloc(newLength);
968 			if (newKey == NULL)
969 				return B_NO_MEMORY;
970 			memcpy(newKey, droppedKey, newLength);
971 
972 			other->overflow_link = inKeyValues[in];
973 			total = inKeyLengths[in++];
974 		}
975 	}
976 
977 	// and now the same game for the other page and the rest of the keys
978 	// (but with memmove() instead of memcpy(), because they may overlap)
979 
980 	bytesBefore = bytesAfter = bytes = 0;
981 	out = 0;
982 	int32 skip = in;
983 	while (in < node->NumKeys() + 1) {
984 		if (in == keyIndex && !bytes) {
985 			// it's enough to set bytesBefore once here, because we do
986 			// not need to know the exact length of all keys in this
987 			// loop
988 			bytesBefore = in > skip ? inKeyLengths[in - 1] : 0;
989 			bytes = *_keyLength;
990 		} else {
991 			if (in < node->NumKeys()) {
992 				inKeyLengths[in] -= total;
993 				if (bytes) {
994 					inKeyLengths[in] += bytes;
995 					bytesAfter = inKeyLengths[in] - bytesBefore - bytes;
996 				}
997 			}
998 			in++;
999 		}
1000 
1001 		out++;
1002 
1003 		// break out when all keys are done
1004 		if (in > node->NumKeys() && keyIndex < in)
1005 			break;
1006 	}
1007 
1008 	// adjust the byte counts (since we were a bit lazy in the loop)
1009 	if (keyIndex >= in && keyIndex - skip < out)
1010 		bytesAfter = inKeyLengths[in] - bytesBefore - total;
1011 	else if (keyIndex < skip)
1012 		bytesBefore = node->AllKeyLength() - total;
1013 
1014 	if (bytesBefore < 0 || bytesAfter < 0)
1015 		return B_BAD_DATA;
1016 
1017 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1018 		// right link, and overflow link can stay the same
1019 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter);
1020 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out - 1);
1021 
1022 	// array positions have changed
1023 	outKeyLengths = node->KeyLengths();
1024 	outKeyValues = node->Values();
1025 
1026 	// move the keys in the old node: the order is important here,
1027 	// because we don't want to overwrite any contents
1028 
1029 	keys = keyIndex <= skip ? out : keyIndex - skip;
1030 	keyIndex -= skip;
1031 
1032 	if (bytesBefore)
1033 		memmove(inKeys, inKeys + total, bytesBefore);
1034 	if (bytesAfter)
1035 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, bytesAfter);
1036 
1037 	if (bytesBefore)
1038 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1039 	in = out - keyIndex - 1;
1040 	if (bytesAfter)
1041 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, in * sizeof(uint16));
1042 
1043 	if (bytesBefore)
1044 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1045 	if (bytesAfter)
1046 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, in * sizeof(off_t));
1047 
1048 	if (bytes) {
1049 		// finally, copy the newly inserted key (don't overwrite anything)
1050 		memcpy(inKeys + bytesBefore, key, bytes);
1051 		outKeyLengths[keyIndex] = bytes + bytesBefore;
1052 		outKeyValues[keyIndex] = *_value;
1053 	}
1054 
1055 	// Prepare the key that will be inserted in the parent node which
1056 	// is either the dropped key or the last of the other node.
1057 	// If it's the dropped key, "newKey" was already set earlier.
1058 
1059 	if (newKey == NULL)
1060 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1061 
1062 	memcpy(key, newKey, newLength);
1063 	*_keyLength = newLength;
1064 	*_value = otherOffset;
1065 
1066 	if (newAllocated)
1067 		free(newKey);
1068 
1069 	return B_OK;
1070 }
1071 
1072 
1073 status_t
1074 BPlusTree::Insert(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value)
1075 {
1076 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1077 		RETURN_ERROR(B_BAD_VALUE);
1078 
1079 	// lock access to stream
1080 	WriteLocked locked(fStream->Lock());
1081 
1082 	Stack<node_and_key> stack;
1083 	if (SeekDown(stack, key, keyLength) != B_OK)
1084 		RETURN_ERROR(B_ERROR);
1085 
1086 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
1087 
1088 	memcpy(keyBuffer, key, keyLength);
1089 	keyBuffer[keyLength] = 0;
1090 
1091 	node_and_key nodeAndKey;
1092 	bplustree_node *node;
1093 
1094 	CachedNode cached(this);
1095 	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1096 #ifdef DEBUG
1097 		NodeChecker checker(node, fNodeSize);
1098 #endif
1099 		if (node->IsLeaf()) {
1100 			// first round, check for duplicate entries
1101 			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);
1102 
1103 			// is this a duplicate entry?
1104 			if (status == B_OK) {
1105 				if (fAllowDuplicates)
1106 					return InsertDuplicate(transaction, &cached, node, nodeAndKey.keyIndex, value);
1107 
1108 				RETURN_ERROR(B_NAME_IN_USE);
1109 			}
1110 		}
1111 
1112 		// is the node big enough to hold the pair?
1113 		if (int32(round_up(sizeof(bplustree_node) + node->AllKeyLength() + keyLength)
1114 			+ (node->NumKeys() + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
1115 		{
1116 			InsertKey(node, nodeAndKey.keyIndex, keyBuffer, keyLength, value);
1117 			UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, 1);
1118 
1119 			return cached.WriteBack(transaction);
1120 		} else {
1121 			CachedNode cachedNewRoot(this);
1122 			CachedNode cachedOther(this);
1123 
1124 			// do we need to allocate a new root node? if so, then do
1125 			// it now
1126 			off_t newRoot = BPLUSTREE_NULL;
1127 			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
1128 				bplustree_node *root;
1129 				status_t status = cachedNewRoot.Allocate(transaction, &root, &newRoot);
1130 				if (status < B_OK) {
1131 					// The tree is most likely corrupted!
1132 					// But it's still sane at leaf level - we could set
1133 					// a flag in the header that forces the tree to be
1134 					// rebuild next time...
1135 					// But since we will have journaling, that's not a big
1136 					// problem anyway.
1137 					RETURN_ERROR(status);
1138 				}
1139 			}
1140 
1141 			// reserve space for the other node
1142 			bplustree_node *other;
1143 			off_t otherOffset;
1144 			status_t status = cachedOther.Allocate(transaction, &other, &otherOffset);
1145 			if (status < B_OK) {
1146 				cachedNewRoot.Free(transaction, newRoot);
1147 				RETURN_ERROR(status);
1148 			}
1149 
1150 			if (SplitNode(node, nodeAndKey.nodeOffset, other, otherOffset,
1151 					&nodeAndKey.keyIndex, keyBuffer, &keyLength, &value) < B_OK) {
1152 				// free root node & other node here
1153 				cachedNewRoot.Free(transaction, newRoot);
1154 				cachedOther.Free(transaction, otherOffset);
1155 
1156 				RETURN_ERROR(B_ERROR);
1157 			}
1158 
1159 			// write the updated nodes back
1160 
1161 			if (cached.WriteBack(transaction) < B_OK
1162 				|| cachedOther.WriteBack(transaction) < B_OK)
1163 				RETURN_ERROR(B_ERROR);
1164 
1165 			UpdateIterators(nodeAndKey.nodeOffset, otherOffset, nodeAndKey.keyIndex,
1166 				node->NumKeys(), 1);
1167 
1168 			// update the right link of the node in the left of the new node
1169 			if ((other = cachedOther.SetTo(other->LeftLink())) != NULL) {
1170 				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1171 				if (cachedOther.WriteBack(transaction) < B_OK)
1172 					RETURN_ERROR(B_ERROR);
1173 			}
1174 
1175 			// create a new root if necessary
1176 			if (newRoot != BPLUSTREE_NULL) {
1177 				bplustree_node *root = cachedNewRoot.Node();
1178 
1179 				InsertKey(root, 0, keyBuffer, keyLength, node->LeftLink());
1180 				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(nodeAndKey.nodeOffset);
1181 
1182 				if (cachedNewRoot.WriteBack(transaction) < B_OK)
1183 					RETURN_ERROR(B_ERROR);
1184 
1185 				// finally, update header to point to the new root
1186 				fHeader->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1187 				fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(fHeader->MaxNumberOfLevels() + 1);
1188 
1189 				return fCachedHeader.WriteBack(transaction);
1190 			}
1191 		}
1192 	}
1193 	RETURN_ERROR(B_ERROR);
1194 }
1195 
1196 
1197 status_t
1198 BPlusTree::RemoveDuplicate(Transaction *transaction, bplustree_node *node, CachedNode *cached,
1199 	uint16 index, off_t value)
1200 {
1201 	CachedNode cachedDuplicate(this);
1202 	off_t *values = node->Values();
1203 	off_t oldValue = values[index];
1204 	status_t status;
1205 
1206 	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1207 	bplustree_node *duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1208 	if (duplicate == NULL)
1209 		return B_IO_ERROR;
1210 
1211 	// if it's a duplicate fragment, remove the entry from there
1212 	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1213 		duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));
1214 
1215 		if (array->count > NUM_FRAGMENT_VALUES
1216 			|| array->count < 1) {
1217 			FATAL(("removeDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",
1218 				bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count));
1219 			return B_BAD_DATA;
1220 		}
1221 		if (!array->Remove(value))
1222 			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
1223 				value, duplicateOffset));
1224 
1225 		// remove the array from the fragment node if it is empty
1226 		if (array->count == 1) {
1227 			// set the link to the remaining value
1228 			values[index] = array->values[0];
1229 
1230 			// Remove the whole fragment node, if this was the only array,
1231 			// otherwise free the array and write the changes back
1232 			if (duplicate->FragmentsUsed(fNodeSize) == 1)
1233 				status = cachedDuplicate.Free(transaction, duplicateOffset);
1234 			else {
1235 				array->count = 0;
1236 				status = cachedDuplicate.WriteBack(transaction);
1237 			}
1238 			if (status < B_OK)
1239 				return status;
1240 
1241 			return cached->WriteBack(transaction);
1242 		}
1243 		return cachedDuplicate.WriteBack(transaction);
1244 	}
1245 
1246 	//
1247 	// Remove value from a duplicate node!
1248 	//
1249 
1250 	duplicate_array *array;
1251 
1252 	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1253 		FATAL(("invalid duplicate node: first left link points to %Ld!\n", duplicate->LeftLink()));
1254 		return B_BAD_DATA;
1255 	}
1256 
1257 	// Search the duplicate nodes until the entry could be found (and removed)
1258 	while (duplicate != NULL) {
1259 		array = duplicate->DuplicateArray();
1260 		if (array->count > NUM_DUPLICATE_VALUES
1261 			|| array->count < 0) {
1262 			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
1263 				duplicateOffset, array->count));
1264 			return B_BAD_DATA;
1265 		}
1266 
1267 		if (array->Remove(value))
1268 			break;
1269 
1270 		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1271 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1272 
1273 		duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1274 	}
1275 	if (duplicate == NULL)
1276 		RETURN_ERROR(B_IO_ERROR);
1277 
1278 	while (true) {
1279 		off_t left = duplicate->LeftLink();
1280 		off_t right = duplicate->RightLink();
1281 		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1282 
1283 		if (isLast && array->count == 1 || array->count == 0) {
1284 			// Free empty duplicate page, link their siblings together, and
1285 			// update the duplicate link if needed (which should not be, if
1286 			// we are the only one working on that tree...)
1287 
1288 			if (duplicateOffset == bplustree_node::FragmentOffset(oldValue)
1289 				|| array->count == 1) {
1290 				if (array->count == 1 && isLast)
1291 					values[index] = array->values[0];
1292 				else if (isLast) {
1293 					FATAL(("removed last value from duplicate!\n"));
1294 				} else
1295 					values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE, right);
1296 
1297 				if ((status = cached->WriteBack(transaction)) < B_OK)
1298 					return status;
1299 			}
1300 
1301 			if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK)
1302 				return status;
1303 
1304 			if (left != BPLUSTREE_NULL
1305 				&& (duplicate = cachedDuplicate.SetTo(left, false)) != NULL) {
1306 				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1307 
1308 				// If the next node is the last node, we need to free that node
1309 				// and convert the duplicate entry back into a normal entry
1310 				if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL
1311 					&& duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) {
1312 					duplicateOffset = left;
1313 					continue;
1314 				}
1315 
1316 				status = cachedDuplicate.WriteBack(transaction);
1317 				if (status < B_OK)
1318 					return status;
1319 			}
1320 			if (right != BPLUSTREE_NULL
1321 				&& (duplicate = cachedDuplicate.SetTo(right, false)) != NULL) {
1322 				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1323 
1324 				// Again, we may need to turn the duplicate entry back into a normal entry
1325 				array = duplicate->DuplicateArray();
1326 				if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL
1327 					&& duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) {
1328 					duplicateOffset = right;
1329 					continue;
1330 				}
1331 
1332 				return cachedDuplicate.WriteBack(transaction);
1333 			}
1334 			return status;
1335 		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
1336 			// If the number of entries fits in a duplicate fragment, then
1337 			// either find a free fragment node, or convert this node to a
1338 			// fragment node.
1339 			CachedNode cachedOther(this);
1340 
1341 			bplustree_node *fragment = NULL;
1342 			uint32 fragmentIndex = 0;
1343 			off_t offset;
1344 			if (FindFreeDuplicateFragment(node, &cachedOther, &offset,
1345 					&fragment, &fragmentIndex) < B_OK) {
1346 				// convert node
1347 				memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1348 				memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
1349 					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1350 			} else {
1351 				// move to other node
1352 				duplicate_array *target = fragment->FragmentAt(fragmentIndex);
1353 				memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1354 
1355 				cachedDuplicate.Free(transaction, duplicateOffset);
1356 				duplicateOffset = offset;
1357 			}
1358 			values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT,
1359 								duplicateOffset, fragmentIndex);
1360 
1361 			if ((status = cached->WriteBack(transaction)) < B_OK)
1362 				return status;
1363 
1364 			if (fragment != NULL)
1365 				return cachedOther.WriteBack(transaction);
1366 		}
1367 		return cachedDuplicate.WriteBack(transaction);
1368 	}
1369 }
1370 
1371 
1372 /** Removes the key with the given index from the specified node.
1373  *	Since it has to get the key from the node anyway (to obtain it's
1374  *	pointer), it's not needed to pass the key & its length, although
1375  *	the calling method (BPlusTree::Remove()) have this data.
1376  */
1377 
1378 void
1379 BPlusTree::RemoveKey(bplustree_node *node, uint16 index)
1380 {
1381 	// should never happen, but who knows?
1382 	if (index > node->NumKeys() && node->NumKeys() > 0) {
1383 		FATAL(("Asked me to remove key outer limits: %u\n", index));
1384 		return;
1385 	}
1386 
1387 	off_t *values = node->Values();
1388 
1389 	// if we would have to drop the overflow link, drop
1390 	// the last key instead and update the overflow link
1391 	// to the value of that one
1392 	if (!node->IsLeaf() && index == node->NumKeys())
1393 		node->overflow_link = values[--index];
1394 
1395 	uint16 length;
1396 	uint8 *key = node->KeyAt(index, &length);
1397 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
1398 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1399 		FATAL(("Key length to long: %s, %u (inode at %ld,%u)\n", key, length,
1400 			fStream->BlockRun().allocation_group, fStream->BlockRun().start));
1401 		fStream->GetVolume()->Panic();
1402 		return;
1403 	}
1404 
1405 	uint16 *keyLengths = node->KeyLengths();
1406 	uint8 *keys = node->Keys();
1407 
1408 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
1409 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length);
1410 
1411 	off_t *newValues = node->Values();
1412 	uint16 *newKeyLengths = node->KeyLengths();
1413 
1414 	// move key data
1415 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
1416 
1417 	// move and update key lengths
1418 	if (index > 0 && newKeyLengths != keyLengths)
1419 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
1420 	for (uint16 i = index; i < node->NumKeys(); i++)
1421 		newKeyLengths[i] = keyLengths[i + 1] - length;
1422 
1423 	// move values
1424 	if (index > 0)
1425 		memmove(newValues, values, index * sizeof(off_t));
1426 	if (node->NumKeys() > index)
1427 		memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t));
1428 }
1429 
1430 
1431 /**	Removes the specified key from the tree. The "value" parameter is only used
1432  *	for trees which allow duplicates, so you may safely ignore it.
1433  *	It's not an optional parameter, so at least you have to think about it.
1434  */
1435 
1436 status_t
1437 BPlusTree::Remove(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value)
1438 {
1439 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1440 		RETURN_ERROR(B_BAD_VALUE);
1441 
1442 	// lock access to stream
1443 	WriteLocked locked(fStream->Lock());
1444 
1445 	Stack<node_and_key> stack;
1446 	if (SeekDown(stack, key, keyLength) != B_OK)
1447 		RETURN_ERROR(B_ERROR);
1448 
1449 	node_and_key nodeAndKey;
1450 	bplustree_node *node;
1451 
1452 	CachedNode cached(this);
1453 	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1454 #ifdef DEBUG
1455 		NodeChecker checker(node, fNodeSize);
1456 #endif
1457 		if (node->IsLeaf()) {
1458 			// first round, check for duplicate entries
1459 			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);
1460 			if (status < B_OK)
1461 				RETURN_ERROR(status);
1462 
1463 			// If we will remove the last key, the iterator will be set
1464 			// to the next node after the current - if there aren't any
1465 			// more nodes, we need a way to prevent the TreeIterators to
1466 			// touch the old node again, we use BPLUSTREE_FREE for this
1467 			off_t next = node->RightLink() == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->RightLink();
1468 			UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 ?
1469 								next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
1470 
1471 			// is this a duplicate entry?
1472 			if (bplustree_node::IsDuplicate(node->Values()[nodeAndKey.keyIndex])) {
1473 				if (fAllowDuplicates)
1474 					return RemoveDuplicate(transaction, node, &cached, nodeAndKey.keyIndex, value);
1475 				else
1476 					RETURN_ERROR(B_NAME_IN_USE);
1477 			}
1478 		}
1479 
1480 		// if it's an empty root node, we have to convert it
1481 		// to a leaf node by dropping the overflow link, or,
1482 		// if it's a leaf node, just empty it
1483 		if (nodeAndKey.nodeOffset == fHeader->RootNode()
1484 			&& node->NumKeys() == 0
1485 			|| node->NumKeys() == 1 && node->IsLeaf()) {
1486 			node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1487 			node->all_key_count = 0;
1488 			node->all_key_length = 0;
1489 
1490 			if (cached.WriteBack(transaction) < B_OK)
1491 				return B_IO_ERROR;
1492 
1493 			// if we've cleared the root node, reset the maximum
1494 			// number of levels in the header
1495 			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
1496 				fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
1497 				return fCachedHeader.WriteBack(transaction);
1498 			}
1499 			return B_OK;
1500 		}
1501 
1502 		// if there is only one key left, we don't have to remove
1503 		// it, we can just dump the node (index nodes still have
1504 		// the overflow link, so we have to drop the last key)
1505 		if (node->NumKeys() > 1
1506 			|| !node->IsLeaf() && node->NumKeys() == 1) {
1507 			RemoveKey(node, nodeAndKey.keyIndex);
1508 			return cached.WriteBack(transaction);
1509 		}
1510 
1511 		// when we are here, we can just free the node, but
1512 		// we have to update the right/left link of the
1513 		// siblings first
1514 		CachedNode otherCached(this);
1515 		bplustree_node *other = otherCached.SetTo(node->LeftLink());
1516 		if (other != NULL) {
1517 			other->right_link = node->right_link;
1518 			if (otherCached.WriteBack(transaction) < B_OK)
1519 				return B_IO_ERROR;
1520 		}
1521 
1522 		if ((other = otherCached.SetTo(node->RightLink())) != NULL) {
1523 			other->left_link = node->left_link;
1524 			if (otherCached.WriteBack(transaction) < B_OK)
1525 				return B_IO_ERROR;
1526 		}
1527 
1528 		cached.Free(transaction, nodeAndKey.nodeOffset);
1529 	}
1530 	RETURN_ERROR(B_ERROR);
1531 }
1532 
1533 
1534 /**	Replaces the value for the key in the tree.
1535  *	Returns B_OK if the key could be found and its value replaced,
1536  *	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
1537  *	to indicate that something went terribly wrong.
1538  *	Note that this doesn't work with duplicates - it will just
1539  *	return B_BAD_TYPE if you call this function on a tree where
1540  *	duplicates are allowed.
1541  */
1542 
1543 status_t
1544 BPlusTree::Replace(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value)
1545 {
1546 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
1547 		|| key == NULL)
1548 		RETURN_ERROR(B_BAD_VALUE);
1549 
1550 	if (fAllowDuplicates)
1551 		RETURN_ERROR(B_BAD_TYPE);
1552 
1553 	// lock access to stream (a read lock is okay for this purpose)
1554 	ReadLocked locked(fStream->Lock());
1555 
1556 	off_t nodeOffset = fHeader->RootNode();
1557 	CachedNode cached(this);
1558 	bplustree_node *node;
1559 
1560 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1561 		uint16 keyIndex = 0;
1562 		off_t nextOffset;
1563 		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1564 
1565 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1566 			if (status == B_OK) {
1567 				node->Values()[keyIndex] = value;
1568 				return cached.WriteBack(transaction);
1569 			}
1570 
1571 			return status;
1572 		} else if (nextOffset == nodeOffset)
1573 			RETURN_ERROR(B_ERROR);
1574 
1575 		nodeOffset = nextOffset;
1576 	}
1577 	RETURN_ERROR(B_ERROR);
1578 }
1579 
1580 
1581 /**	Searches the key in the tree, and stores the offset found in
1582  *	_value, if successful.
1583  *	It's very similar to BPlusTree::SeekDown(), but doesn't fill
1584  *	a stack while it descends the tree.
1585  *	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND
1586  *	if not. It can also return other errors to indicate that
1587  *	something went wrong.
1588  *	Note that this doesn't work with duplicates - it will just
1589  *	return B_BAD_TYPE if you call this function on a tree where
1590  *	duplicates are allowed.
1591  */
1592 
1593 status_t
1594 BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value)
1595 {
1596 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
1597 		|| key == NULL)
1598 		RETURN_ERROR(B_BAD_VALUE);
1599 
1600 	if (fAllowDuplicates)
1601 		RETURN_ERROR(B_BAD_TYPE);
1602 
1603 	// lock access to stream
1604 	ReadLocked locked(fStream->Lock());
1605 
1606 	off_t nodeOffset = fHeader->RootNode();
1607 	CachedNode cached(this);
1608 	bplustree_node *node;
1609 
1610 #ifdef DEBUG
1611 	int32 levels = 0;
1612 #endif
1613 
1614 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1615 		uint16 keyIndex = 0;
1616 		off_t nextOffset;
1617 		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1618 
1619 #ifdef DEBUG
1620 		levels++;
1621 #endif
1622 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1623 			if (status == B_OK && _value != NULL)
1624 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
1625 
1626 #ifdef DEBUG
1627 			if (levels != (int32)fHeader->MaxNumberOfLevels())
1628 				DEBUGGER(("levels don't match"));
1629 #endif
1630 			return status;
1631 		} else if (nextOffset == nodeOffset)
1632 			RETURN_ERROR(B_ERROR);
1633 
1634 		nodeOffset = nextOffset;
1635 	}
1636 	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
1637 	RETURN_ERROR(B_ERROR);
1638 }
1639 
1640 
1641 //	#pragma mark -
1642 
1643 
1644 TreeIterator::TreeIterator(BPlusTree *tree)
1645 	:
1646 	fTree(tree),
1647 	fCurrentNodeOffset(BPLUSTREE_NULL),
1648 	fNext(NULL)
1649 {
1650 	tree->AddIterator(this);
1651 }
1652 
1653 
1654 TreeIterator::~TreeIterator()
1655 {
1656 	if (fTree)
1657 		fTree->RemoveIterator(this);
1658 }
1659 
1660 
1661 status_t
1662 TreeIterator::Goto(int8 to)
1663 {
1664 	if (fTree == NULL || fTree->fHeader == NULL)
1665 		RETURN_ERROR(B_BAD_VALUE);
1666 
1667 	// lock access to stream
1668 	ReadLocked locked(fTree->fStream->Lock());
1669 
1670 	off_t nodeOffset = fTree->fHeader->RootNode();
1671 	CachedNode cached(fTree);
1672 	bplustree_node *node;
1673 
1674 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1675 		// is the node a leaf node?
1676 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1677 			fCurrentNodeOffset = nodeOffset;
1678 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
1679 			fDuplicateNode = BPLUSTREE_NULL;
1680 
1681 			return B_OK;
1682 		}
1683 
1684 		// get the next node offset depending on the direction (and if there
1685 		// are any keys in that node at all)
1686 		off_t nextOffset;
1687 		if (to == BPLUSTREE_END || node->all_key_count == 0)
1688 			nextOffset = node->OverflowLink();
1689 		else {
1690 			if (node->AllKeyLength() > fTree->fNodeSize
1691 				|| (uint32)node->Values() > (uint32)node + fTree->fNodeSize - 8 * node->NumKeys())
1692 				RETURN_ERROR(B_ERROR);
1693 
1694 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
1695 		}
1696 		if (nextOffset == nodeOffset)
1697 			break;
1698 
1699 		nodeOffset = nextOffset;
1700 	}
1701 	FATAL(("%s fails\n", __FUNCTION__));
1702 
1703 	RETURN_ERROR(B_ERROR);
1704 }
1705 
1706 
1707 /**	Iterates through the tree in the specified direction.
1708  *	When it iterates through duplicates, the "key" is only updated for the
1709  *	first entry - if you need to know when this happens, use the "duplicate"
1710  *	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
1711  *	the other duplicates.
1712  *	That's not too nice, but saves the 256 bytes that would be needed to
1713  *	store the last key - if this will ever become an issue, it will be
1714  *	easy to change.
1715  *	The other advantage of this is, that the queries can skip all duplicates
1716  *	at once when they are not relevant to them.
1717  */
1718 
1719 status_t
1720 TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, uint16 maxLength,
1721 	off_t *value, uint16 *duplicate)
1722 {
1723 	if (fTree == NULL)
1724 		return B_INTERRUPTED;
1725 	if (fCurrentNodeOffset == BPLUSTREE_NULL
1726 		&& Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
1727 		RETURN_ERROR(B_ERROR);
1728 
1729 	// if the tree was emptied since the last call
1730 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
1731 		return B_ENTRY_NOT_FOUND;
1732 
1733 	// lock access to stream
1734 	ReadLocked locked(fTree->fStream->Lock());
1735 
1736 	CachedNode cached(fTree);
1737 	bplustree_node *node;
1738 
1739 	if (fDuplicateNode != BPLUSTREE_NULL) {
1740 		// regardless of traverse direction the duplicates are always presented in
1741 		// the same order; since they are all considered as equal, this shouldn't
1742 		// cause any problems
1743 
1744 		if (!fIsFragment || fDuplicate < fNumDuplicates)
1745 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
1746 		else
1747 			node = NULL;
1748 
1749 		if (node != NULL) {
1750 			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
1751 				// if the node is out of duplicates, we go directly to the next one
1752 				fDuplicateNode = node->RightLink();
1753 				if (fDuplicateNode != BPLUSTREE_NULL
1754 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
1755 					fNumDuplicates = node->CountDuplicates(fDuplicateNode, false);
1756 					fDuplicate = 0;
1757 				}
1758 			}
1759 			if (fDuplicate < fNumDuplicates) {
1760 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++);
1761 				if (duplicate)
1762 					*duplicate = 2;
1763 				return B_OK;
1764 			}
1765 		}
1766 		fDuplicateNode = BPLUSTREE_NULL;
1767 	}
1768 
1769 	off_t savedNodeOffset = fCurrentNodeOffset;
1770 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
1771 		RETURN_ERROR(B_ERROR);
1772 
1773 	if (duplicate)
1774 		*duplicate = 0;
1775 
1776 	fCurrentKey += direction;
1777 
1778 	// is the current key in the current node?
1779 	while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->NumKeys())
1780 			|| (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0)) {
1781 		fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->RightLink() : node->LeftLink();
1782 
1783 		// are there any more nodes?
1784 		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
1785 			node = cached.SetTo(fCurrentNodeOffset);
1786 			if (!node)
1787 				RETURN_ERROR(B_ERROR);
1788 
1789 			// reset current key
1790 			fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->NumKeys();
1791 		} else {
1792 			// there are no nodes left, so turn back to the last key
1793 			fCurrentNodeOffset = savedNodeOffset;
1794 			fCurrentKey = direction == BPLUSTREE_FORWARD ? node->NumKeys() : -1;
1795 
1796 			return B_ENTRY_NOT_FOUND;
1797 		}
1798 	}
1799 
1800 	if (node->all_key_count == 0)
1801 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
1802 
1803 	uint16 length;
1804 	uint8 *keyStart = node->KeyAt(fCurrentKey, &length);
1805 	if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize
1806 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1807 		fTree->fStream->GetVolume()->Panic();
1808 		RETURN_ERROR(B_BAD_DATA);
1809 	}
1810 
1811 	length = min_c(length, maxLength);
1812 	memcpy(key, keyStart, length);
1813 
1814 	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE)	{
1815 		// terminate string type
1816 		if (length == maxLength)
1817 			length--;
1818 		((char *)key)[length] = '\0';
1819 	}
1820 	*keyLength = length;
1821 
1822 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
1823 
1824 	// duplicate fragments?
1825 	uint8 type = bplustree_node::LinkType(offset);
1826 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) {
1827 		fDuplicateNode = offset;
1828 
1829 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
1830 		if (node == NULL)
1831 			RETURN_ERROR(B_ERROR);
1832 
1833 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
1834 
1835 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
1836 		if (fNumDuplicates) {
1837 			offset = node->DuplicateAt(offset, fIsFragment, 0);
1838 			fDuplicate = 1;
1839 			if (duplicate)
1840 				*duplicate = 1;
1841 		} else {
1842 			// shouldn't happen, but we're dealing here with potentially corrupt disks...
1843 			fDuplicateNode = BPLUSTREE_NULL;
1844 			offset = 0;
1845 		}
1846 	}
1847 	*value = offset;
1848 
1849 	return B_OK;
1850 }
1851 
1852 
1853 /**	This is more or less a copy of BPlusTree::Find() - but it just
1854  *	sets the current position in the iterator, regardless of if the
1855  *	key could be found or not.
1856  */
1857 
1858 status_t
1859 TreeIterator::Find(const uint8 *key, uint16 keyLength)
1860 {
1861 	if (fTree == NULL)
1862 		return B_INTERRUPTED;
1863 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
1864 		|| key == NULL)
1865 		RETURN_ERROR(B_BAD_VALUE);
1866 
1867 	// lock access to stream
1868 	ReadLocked locked(fTree->fStream->Lock());
1869 
1870 	off_t nodeOffset = fTree->fHeader->RootNode();
1871 
1872 	CachedNode cached(fTree);
1873 	bplustree_node *node;
1874 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1875 		uint16 keyIndex = 0;
1876 		off_t nextOffset;
1877 		status_t status = fTree->FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1878 
1879 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1880 			fCurrentNodeOffset = nodeOffset;
1881 			fCurrentKey = keyIndex - 1;
1882 			fDuplicateNode = BPLUSTREE_NULL;
1883 
1884 			return status;
1885 		} else if (nextOffset == nodeOffset)
1886 			RETURN_ERROR(B_ERROR);
1887 
1888 		nodeOffset = nextOffset;
1889 	}
1890 	RETURN_ERROR(B_ERROR);
1891 }
1892 
1893 
1894 void
1895 TreeIterator::SkipDuplicates()
1896 {
1897 	fDuplicateNode = BPLUSTREE_NULL;
1898 }
1899 
1900 
1901 void
1902 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change)
1903 {
1904 	if (offset != fCurrentNodeOffset)
1905 		return;
1906 
1907 	if (nextOffset != BPLUSTREE_NULL) {
1908 		fCurrentNodeOffset = nextOffset;
1909 		if (splitAt <= fCurrentKey) {
1910 			fCurrentKey -= splitAt;
1911 			keyIndex -= splitAt;
1912 		}
1913 	}
1914 
1915 	// Adjust fCurrentKey to point to the same key as before.
1916 	// Note, that if a key is inserted at the current position
1917 	// it won't be included in this tree transition.
1918 	if (keyIndex <= fCurrentKey)
1919 		fCurrentKey += change;
1920 
1921 	// ToDo: duplicate handling!
1922 }
1923 
1924 
1925 void
1926 TreeIterator::Stop()
1927 {
1928 	fTree = NULL;
1929 }
1930 
1931 
1932 #ifdef DEBUG
1933 void
1934 TreeIterator::Dump()
1935 {
1936 	__out("TreeIterator at %p:\n", this);
1937 	__out("\tfTree = %p\n", fTree);
1938 	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
1939 	__out("\tfCurrentKey = %ld\n", fCurrentKey);
1940 	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode);
1941 	__out("\tfDuplicate = %u\n", fDuplicate);
1942 	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
1943 	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
1944 }
1945 #endif
1946 
1947 
1948 //	#pragma mark -
1949 
1950 
1951 void
1952 bplustree_node::Initialize()
1953 {
1954 	left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1955 	all_key_count = 0;
1956 	all_key_length = 0;
1957 }
1958 
1959 
1960 uint8 *
1961 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const
1962 {
1963 	if (index < 0 || index > NumKeys())
1964 		return NULL;
1965 
1966 	uint8 *keyStart = Keys();
1967 	uint16 *keyLengths = KeyLengths();
1968 
1969 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
1970 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
1971 	if (index > 0)
1972 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
1973 
1974 	return keyStart;
1975 }
1976 
1977 
1978 uint8
1979 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
1980 {
1981 	// the duplicate fragment handling is currently hard-coded to a node size
1982 	// of 1024 bytes - with future versions of BFS, this may be a problem
1983 
1984 	if (isFragment) {
1985 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
1986 
1987 		return ((off_t *)this)[fragment];
1988 	}
1989 	return OverflowLink();
1990 }
1991 
1992 
1993 off_t
1994 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
1995 {
1996 	uint32 start;
1997 	if (isFragment)
1998 		start = 8 * ((uint64)offset & 0x3ff);
1999 	else
2000 		start = 2;
2001 
2002 	return ((off_t *)this)[start + 1 + index];
2003 }
2004 
2005 
2006 /**	Although the name suggests it, this function doesn't return the real
2007  *	used fragment count; at least, it can only count to two: it returns
2008  *	0, if there is no fragment used, 1 if there is only one fragment
2009  *	used, and 2 if there are at least 2 fragments used.
2010  */
2011 
2012 int32
2013 bplustree_node::FragmentsUsed(uint32 nodeSize)
2014 {
2015 	uint32 used = 0;
2016 	for (uint32 i = 0; i < nodeSize / ((NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); i++) {
2017 		duplicate_array *array = FragmentAt(i);
2018 		if (array->count > 0 && ++used > 1)
2019 			return used;
2020 	}
2021 	return used;
2022 }
2023 
2024 
2025 #ifdef DEBUG
2026 void
2027 bplustree_node::CheckIntegrity(uint32 nodeSize)
2028 {
2029 	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2030 		DEBUGGER(("invalid node: key/length count"));
2031 
2032 	for (int32 i = 0; i < NumKeys(); i++) {
2033 		uint16 length;
2034 		uint8 *key = KeyAt(i, &length);
2035 		if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize
2036 			|| length > BPLUSTREE_MAX_KEY_LENGTH)
2037 			DEBUGGER(("invalid node: keys corrupted"));
2038 	}
2039 }
2040 #endif
2041 
2042 
2043 //	#pragma mark -
2044 
2045 
2046 int32
2047 compareKeys(type_code type, const void *key1, int keyLength1, const void *key2, int keyLength2)
2048 {
2049 	// if one of the keys is NULL, bail out gracefully
2050 	if (key1 == NULL || key2 == NULL) {
2051 //#if USER
2052 //		// that's here to see if it's reasonable to handle this case nicely at all
2053 //		DEBUGGER(("compareKeys() got NULL key!"));
2054 //#endif
2055 		// even if it's most probably a bug in the calling code, we try to
2056 		// give a meaningful result
2057 		if (key1 == NULL && key2 != NULL)
2058 			return 1;
2059 		else if (key2 == NULL && key1 != NULL)
2060 			return -1;
2061 
2062 		return 0;
2063 	}
2064 
2065 	switch (type) {
2066 	    case B_STRING_TYPE:
2067     	{
2068 			int len = min_c(keyLength1, keyLength2);
2069 			int result = strncmp((const char *)key1, (const char *)key2, len);
2070 
2071 			if (result == 0
2072 				&& !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0'))
2073 				result = keyLength1 - keyLength2;
2074 
2075 			return result;
2076 		}
2077 
2078 		case B_SSIZE_T_TYPE:
2079 		case B_INT32_TYPE:
2080 			return *(int32 *)key1 - *(int32 *)key2;
2081 
2082 		case B_SIZE_T_TYPE:
2083 		case B_UINT32_TYPE:
2084 			if (*(uint32 *)key1 == *(uint32 *)key2)
2085 				return 0;
2086 			else if (*(uint32 *)key1 > *(uint32 *)key2)
2087 				return 1;
2088 
2089 			return -1;
2090 
2091 		case B_OFF_T_TYPE:
2092 		case B_INT64_TYPE:
2093 			if (*(int64 *)key1 == *(int64 *)key2)
2094 				return 0;
2095 			else if (*(int64 *)key1 > *(int64 *)key2)
2096 				return 1;
2097 
2098 			return -1;
2099 
2100 		case B_UINT64_TYPE:
2101 			if (*(uint64 *)key1 == *(uint64 *)key2)
2102 				return 0;
2103 			else if (*(uint64 *)key1 > *(uint64 *)key2)
2104 				return 1;
2105 
2106 			return -1;
2107 
2108 		case B_FLOAT_TYPE:
2109 		{
2110 			float result = *(float *)key1 - *(float *)key2;
2111 			if (result == 0.0f)
2112 				return 0;
2113 
2114 			return (result < 0.0f) ? -1 : 1;
2115 		}
2116 
2117 		case B_DOUBLE_TYPE:
2118 		{
2119 			double result = *(double *)key1 - *(double *)key2;
2120 			if (result == 0.0)
2121 				return 0;
2122 
2123 			return (result < 0.0) ? -1 : 1;
2124 		}
2125 	}
2126 
2127 	// if the type is unknown, the entries don't match...
2128 	return -1;
2129 }
2130 
2131