xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision d5cd5d63ff0ad395989db6cf4841a64d5b545d1d)
1 /* BPlusTree - BFS B+Tree implementation
2 **
3 ** Initial version by Axel Dörfler, axeld@pinc-software.de
4 ** Roughly based on 'btlib' written by Marcus J. Ranum
5 **
6 ** Copyright (c) 2001-2002 pinc Software. All Rights Reserved.
7 ** This file may be used under the terms of the OpenBeOS License.
8 */
9 
10 
11 #include "Debug.h"
12 #include "cpp.h"
13 #include "BPlusTree.h"
14 #include "Inode.h"
15 #include "Utility.h"
16 #include "Stack.h"
17 
18 #include <TypeConstants.h>
19 
20 #include <string.h>
21 #include <stdlib.h>
22 #include <stdio.h>
23 
24 
25 #ifdef DEBUG
26 class NodeChecker {
27 	public:
28 		NodeChecker(bplustree_node *node, int32 nodeSize)
29 			:
30 			fNode(node),
31 			fSize(nodeSize)
32 		{
33 		}
34 
35 		~NodeChecker()
36 		{
37 			fNode->CheckIntegrity(fSize);
38 		}
39 
40 	private:
41 		bplustree_node *fNode;
42 		int32	fSize;
43 };
44 #endif
45 
46 // Node Caching for the BPlusTree class
47 //
48 // With write support, there is the need for a function that allocates new
49 // nodes by either returning empty nodes, or by growing the file's data stream
50 //
51 // !! The CachedNode class assumes that you have properly locked the stream
52 // !! before asking for nodes.
53 //
54 // Note: This code will fail if the block size is smaller than the node size!
55 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
56 // is hard-coded to 1024 bytes, that's not an issue now.
57 
58 void
59 CachedNode::Unset()
60 {
61 	if (fTree == NULL || fTree->fStream == NULL)
62 		return;
63 
64 	if (fBlock != NULL) {
65 		release_block(fTree->fStream->GetVolume()->Device(),fBlockNumber);
66 
67 		fBlock = NULL;
68 		fNode = NULL;
69 	}
70 }
71 
72 
73 bplustree_node *
74 CachedNode::SetTo(off_t offset,bool check)
75 {
76 	if (fTree == NULL || fTree->fStream == NULL) {
77 		REPORT_ERROR(B_BAD_VALUE);
78 		return NULL;
79 	}
80 
81 	Unset();
82 
83 	// You can only ask for nodes at valid positions - you can't
84 	// even access the b+tree header with this method (use SetToHeader()
85 	// instead)
86 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
87 		|| offset <= 0
88 		|| (offset % fTree->fNodeSize) != 0)
89 		return NULL;
90 
91 	if (InternalSetTo(offset) != NULL && check) {
92 		// sanity checks (links, all_key_count)
93 		bplustree_header *header = fTree->fHeader;
94 		if (!header->IsValidLink(fNode->LeftLink())
95 			|| !header->IsValidLink(fNode->RightLink())
96 			|| !header->IsValidLink(fNode->OverflowLink())
97 			|| (int8 *)fNode->Values() + fNode->NumKeys() * sizeof(off_t) >
98 					(int8 *)fNode + fTree->fNodeSize) {
99 			FATAL(("invalid node read from offset %Ld, inode at %Ld\n",
100 					offset,fTree->fStream->ID()));
101 			return NULL;
102 		}
103 	}
104 	return fNode;
105 }
106 
107 
108 bplustree_header *
109 CachedNode::SetToHeader()
110 {
111 	if (fTree == NULL || fTree->fStream == NULL) {
112 		REPORT_ERROR(B_BAD_VALUE);
113 		return NULL;
114 	}
115 
116 	Unset();
117 
118 	InternalSetTo(0LL);
119 	return (bplustree_header *)fNode;
120 }
121 
122 
123 bplustree_node *
124 CachedNode::InternalSetTo(off_t offset)
125 {
126 	fNode = NULL;
127 
128 	off_t fileOffset;
129 	block_run run;
130 	if (offset < fTree->fStream->Size()
131 		&& fTree->fStream->FindBlockRun(offset,run,fileOffset) == B_OK) {
132 		Volume *volume = fTree->fStream->GetVolume();
133 
134 		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
135 		fBlockNumber = volume->ToBlock(run) + blockOffset;
136 
137 		fBlock = (uint8 *)get_block(volume->Device(),fBlockNumber,volume->BlockSize());
138 		if (fBlock) {
139 			// the node is somewhere in that block... (confusing offset calculation)
140 			fNode = (bplustree_node *)(fBlock + offset -
141 						(fileOffset + blockOffset * volume->BlockSize()));
142 		} else
143 			REPORT_ERROR(B_IO_ERROR);
144 	}
145 	return fNode;
146 }
147 
148 
149 status_t
150 CachedNode::Free(Transaction *transaction,off_t offset)
151 {
152 	if (transaction == NULL || fTree == NULL || fTree->fStream == NULL
153 		|| offset == BPLUSTREE_NULL)
154 		RETURN_ERROR(B_BAD_VALUE);
155 
156 	// ToDo: scan the free nodes list and remove all nodes at the end
157 	// of the tree - perhaps that shouldn't be done everytime that
158 	// function is called, perhaps it should be done when the directory
159 	// inode is closed or based on some calculation or whatever...
160 
161 	// if the node is the last one in the tree, we shrink
162 	// the tree and file size by one node
163 	off_t lastOffset = fTree->fHeader->MaximumSize() - fTree->fNodeSize;
164 	if (offset == lastOffset) {
165 		fTree->fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
166 
167 		status_t status = fTree->fStream->SetFileSize(transaction,lastOffset);
168 		if (status < B_OK)
169 			return status;
170 
171 		return fTree->fCachedHeader.WriteBack(transaction);
172 	}
173 
174 	// add the node to the free nodes list
175 	fNode->left_link = fTree->fHeader->free_node_pointer;
176 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
177 
178 	if (WriteBack(transaction) == B_OK) {
179 		fTree->fHeader->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
180 		return fTree->fCachedHeader.WriteBack(transaction);
181 	}
182 	return B_ERROR;
183 }
184 
185 
186 status_t
187 CachedNode::Allocate(Transaction *transaction, bplustree_node **_node, off_t *_offset)
188 {
189 	if (transaction == NULL || fTree == NULL || fTree->fHeader == NULL
190 		|| fTree->fStream == NULL) {
191 		RETURN_ERROR(B_BAD_VALUE);
192 	}
193 
194 	status_t status;
195 
196 	// if there are any free nodes, recycle them
197 	if (SetTo(fTree->fHeader->FreeNode(), false) != NULL) {
198 		*_offset = fTree->fHeader->FreeNode();
199 
200 		// set new free node pointer
201 		fTree->fHeader->free_node_pointer = fNode->left_link;
202 		if ((status = fTree->fCachedHeader.WriteBack(transaction)) == B_OK) {
203 			fNode->Initialize();
204 			*_node = fNode;
205 			return B_OK;
206 		}
207 		return status;
208 	}
209 	// allocate space for a new node
210 	Inode *stream = fTree->fStream;
211 	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
212 		return status;
213 
214 	// the maximum_size has to be changed before the call to SetTo() - or
215 	// else it will fail because the requested node is out of bounds
216 	off_t offset = fTree->fHeader->MaximumSize();
217 	fTree->fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(fTree->fHeader->MaximumSize() + fTree->fNodeSize);
218 
219 	if (SetTo(offset, false) != NULL) {
220 		*_offset = offset;
221 
222 		if (fTree->fCachedHeader.WriteBack(transaction) >= B_OK) {
223 			fNode->Initialize();
224 			*_node = fNode;
225 			return B_OK;
226 		}
227 	}
228 	RETURN_ERROR(B_ERROR);
229 }
230 
231 
232 status_t
233 CachedNode::WriteBack(Transaction *transaction)
234 {
235 	if (transaction == NULL || fTree == NULL || fTree->fStream == NULL || fNode == NULL)
236 		RETURN_ERROR(B_BAD_VALUE);
237 
238 	return transaction->WriteBlocks(fBlockNumber, fBlock);
239 }
240 
241 
242 //	#pragma mark -
243 
244 
245 BPlusTree::BPlusTree(Transaction *transaction,Inode *stream,int32 nodeSize)
246 	:
247 	fStream(NULL),
248 	fHeader(NULL),
249 	fCachedHeader(this)
250 {
251 	SetTo(transaction,stream);
252 }
253 
254 
255 BPlusTree::BPlusTree(Inode *stream)
256 	:
257 	fStream(NULL),
258 	fHeader(NULL),
259 	fCachedHeader(this)
260 {
261 	SetTo(stream);
262 }
263 
264 
265 BPlusTree::BPlusTree()
266 	:
267 	fStream(NULL),
268 	fHeader(NULL),
269 	fCachedHeader(this),
270 	fNodeSize(BPLUSTREE_NODE_SIZE),
271 	fAllowDuplicates(true),
272 	fStatus(B_NO_INIT)
273 {
274 }
275 
276 
277 BPlusTree::~BPlusTree()
278 {
279 	// if there are any TreeIterators left, we need to stop them
280 	// (can happen when the tree's inode gets deleted while
281 	// traversing the tree - a TreeIterator doesn't lock the inode)
282 	if (fIteratorLock.Lock() < B_OK)
283 		return;
284 
285 	TreeIterator *iterator = NULL;
286 	while ((iterator = fIterators.Next(iterator)) != NULL)
287 		iterator->Stop();
288 
289 	fIteratorLock.Unlock();
290 }
291 
292 
293 /** Create a new B+Tree on the specified stream */
294 
295 status_t
296 BPlusTree::SetTo(Transaction *transaction, Inode *stream, int32 nodeSize)
297 {
298 	// initializes in-memory B+Tree
299 
300 	fCachedHeader.Unset();
301 	fStream = stream;
302 
303 	fHeader = fCachedHeader.SetToHeader();
304 	if (fHeader == NULL) {
305 		// allocate space for new header + node!
306 		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
307 		if (fStatus < B_OK)
308 			RETURN_ERROR(fStatus);
309 
310 		fHeader = fCachedHeader.SetToHeader();
311 		if (fHeader == NULL)
312 			RETURN_ERROR(fStatus = B_ERROR);
313 	}
314 
315 	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
316 						&& stream->BlockRun() != stream->Parent())
317 						|| (stream->Mode() & S_ALLOW_DUPS) != 0;
318 
319 	fNodeSize = nodeSize;
320 
321 	// initialize b+tree header
322  	fHeader->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
323  	fHeader->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
324  	fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
325  	fHeader->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
326  	fHeader->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
327  	fHeader->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
328  	fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);
329 
330 	if (fCachedHeader.WriteBack(transaction) < B_OK)
331 		RETURN_ERROR(fStatus = B_ERROR);
332 
333 	// initialize b+tree root node
334 	CachedNode cached(this, fHeader->RootNode(), false);
335 	if (cached.Node() == NULL)
336 		RETURN_ERROR(B_ERROR);
337 
338 	cached.Node()->Initialize();
339 	return fStatus = cached.WriteBack(transaction);
340 }
341 
342 
343 status_t
344 BPlusTree::SetTo(Inode *stream)
345 {
346 	if (stream == NULL || stream->Node() == NULL)
347 		RETURN_ERROR(fStatus = B_BAD_VALUE);
348 
349 	// get on-disk B+Tree header
350 
351 	fCachedHeader.Unset();
352 	fStream = stream;
353 
354 	fHeader = fCachedHeader.SetToHeader();
355 	if (fHeader == NULL)
356 		RETURN_ERROR(fStatus = B_NO_INIT);
357 
358 	// is header valid?
359 
360 	if (fHeader->Magic() != BPLUSTREE_MAGIC
361 		|| fHeader->MaximumSize() != stream->Size()
362 		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
363 		|| !fHeader->IsValidLink(fHeader->RootNode())
364 		|| !fHeader->IsValidLink(fHeader->FreeNode()))
365 		RETURN_ERROR(fStatus = B_BAD_DATA);
366 
367 	fNodeSize = fHeader->NodeSize();
368 
369 	{
370 		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX, S_LONG_LONG_INDEX,
371 						   S_ULONG_LONG_INDEX, S_FLOAT_INDEX, S_DOUBLE_INDEX};
372 		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
373 						   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX);
374 
375 		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
376 			|| (stream->Mode() & S_INDEX_DIR) && toMode[fHeader->DataType()] != mode
377 			|| !stream->IsContainer()) {
378 			D(	dump_bplustree_header(fHeader);
379 				dump_inode(stream->Node());
380 			);
381 			RETURN_ERROR(fStatus = B_BAD_TYPE);
382 		}
383 
384 		 // although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
385 		 // in the original BFS code - we will honour it nevertheless
386 		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
387 							&& stream->BlockRun() != stream->Parent())
388 							|| (stream->Mode() & S_ALLOW_DUPS) != 0;
389 	}
390 
391 	CachedNode cached(this, fHeader->RootNode());
392 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
393 }
394 
395 
396 status_t
397 BPlusTree::InitCheck()
398 {
399 	return fStatus;
400 }
401 
402 
403 int32
404 BPlusTree::TypeCodeToKeyType(type_code code)
405 {
406 	switch (code) {
407 		case B_STRING_TYPE:
408 			return BPLUSTREE_STRING_TYPE;
409 		case B_SSIZE_T_TYPE:
410 		case B_INT32_TYPE:
411 			return BPLUSTREE_INT32_TYPE;
412 		case B_SIZE_T_TYPE:
413 		case B_UINT32_TYPE:
414 			return BPLUSTREE_UINT32_TYPE;
415 		case B_OFF_T_TYPE:
416 		case B_INT64_TYPE:
417 			return BPLUSTREE_INT64_TYPE;
418 		case B_UINT64_TYPE:
419 			return BPLUSTREE_UINT64_TYPE;
420 		case B_FLOAT_TYPE:
421 			return BPLUSTREE_FLOAT_TYPE;
422 		case B_DOUBLE_TYPE:
423 			return BPLUSTREE_DOUBLE_TYPE;
424 	}
425 	return -1;
426 }
427 
428 
429 int32
430 BPlusTree::ModeToKeyType(mode_t mode)
431 {
432 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
433 				   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
434 		case S_INT_INDEX:
435 			return BPLUSTREE_INT32_TYPE;
436 		case S_UINT_INDEX:
437 			return BPLUSTREE_UINT32_TYPE;
438 		case S_LONG_LONG_INDEX:
439 			return BPLUSTREE_INT64_TYPE;
440 		case S_ULONG_LONG_INDEX:
441 			return BPLUSTREE_UINT64_TYPE;
442 		case S_FLOAT_INDEX:
443 			return BPLUSTREE_FLOAT_TYPE;
444 		case S_DOUBLE_INDEX:
445 			return BPLUSTREE_DOUBLE_TYPE;
446 		case S_STR_INDEX:
447 		default:
448 			// default is for standard directories
449 			return BPLUSTREE_STRING_TYPE;
450 	}
451 }
452 
453 
454 //	#pragma mark -
455 
456 
457 void
458 BPlusTree::UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt,
459 	int8 change)
460 {
461 	// Although every iterator which is affected by this update currently
462 	// waits on a semaphore, other iterators could be added/removed at
463 	// any time, so we need to protect this loop
464 	if (fIteratorLock.Lock() < B_OK)
465 		return;
466 
467 	TreeIterator *iterator = NULL;
468 	while ((iterator = fIterators.Next(iterator)) != NULL)
469 		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);
470 
471 	fIteratorLock.Unlock();
472 }
473 
474 
475 void
476 BPlusTree::AddIterator(TreeIterator *iterator)
477 {
478 	if (fIteratorLock.Lock() < B_OK)
479 		return;
480 
481 	fIterators.Add(iterator);
482 
483 	fIteratorLock.Unlock();
484 }
485 
486 
487 void
488 BPlusTree::RemoveIterator(TreeIterator *iterator)
489 {
490 	if (fIteratorLock.Lock() < B_OK)
491 		return;
492 
493 	fIterators.Remove(iterator);
494 
495 	fIteratorLock.Unlock();
496 }
497 
498 
499 int32
500 BPlusTree::CompareKeys(const void *key1, int keyLength1, const void *key2, int keyLength2)
501 {
502 	type_code type = 0;
503 	switch (fHeader->data_type)
504 	{
505 	    case BPLUSTREE_STRING_TYPE:
506 	    	type = B_STRING_TYPE;
507 	    	break;
508 		case BPLUSTREE_INT32_TYPE:
509 	    	type = B_INT32_TYPE;
510 	    	break;
511 		case BPLUSTREE_UINT32_TYPE:
512 	    	type = B_UINT32_TYPE;
513 	    	break;
514 		case BPLUSTREE_INT64_TYPE:
515 	    	type = B_INT64_TYPE;
516 	    	break;
517 		case BPLUSTREE_UINT64_TYPE:
518 	    	type = B_UINT64_TYPE;
519 	    	break;
520 		case BPLUSTREE_FLOAT_TYPE:
521 	    	type = B_FLOAT_TYPE;
522 	    	break;
523 		case BPLUSTREE_DOUBLE_TYPE:
524 	    	type = B_DOUBLE_TYPE;
525 	    	break;
526 	}
527    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
528 }
529 
530 
531 status_t
532 BPlusTree::FindKey(bplustree_node *node, const uint8 *key, uint16 keyLength, uint16 *index,
533 	off_t *next)
534 {
535 	if (node->all_key_count == 0) {
536 		if (index)
537 			*index = 0;
538 		if (next)
539 			*next = node->OverflowLink();
540 		return B_ENTRY_NOT_FOUND;
541 	}
542 
543 	off_t *values = node->Values();
544 	int16 saveIndex = -1;
545 
546 	// binary search in the key array
547 	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
548 		uint16 i = (first + last) >> 1;
549 
550 		uint16 searchLength;
551 		uint8 *searchKey = node->KeyAt(i, &searchLength);
552 		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
553 			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
554 			fStream->GetVolume()->Panic();
555 			RETURN_ERROR(B_BAD_DATA);
556 		}
557 
558 		int32 cmp = CompareKeys(key, keyLength, searchKey, searchLength);
559 		if (cmp < 0) {
560 			last = i - 1;
561 			saveIndex = i;
562 		} else if (cmp > 0) {
563 			saveIndex = first = i + 1;
564 		} else {
565 			if (index)
566 				*index = i;
567 			if (next)
568 				*next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
569 			return B_OK;
570 		}
571 	}
572 
573 	if (index)
574 		*index = saveIndex;
575 	if (next) {
576 		if (saveIndex == node->NumKeys())
577 			*next = node->OverflowLink();
578 		else
579 			*next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
580 	}
581 	return B_ENTRY_NOT_FOUND;
582 }
583 
584 
585 /**	Prepares the stack to contain all nodes that were passed while
586  *	following the key, from the root node to the leaf node that could
587  *	or should contain that key.
588  */
589 
590 status_t
591 BPlusTree::SeekDown(Stack<node_and_key> &stack, const uint8 *key, uint16 keyLength)
592 {
593 	// set the root node to begin with
594 	node_and_key nodeAndKey;
595 	nodeAndKey.nodeOffset = fHeader->RootNode();
596 
597 	CachedNode cached(this);
598 	bplustree_node *node;
599 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
600 		// if we are already on leaf level, we're done
601 		if (node->OverflowLink() == BPLUSTREE_NULL) {
602 			// node that the keyIndex is not properly set here (but it's not
603 			// needed in the calling functions anyway)!
604 			nodeAndKey.keyIndex = 0;
605 			stack.Push(nodeAndKey);
606 			return B_OK;
607 		}
608 
609 		off_t nextOffset;
610 		status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex, &nextOffset);
611 
612 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
613 			RETURN_ERROR(B_ERROR);
614 
615 		// put the node offset & the correct keyIndex on the stack
616 		stack.Push(nodeAndKey);
617 
618 		nodeAndKey.nodeOffset = nextOffset;
619 	}
620 	RETURN_ERROR(B_ERROR);
621 }
622 
623 
624 status_t
625 BPlusTree::FindFreeDuplicateFragment(bplustree_node *node, CachedNode *cached, off_t *_offset,
626 	bplustree_node **_fragment, uint32 *_index)
627 {
628 	off_t *values = node->Values();
629 	for (int32 i = 0; i < node->NumKeys(); i++) {
630 		// does the value link to a duplicate fragment?
631 		if (bplustree_node::LinkType(values[i]) != BPLUSTREE_DUPLICATE_FRAGMENT)
632 			continue;
633 
634 		bplustree_node *fragment = cached->SetTo(bplustree_node::FragmentOffset(values[i]), false);
635 		if (fragment == NULL) {
636 			FATAL(("Could not get duplicate fragment at %Ld\n",values[i]));
637 			continue;
638 		}
639 
640 		// see if there is some space left for us
641 		int32 num = (fNodeSize >> 3) / (NUM_FRAGMENT_VALUES + 1);
642 		for (int32 j = 0;j < num;j++) {
643 			duplicate_array *array = fragment->FragmentAt(j);
644 
645 			if (array->count == 0) {
646 				*_offset = bplustree_node::FragmentOffset(values[i]);
647 				*_fragment = fragment;
648 				*_index = j;
649 				return B_OK;
650 			}
651 		}
652 	}
653 	return B_ENTRY_NOT_FOUND;
654 }
655 
656 
657 status_t
658 BPlusTree::InsertDuplicate(Transaction *transaction, CachedNode *cached, bplustree_node *node,
659 	uint16 index, off_t value)
660 {
661 	CachedNode cachedDuplicate(this);
662 	off_t *values = node->Values();
663 	off_t oldValue = values[index];
664 	status_t status;
665 	off_t offset;
666 
667 	if (bplustree_node::IsDuplicate(oldValue)) {
668 		//
669 		// If it's a duplicate fragment, try to insert it into that, or if it
670 		// doesn't fit anymore, create a new duplicate node
671 		//
672 		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
673 			bplustree_node *duplicate = cachedDuplicate.SetTo(bplustree_node::FragmentOffset(oldValue),false);
674 			if (duplicate == NULL)
675 				return B_IO_ERROR;
676 
677 			duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));
678 			if (array->count > NUM_FRAGMENT_VALUES
679 				|| array->count < 1) {
680 				FATAL(("insertDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",bplustree_node::FragmentIndex(oldValue),bplustree_node::FragmentOffset(oldValue),array->count));
681 				return B_BAD_DATA;
682 			}
683 
684 			if (array->count < NUM_FRAGMENT_VALUES) {
685 				array->Insert(value);
686 			} else {
687 				// test if the fragment will be empty if we remove this key's values
688 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
689 					// the node will be empty without our values, so let us
690 					// reuse it as a duplicate node
691 					offset = bplustree_node::FragmentOffset(oldValue);
692 
693 					memmove(duplicate->DuplicateArray(), array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
694 					duplicate->left_link = duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
695 
696 					array = duplicate->DuplicateArray();
697 					array->Insert(value);
698 				} else {
699 					// create a new duplicate node
700 					CachedNode cachedNewDuplicate(this);
701 					bplustree_node *newDuplicate;
702 					status = cachedNewDuplicate.Allocate(transaction, &newDuplicate, &offset);
703 					if (status < B_OK)
704 						return status;
705 
706 					// copy the array from the fragment node to the duplicate node
707 					// and free the old entry (by zero'ing all values)
708 					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(array->count);
709 					memcpy(&newDuplicate->all_key_count, &array->values[0],
710 						array->count * sizeof(off_t));
711 					memset(array,0,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
712 
713 					array = newDuplicate->DuplicateArray();
714 					array->Insert(value);
715 
716 					// if this fails, the old fragments node will contain wrong
717 					// data... (but since it couldn't be written, it shouldn't
718 					// be fatal)
719 					if ((status = cachedNewDuplicate.WriteBack(transaction)) < B_OK)
720 						return status;
721 				}
722 
723 				// update the main pointer to link to a duplicate node
724 				values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE, offset);
725 				if ((status = cached->WriteBack(transaction)) < B_OK)
726 					return status;
727 			}
728 
729 			return cachedDuplicate.WriteBack(transaction);
730 		}
731 
732 		//
733 		// Put the value into a dedicated duplicate node
734 		//
735 
736 		// search for free space in the duplicate nodes of that key
737 		duplicate_array *array;
738 		bplustree_node *duplicate;
739 		off_t duplicateOffset;
740 		do {
741 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
742 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
743 			if (duplicate == NULL)
744 				return B_IO_ERROR;
745 
746 			array = duplicate->DuplicateArray();
747 			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
748 				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
749 					duplicateOffset,array->count));
750 				return B_BAD_DATA;
751 			}
752 		} while (array->count >= NUM_DUPLICATE_VALUES
753 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
754 
755 		if (array->count < NUM_DUPLICATE_VALUES) {
756 			array->Insert(value);
757 		} else {
758 			// no space left - add a new duplicate node
759 
760 			CachedNode cachedNewDuplicate(this);
761 			bplustree_node *newDuplicate;
762 			status = cachedNewDuplicate.Allocate(transaction, &newDuplicate, &offset);
763 			if (status < B_OK)
764 				return status;
765 
766 			// link the two nodes together
767 			duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
768 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
769 
770 			array = newDuplicate->DuplicateArray();
771 			array->count = 0;
772 			array->Insert(value);
773 
774 			status = cachedNewDuplicate.WriteBack(transaction);
775 			if (status < B_OK)
776 				return status;
777 		}
778 		return cachedDuplicate.WriteBack(transaction);
779 	}
780 
781 	//
782 	// Search for a free duplicate fragment or create a new one
783 	// to insert the duplicate value into
784 	//
785 
786 	uint32 fragmentIndex = 0;
787 	bplustree_node *fragment;
788 	if (FindFreeDuplicateFragment(node, &cachedDuplicate, &offset, &fragment, &fragmentIndex) < B_OK) {
789 		// allocate a new duplicate fragment node
790 		if ((status = cachedDuplicate.Allocate(transaction, &fragment, &offset)) < B_OK)
791 			return status;
792 
793 		memset(fragment, 0, fNodeSize);
794 	}
795 	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
796 	array->Insert(oldValue);
797 	array->Insert(value);
798 
799 	if ((status = cachedDuplicate.WriteBack(transaction)) < B_OK)
800 		return status;
801 
802 	values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex);
803 
804 	return cached->WriteBack(transaction);
805 }
806 
807 
808 void
809 BPlusTree::InsertKey(bplustree_node *node, uint16 index, uint8 *key, uint16 keyLength,
810 	off_t value)
811 {
812 	// should never happen, but who knows?
813 	if (index > node->NumKeys())
814 		return;
815 
816 	off_t *values = node->Values();
817 	uint16 *keyLengths = node->KeyLengths();
818 	uint8 *keys = node->Keys();
819 
820 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
821 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength() + keyLength);
822 
823 	off_t *newValues = node->Values();
824 	uint16 *newKeyLengths = node->KeyLengths();
825 
826 	// move values and copy new value into them
827 	memmove(newValues + index + 1, values + index, sizeof(off_t) * (node->NumKeys() - 1 - index));
828 	memmove(newValues, values, sizeof(off_t) * index);
829 
830 	newValues[index] = value;
831 
832 	// move and update key length index
833 	for (uint16 i = node->NumKeys(); i-- > index + 1;)
834 		newKeyLengths[i] = keyLengths[i - 1] + keyLength;
835 	memmove(newKeyLengths,keyLengths,sizeof(uint16) * index);
836 
837 	int32 keyStart;
838 	newKeyLengths[index] = keyLength + (keyStart = index > 0 ? newKeyLengths[index - 1] : 0);
839 
840 	// move keys and copy new key into them
841 	int32 size = node->AllKeyLength() - newKeyLengths[index];
842 	if (size > 0)
843 		memmove(keys + newKeyLengths[index],keys + newKeyLengths[index] - keyLength,size);
844 
845 	memcpy(keys + keyStart, key, keyLength);
846 }
847 
848 
849 status_t
850 BPlusTree::SplitNode(bplustree_node *node,off_t nodeOffset,bplustree_node *other,
851 	off_t otherOffset, uint16 *_keyIndex, uint8 *key, uint16 *_keyLength, off_t *_value)
852 {
853 	if (*_keyIndex > node->NumKeys() + 1)
854 		return B_BAD_VALUE;
855 
856 	uint16 *inKeyLengths = node->KeyLengths();
857 	off_t *inKeyValues = node->Values();
858 	uint8 *inKeys = node->Keys();
859 	uint8 *outKeys = other->Keys();
860 	int32 keyIndex = *_keyIndex;	// can become less than zero!
861 
862 	// how many keys will fit in one (half) page?
863 	// that loop will find the answer to this question and
864 	// change the key lengths indices for their new home
865 
866 	// "bytes" is the number of bytes written for the new key,
867 	// "bytesBefore" are the bytes before that key
868 	// "bytesAfter" are the bytes after the new key, if any
869 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
870 
871 	size_t size = fNodeSize >> 1;
872 	int32 out,in;
873 	for (in = out = 0; in < node->NumKeys() + 1;) {
874 		if (!bytes)
875 			bytesBefore = in > 0 ? inKeyLengths[in - 1] : 0;
876 
877 		if (in == keyIndex && !bytes) {
878 			bytes = *_keyLength;
879 		} else {
880 			if (keyIndex < out)
881 				bytesAfter = inKeyLengths[in] - bytesBefore;
882 
883 			in++;
884 		}
885 		out++;
886 
887 		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes) +
888 						out * (sizeof(uint16) + sizeof(off_t)) >= size) {
889 			// we have found the number of keys in the new node!
890 			break;
891 		}
892 	}
893 
894 	// if the new key was not inserted, set the length of the keys
895 	// that can be copied directly
896 	if (keyIndex >= out && in > 0)
897 		bytesBefore = inKeyLengths[in - 1];
898 
899 	if (bytesBefore < 0 || bytesAfter < 0)
900 		return B_BAD_DATA;
901 
902 	other->left_link = node->left_link;
903 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
904 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter);
905 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
906 
907 	uint16 *outKeyLengths = other->KeyLengths();
908 	off_t *outKeyValues = other->Values();
909 	int32 keys = out > keyIndex ? keyIndex : out;
910 
911 	if (bytesBefore) {
912 		// copy the keys
913 		memcpy(outKeys, inKeys, bytesBefore);
914 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
915 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
916 	}
917 	if (bytes) {
918 		// copy the newly inserted key
919 		memcpy(outKeys + bytesBefore, key, bytes);
920 		outKeyLengths[keyIndex] = bytes + bytesBefore;
921 		outKeyValues[keyIndex] = *_value;
922 
923 		if (bytesAfter) {
924 			// copy the keys after the new key
925 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore, bytesAfter);
926 			keys = out - keyIndex - 1;
927 			for (int32 i = 0;i < keys;i++)
928 				outKeyLengths[keyIndex + i + 1] = inKeyLengths[keyIndex + i] + bytes;
929 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex, keys * sizeof(off_t));
930 		}
931 	}
932 
933 	// if the new key was already inserted, we shouldn't use it again
934 	if (in != out)
935 		keyIndex--;
936 
937 	int32 total = bytesBefore + bytesAfter;
938 
939 	// these variables are for the key that will be returned
940 	// to the parent node
941 	uint8 *newKey = NULL;
942 	uint16 newLength;
943 	bool newAllocated = false;
944 
945 	// If we have split an index node, we have to drop the first key
946 	// of the next node (which can also be the new key to insert).
947 	// The dropped key is also the one which has to be inserted in
948 	// the parent node, so we will set the "newKey" already here.
949 	if (node->OverflowLink() != BPLUSTREE_NULL) {
950 		if (in == keyIndex) {
951 			newKey = key;
952 			newLength = *_keyLength;
953 
954 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
955 			keyIndex--;
956 		} else {
957 			// If a key is dropped (is not the new key), we have to copy
958 			// it, because it would be lost if not.
959 			uint8 *droppedKey = node->KeyAt(in,&newLength);
960 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
961 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
962 				fStream->GetVolume()->Panic();
963 				RETURN_ERROR(B_BAD_DATA);
964 			}
965 			newKey = (uint8 *)malloc(newLength);
966 			if (newKey == NULL)
967 				return B_NO_MEMORY;
968 			memcpy(newKey, droppedKey, newLength);
969 
970 			other->overflow_link = inKeyValues[in];
971 			total = inKeyLengths[in++];
972 		}
973 	}
974 
975 	// and now the same game for the other page and the rest of the keys
976 	// (but with memmove() instead of memcpy(), because they may overlap)
977 
978 	bytesBefore = bytesAfter = bytes = 0;
979 	out = 0;
980 	int32 skip = in;
981 	while (in < node->NumKeys() + 1) {
982 		if (in == keyIndex && !bytes) {
983 			// it's enough to set bytesBefore once here, because we do
984 			// not need to know the exact length of all keys in this
985 			// loop
986 			bytesBefore = in > skip ? inKeyLengths[in - 1] : 0;
987 			bytes = *_keyLength;
988 		} else {
989 			if (in < node->NumKeys()) {
990 				inKeyLengths[in] -= total;
991 				if (bytes) {
992 					inKeyLengths[in] += bytes;
993 					bytesAfter = inKeyLengths[in] - bytesBefore - bytes;
994 				}
995 			}
996 			in++;
997 		}
998 
999 		out++;
1000 
1001 		// break out when all keys are done
1002 		if (in > node->NumKeys() && keyIndex < in)
1003 			break;
1004 	}
1005 
1006 	// adjust the byte counts (since we were a bit lazy in the loop)
1007 	if (keyIndex >= in && keyIndex - skip < out)
1008 		bytesAfter = inKeyLengths[in] - bytesBefore - total;
1009 	else if (keyIndex < skip)
1010 		bytesBefore = node->AllKeyLength() - total;
1011 
1012 	if (bytesBefore < 0 || bytesAfter < 0)
1013 		return B_BAD_DATA;
1014 
1015 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1016 		// right link, and overflow link can stay the same
1017 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore + bytesAfter);
1018 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out - 1);
1019 
1020 	// array positions have changed
1021 	outKeyLengths = node->KeyLengths();
1022 	outKeyValues = node->Values();
1023 
1024 	// move the keys in the old node: the order is important here,
1025 	// because we don't want to overwrite any contents
1026 
1027 	keys = keyIndex <= skip ? out : keyIndex - skip;
1028 	keyIndex -= skip;
1029 
1030 	if (bytesBefore)
1031 		memmove(inKeys, inKeys + total, bytesBefore);
1032 	if (bytesAfter)
1033 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore, bytesAfter);
1034 
1035 	if (bytesBefore)
1036 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1037 	in = out - keyIndex - 1;
1038 	if (bytesAfter)
1039 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex, in * sizeof(uint16));
1040 
1041 	if (bytesBefore)
1042 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1043 	if (bytesAfter)
1044 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex, in * sizeof(off_t));
1045 
1046 	if (bytes) {
1047 		// finally, copy the newly inserted key (don't overwrite anything)
1048 		memcpy(inKeys + bytesBefore, key, bytes);
1049 		outKeyLengths[keyIndex] = bytes + bytesBefore;
1050 		outKeyValues[keyIndex] = *_value;
1051 	}
1052 
1053 	// Prepare the key that will be inserted in the parent node which
1054 	// is either the dropped key or the last of the other node.
1055 	// If it's the dropped key, "newKey" was already set earlier.
1056 
1057 	if (newKey == NULL)
1058 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1059 
1060 	memcpy(key,newKey,newLength);
1061 	*_keyLength = newLength;
1062 	*_value = otherOffset;
1063 
1064 	if (newAllocated)
1065 		free(newKey);
1066 
1067 	return B_OK;
1068 }
1069 
1070 
1071 status_t
1072 BPlusTree::Insert(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value)
1073 {
1074 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1075 		RETURN_ERROR(B_BAD_VALUE);
1076 
1077 	// lock access to stream
1078 	WriteLocked locked(fStream->Lock());
1079 
1080 	Stack<node_and_key> stack;
1081 	if (SeekDown(stack, key, keyLength) != B_OK)
1082 		RETURN_ERROR(B_ERROR);
1083 
1084 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
1085 
1086 	memcpy(keyBuffer, key, keyLength);
1087 	keyBuffer[keyLength] = 0;
1088 
1089 	node_and_key nodeAndKey;
1090 	bplustree_node *node;
1091 
1092 	CachedNode cached(this);
1093 	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1094 #ifdef DEBUG
1095 		NodeChecker checker(node, fNodeSize);
1096 #endif
1097 		if (node->IsLeaf())	{
1098 			// first round, check for duplicate entries
1099 			status_t status = FindKey(node, key, keyLength, &nodeAndKey.keyIndex);
1100 
1101 			// is this a duplicate entry?
1102 			if (status == B_OK) {
1103 				if (fAllowDuplicates)
1104 					return InsertDuplicate(transaction, &cached, node, nodeAndKey.keyIndex, value);
1105 
1106 				RETURN_ERROR(B_NAME_IN_USE);
1107 			}
1108 		}
1109 
1110 		// is the node big enough to hold the pair?
1111 		if (int32(round_up(sizeof(bplustree_node) + node->AllKeyLength() + keyLength)
1112 			+ (node->NumKeys() + 1) * (sizeof(uint16) + sizeof(off_t))) < fNodeSize)
1113 		{
1114 			InsertKey(node, nodeAndKey.keyIndex, keyBuffer, keyLength, value);
1115 			UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL, nodeAndKey.keyIndex, 0, 1);
1116 
1117 			return cached.WriteBack(transaction);
1118 		} else {
1119 			CachedNode cachedNewRoot(this);
1120 			CachedNode cachedOther(this);
1121 
1122 			// do we need to allocate a new root node? if so, then do
1123 			// it now
1124 			off_t newRoot = BPLUSTREE_NULL;
1125 			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
1126 				bplustree_node *root;
1127 				status_t status = cachedNewRoot.Allocate(transaction, &root, &newRoot);
1128 				if (status < B_OK) {
1129 					// The tree is most likely corrupted!
1130 					// But it's still sane at leaf level - we could set
1131 					// a flag in the header that forces the tree to be
1132 					// rebuild next time...
1133 					// But since we will have journaling, that's not a big
1134 					// problem anyway.
1135 					RETURN_ERROR(status);
1136 				}
1137 			}
1138 
1139 			// reserve space for the other node
1140 			bplustree_node *other;
1141 			off_t otherOffset;
1142 			status_t status = cachedOther.Allocate(transaction, &other, &otherOffset);
1143 			if (status < B_OK) {
1144 				cachedNewRoot.Free(transaction, newRoot);
1145 				RETURN_ERROR(status);
1146 			}
1147 
1148 			if (SplitNode(node, nodeAndKey.nodeOffset, other, otherOffset,
1149 					&nodeAndKey.keyIndex, keyBuffer, &keyLength, &value) < B_OK) {
1150 				// free root node & other node here
1151 				cachedNewRoot.Free(transaction, newRoot);
1152 				cachedOther.Free(transaction, otherOffset);
1153 
1154 				RETURN_ERROR(B_ERROR);
1155 			}
1156 
1157 			// write the updated nodes back
1158 
1159 			if (cached.WriteBack(transaction) < B_OK
1160 				|| cachedOther.WriteBack(transaction) < B_OK)
1161 				RETURN_ERROR(B_ERROR);
1162 
1163 			UpdateIterators(nodeAndKey.nodeOffset, otherOffset, nodeAndKey.keyIndex,
1164 				node->NumKeys(), 1);
1165 
1166 			// update the right link of the node in the left of the new node
1167 			if ((other = cachedOther.SetTo(other->LeftLink())) != NULL) {
1168 				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1169 				if (cachedOther.WriteBack(transaction) < B_OK)
1170 					RETURN_ERROR(B_ERROR);
1171 			}
1172 
1173 			// create a new root if necessary
1174 			if (newRoot != BPLUSTREE_NULL) {
1175 				bplustree_node *root = cachedNewRoot.Node();
1176 
1177 				InsertKey(root, 0, keyBuffer, keyLength, node->LeftLink());
1178 				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(nodeAndKey.nodeOffset);
1179 
1180 				if (cachedNewRoot.WriteBack(transaction) < B_OK)
1181 					RETURN_ERROR(B_ERROR);
1182 
1183 				// finally, update header to point to the new root
1184 				fHeader->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1185 				fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(fHeader->MaxNumberOfLevels() + 1);
1186 
1187 				return fCachedHeader.WriteBack(transaction);
1188 			}
1189 		}
1190 	}
1191 	RETURN_ERROR(B_ERROR);
1192 }
1193 
1194 
1195 status_t
1196 BPlusTree::RemoveDuplicate(Transaction *transaction, bplustree_node *node, CachedNode *cached,
1197 	uint16 index, off_t value)
1198 {
1199 	CachedNode cachedDuplicate(this);
1200 	off_t *values = node->Values();
1201 	off_t oldValue = values[index];
1202 	status_t status;
1203 
1204 	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1205 	bplustree_node *duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
1206 	if (duplicate == NULL)
1207 		return B_IO_ERROR;
1208 
1209 	// if it's a duplicate fragment, remove the entry from there
1210 	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1211 		duplicate_array *array = duplicate->FragmentAt(bplustree_node::FragmentIndex(oldValue));
1212 
1213 		if (array->count > NUM_FRAGMENT_VALUES
1214 			|| array->count < 1) {
1215 			FATAL(("removeDuplicate: Invalid array[%ld] size in fragment %Ld == %Ld!\n",bplustree_node::FragmentIndex(oldValue),duplicateOffset,array->count));
1216 			return B_BAD_DATA;
1217 		}
1218 		if (!array->Remove(value))
1219 			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",value,duplicateOffset));
1220 
1221 		// remove the array from the fragment node if it is empty
1222 		if (array->count == 1) {
1223 			// set the link to the remaining value
1224 			values[index] = array->values[0];
1225 
1226 			// Remove the whole fragment node, if this was the only array,
1227 			// otherwise free the array and write the changes back
1228 			if (duplicate->FragmentsUsed(fNodeSize) == 1)
1229 				status = cachedDuplicate.Free(transaction,duplicateOffset);
1230 			else {
1231 				array->count = 0;
1232 				status = cachedDuplicate.WriteBack(transaction);
1233 			}
1234 			if (status < B_OK)
1235 				return status;
1236 
1237 			return cached->WriteBack(transaction);
1238 		}
1239 		return cachedDuplicate.WriteBack(transaction);
1240 	}
1241 
1242 	//
1243 	// Remove value from a duplicate node!
1244 	//
1245 
1246 	duplicate_array *array;
1247 
1248 	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1249 		FATAL(("invalid duplicate node: first left link points to %Ld!\n", duplicate->LeftLink()));
1250 		return B_BAD_DATA;
1251 	}
1252 
1253 	// Search the duplicate nodes until the entry could be found (and removed)
1254 	while (duplicate != NULL) {
1255 		array = duplicate->DuplicateArray();
1256 		if (array->count > NUM_DUPLICATE_VALUES
1257 			|| array->count < 0) {
1258 			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",duplicateOffset,array->count));
1259 			return B_BAD_DATA;
1260 		}
1261 
1262 		if (array->Remove(value))
1263 			break;
1264 
1265 		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1266 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1267 
1268 		duplicate = cachedDuplicate.SetTo(duplicateOffset,false);
1269 	}
1270 	if (duplicate == NULL)
1271 		RETURN_ERROR(B_IO_ERROR);
1272 
1273 	while (true) {
1274 		off_t left = duplicate->LeftLink();
1275 		off_t right = duplicate->RightLink();
1276 		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1277 
1278 		if (isLast && array->count == 1 || array->count == 0) {
1279 			// Free empty duplicate page, link their siblings together, and
1280 			// update the duplicate link if needed (which should not be, if
1281 			// we are the only one working on that tree...)
1282 
1283 			if (duplicateOffset == bplustree_node::FragmentOffset(oldValue)
1284 				|| array->count == 1) {
1285 				if (array->count == 1 && isLast)
1286 					values[index] = array->values[0];
1287 				else if (isLast) {
1288 					FATAL(("removed last value from duplicate!\n"));
1289 				} else
1290 					values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_NODE,right);
1291 
1292 				if ((status = cached->WriteBack(transaction)) < B_OK)
1293 					return status;
1294 			}
1295 
1296 			if ((status = cachedDuplicate.Free(transaction,duplicateOffset)) < B_OK)
1297 				return status;
1298 
1299 			if (left != BPLUSTREE_NULL
1300 				&& (duplicate = cachedDuplicate.SetTo(left,false)) != NULL) {
1301 				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1302 
1303 				// If the next node is the last node, we need to free that node
1304 				// and convert the duplicate entry back into a normal entry
1305 				if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL
1306 					&& duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) {
1307 					duplicateOffset = left;
1308 					continue;
1309 				}
1310 
1311 				status = cachedDuplicate.WriteBack(transaction);
1312 				if (status < B_OK)
1313 					return status;
1314 			}
1315 			if (right != BPLUSTREE_NULL
1316 				&& (duplicate = cachedDuplicate.SetTo(right, false)) != NULL) {
1317 				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1318 
1319 				// Again, we may need to turn the duplicate entry back into a normal entry
1320 				array = duplicate->DuplicateArray();
1321 				if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL
1322 					&& duplicate->DuplicateArray()->count <= NUM_FRAGMENT_VALUES) {
1323 					duplicateOffset = right;
1324 					continue;
1325 				}
1326 
1327 				return cachedDuplicate.WriteBack(transaction);
1328 			}
1329 			return status;
1330 		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
1331 			// If the number of entries fits in a duplicate fragment, then
1332 			// either find a free fragment node, or convert this node to a
1333 			// fragment node.
1334 			CachedNode cachedOther(this);
1335 
1336 			bplustree_node *fragment = NULL;
1337 			uint32 fragmentIndex = 0;
1338 			off_t offset;
1339 			if (FindFreeDuplicateFragment(node,&cachedOther,&offset,&fragment,&fragmentIndex) < B_OK) {
1340 				// convert node
1341 				memmove(duplicate,array,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1342 				memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1,0,fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1343 			} else {
1344 				// move to other node
1345 				duplicate_array *target = fragment->FragmentAt(fragmentIndex);
1346 				memcpy(target,array,(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1347 
1348 				cachedDuplicate.Free(transaction,duplicateOffset);
1349 				duplicateOffset = offset;
1350 			}
1351 			values[index] = bplustree_node::MakeLink(BPLUSTREE_DUPLICATE_FRAGMENT,duplicateOffset,fragmentIndex);
1352 
1353 			if ((status = cached->WriteBack(transaction)) < B_OK)
1354 				return status;
1355 
1356 			if (fragment != NULL)
1357 				return cachedOther.WriteBack(transaction);
1358 		}
1359 		return cachedDuplicate.WriteBack(transaction);
1360 	}
1361 }
1362 
1363 
1364 /** Removes the key with the given index from the specified node.
1365  *	Since it has to get the key from the node anyway (to obtain it's
1366  *	pointer), it's not needed to pass the key & its length, although
1367  *	the calling method (BPlusTree::Remove()) have this data.
1368  */
1369 
1370 void
1371 BPlusTree::RemoveKey(bplustree_node *node,uint16 index)
1372 {
1373 	// should never happen, but who knows?
1374 	if (index > node->NumKeys() && node->NumKeys() > 0) {
1375 		FATAL(("Asked me to remove key outer limits: %u\n",index));
1376 		return;
1377 	}
1378 
1379 	off_t *values = node->Values();
1380 
1381 	// if we would have to drop the overflow link, drop
1382 	// the last key instead and update the overflow link
1383 	// to the value of that one
1384 	if (!node->IsLeaf() && index == node->NumKeys())
1385 		node->overflow_link = values[--index];
1386 
1387 	uint16 length;
1388 	uint8 *key = node->KeyAt(index,&length);
1389 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
1390 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1391 		FATAL(("Key length to long: %s, %u (inode at %ld,%u)\n", key, length, fStream->BlockRun().allocation_group, fStream->BlockRun().start));
1392 		fStream->GetVolume()->Panic();
1393 		return;
1394 	}
1395 
1396 	uint16 *keyLengths = node->KeyLengths();
1397 	uint8 *keys = node->Keys();
1398 
1399 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
1400 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length);
1401 
1402 	off_t *newValues = node->Values();
1403 	uint16 *newKeyLengths = node->KeyLengths();
1404 
1405 	// move key data
1406 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
1407 
1408 	// move and update key lengths
1409 	if (index > 0 && newKeyLengths != keyLengths)
1410 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
1411 	for (uint16 i = index; i < node->NumKeys(); i++)
1412 		newKeyLengths[i] = keyLengths[i + 1] - length;
1413 
1414 	// move values
1415 	if (index > 0)
1416 		memmove(newValues,values,index * sizeof(off_t));
1417 	if (node->NumKeys() > index)
1418 		memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t));
1419 }
1420 
1421 
1422 /**	Removes the specified key from the tree. The "value" parameter is only used
1423  *	for trees which allow duplicates, so you may safely ignore it.
1424  *	It's not an optional parameter, so at least you have to think about it.
1425  */
1426 
1427 status_t
1428 BPlusTree::Remove(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value)
1429 {
1430 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1431 		RETURN_ERROR(B_BAD_VALUE);
1432 
1433 	// lock access to stream
1434 	WriteLocked locked(fStream->Lock());
1435 
1436 	Stack<node_and_key> stack;
1437 	if (SeekDown(stack,key,keyLength) != B_OK)
1438 		RETURN_ERROR(B_ERROR);
1439 
1440 	node_and_key nodeAndKey;
1441 	bplustree_node *node;
1442 
1443 	CachedNode cached(this);
1444 	while (stack.Pop(&nodeAndKey) && (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL)
1445 	{
1446 #ifdef DEBUG
1447 		NodeChecker checker(node, fNodeSize);
1448 #endif
1449 		if (node->IsLeaf())	// first round, check for duplicate entries
1450 		{
1451 			status_t status = FindKey(node,key,keyLength,&nodeAndKey.keyIndex);
1452 			if (status < B_OK)
1453 				RETURN_ERROR(status);
1454 
1455 			// If we will remove the last key, the iterator will be set
1456 			// to the next node after the current - if there aren't any
1457 			// more nodes, we need a way to prevent the TreeIterators to
1458 			// touch the old node again, we use BPLUSTREE_FREE for this
1459 			off_t next = node->RightLink() == BPLUSTREE_NULL ? BPLUSTREE_FREE : node->RightLink();
1460 			UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1 ?
1461 								next : BPLUSTREE_NULL,nodeAndKey.keyIndex,0 , -1);
1462 
1463 			// is this a duplicate entry?
1464 			if (bplustree_node::IsDuplicate(node->Values()[nodeAndKey.keyIndex])) {
1465 				if (fAllowDuplicates)
1466 					return RemoveDuplicate(transaction, node, &cached, nodeAndKey.keyIndex, value);
1467 				else
1468 					RETURN_ERROR(B_NAME_IN_USE);
1469 			}
1470 		}
1471 
1472 		// if it's an empty root node, we have to convert it
1473 		// to a leaf node by dropping the overflow link, or,
1474 		// if it's a leaf node, just empty it
1475 		if (nodeAndKey.nodeOffset == fHeader->RootNode()
1476 			&& node->NumKeys() == 0
1477 			|| node->NumKeys() == 1 && node->IsLeaf()) {
1478 			node->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1479 			node->all_key_count = 0;
1480 			node->all_key_length = 0;
1481 
1482 			if (cached.WriteBack(transaction) < B_OK)
1483 				return B_IO_ERROR;
1484 
1485 			// if we've cleared the root node, reset the maximum
1486 			// number of levels in the header
1487 			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
1488 				fHeader->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
1489 				return fCachedHeader.WriteBack(transaction);
1490 			}
1491 			return B_OK;
1492 		}
1493 
1494 		// if there is only one key left, we don't have to remove
1495 		// it, we can just dump the node (index nodes still have
1496 		// the overflow link, so we have to drop the last key)
1497 		if (node->NumKeys() > 1
1498 			|| !node->IsLeaf() && node->NumKeys() == 1) {
1499 			RemoveKey(node,nodeAndKey.keyIndex);
1500 			return cached.WriteBack(transaction);
1501 		}
1502 
1503 		// when we are here, we can just free the node, but
1504 		// we have to update the right/left link of the
1505 		// siblings first
1506 		CachedNode otherCached(this);
1507 		bplustree_node *other = otherCached.SetTo(node->LeftLink());
1508 		if (other != NULL) {
1509 			other->right_link = node->right_link;
1510 			if (otherCached.WriteBack(transaction) < B_OK)
1511 				return B_IO_ERROR;
1512 		}
1513 
1514 		if ((other = otherCached.SetTo(node->RightLink())) != NULL) {
1515 			other->left_link = node->left_link;
1516 			if (otherCached.WriteBack(transaction) < B_OK)
1517 				return B_IO_ERROR;
1518 		}
1519 
1520 		cached.Free(transaction,nodeAndKey.nodeOffset);
1521 	}
1522 	RETURN_ERROR(B_ERROR);
1523 }
1524 
1525 
1526 /**	Replaces the value for the key in the tree.
1527  *	Returns B_OK if the key could be found and its value replaced,
1528  *	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
1529  *	to indicate that something went terribly wrong.
1530  *	Note that this doesn't work with duplicates - it will just
1531  *	return B_BAD_TYPE if you call this function on a tree where
1532  *	duplicates are allowed.
1533  */
1534 
1535 status_t
1536 BPlusTree::Replace(Transaction *transaction, const uint8 *key, uint16 keyLength, off_t value)
1537 {
1538 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
1539 		|| key == NULL)
1540 		RETURN_ERROR(B_BAD_VALUE);
1541 
1542 	if (fAllowDuplicates)
1543 		RETURN_ERROR(B_BAD_TYPE);
1544 
1545 	// lock access to stream (a read lock is okay for this purpose)
1546 	ReadLocked locked(fStream->Lock());
1547 
1548 	off_t nodeOffset = fHeader->RootNode();
1549 	CachedNode cached(this);
1550 	bplustree_node *node;
1551 
1552 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1553 		uint16 keyIndex = 0;
1554 		off_t nextOffset;
1555 		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1556 
1557 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1558 			if (status == B_OK) {
1559 				node->Values()[keyIndex] = value;
1560 				return cached.WriteBack(transaction);
1561 			}
1562 
1563 			return status;
1564 		} else if (nextOffset == nodeOffset)
1565 			RETURN_ERROR(B_ERROR);
1566 
1567 		nodeOffset = nextOffset;
1568 	}
1569 	RETURN_ERROR(B_ERROR);
1570 }
1571 
1572 
1573 /**	Searches the key in the tree, and stores the offset found in
1574  *	_value, if successful.
1575  *	It's very similar to BPlusTree::SeekDown(), but doesn't fill
1576  *	a stack while it descends the tree.
1577  *	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND
1578  *	if not. It can also return other errors to indicate that
1579  *	something went wrong.
1580  *	Note that this doesn't work with duplicates - it will just
1581  *	return B_BAD_TYPE if you call this function on a tree where
1582  *	duplicates are allowed.
1583  */
1584 
1585 status_t
1586 BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value)
1587 {
1588 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
1589 		|| key == NULL)
1590 		RETURN_ERROR(B_BAD_VALUE);
1591 
1592 	if (fAllowDuplicates)
1593 		RETURN_ERROR(B_BAD_TYPE);
1594 
1595 	// lock access to stream
1596 	ReadLocked locked(fStream->Lock());
1597 
1598 	off_t nodeOffset = fHeader->RootNode();
1599 	CachedNode cached(this);
1600 	bplustree_node *node;
1601 
1602 #ifdef DEBUG
1603 	int32 levels = 0;
1604 #endif
1605 
1606 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1607 		uint16 keyIndex = 0;
1608 		off_t nextOffset;
1609 		status_t status = FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1610 
1611 #ifdef DEBUG
1612 		levels++;
1613 #endif
1614 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1615 			if (status == B_OK && _value != NULL)
1616 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
1617 
1618 #ifdef DEBUG
1619 			if (levels != (int32)fHeader->MaxNumberOfLevels())
1620 				DEBUGGER(("levels don't match"));
1621 #endif
1622 			return status;
1623 		} else if (nextOffset == nodeOffset)
1624 			RETURN_ERROR(B_ERROR);
1625 
1626 		nodeOffset = nextOffset;
1627 	}
1628 	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
1629 	RETURN_ERROR(B_ERROR);
1630 }
1631 
1632 
1633 //	#pragma mark -
1634 
1635 
1636 TreeIterator::TreeIterator(BPlusTree *tree)
1637 	:
1638 	fTree(tree),
1639 	fCurrentNodeOffset(BPLUSTREE_NULL),
1640 	fNext(NULL)
1641 {
1642 	tree->AddIterator(this);
1643 }
1644 
1645 
1646 TreeIterator::~TreeIterator()
1647 {
1648 	if (fTree)
1649 		fTree->RemoveIterator(this);
1650 }
1651 
1652 
1653 status_t
1654 TreeIterator::Goto(int8 to)
1655 {
1656 	if (fTree == NULL || fTree->fHeader == NULL)
1657 		RETURN_ERROR(B_BAD_VALUE);
1658 
1659 	// lock access to stream
1660 	ReadLocked locked(fTree->fStream->Lock());
1661 
1662 	off_t nodeOffset = fTree->fHeader->RootNode();
1663 	CachedNode cached(fTree);
1664 	bplustree_node *node;
1665 
1666 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1667 		// is the node a leaf node?
1668 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1669 			fCurrentNodeOffset = nodeOffset;
1670 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
1671 			fDuplicateNode = BPLUSTREE_NULL;
1672 
1673 			return B_OK;
1674 		}
1675 
1676 		// get the next node offset depending on the direction (and if there
1677 		// are any keys in that node at all)
1678 		off_t nextOffset;
1679 		if (to == BPLUSTREE_END || node->all_key_count == 0)
1680 			nextOffset = node->OverflowLink();
1681 		else {
1682 			if (node->AllKeyLength() > fTree->fNodeSize
1683 				|| (uint32)node->Values() > (uint32)node + fTree->fNodeSize - 8 * node->NumKeys())
1684 				RETURN_ERROR(B_ERROR);
1685 
1686 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
1687 		}
1688 		if (nextOffset == nodeOffset)
1689 			break;
1690 
1691 		nodeOffset = nextOffset;
1692 	}
1693 	#if __MWERKS__
1694 		FATAL(("TreeIterator::Goto(int8 to) fails\n"));
1695 	#else
1696 		FATAL(("%s fails\n",__PRETTY_FUNCTION__));
1697 	#endif
1698 	RETURN_ERROR(B_ERROR);
1699 }
1700 
1701 
1702 /**	Iterates through the tree in the specified direction.
1703  *	When it iterates through duplicates, the "key" is only updated for the
1704  *	first entry - if you need to know when this happens, use the "duplicate"
1705  *	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
1706  *	the other duplicates.
1707  *	That's not too nice, but saves the 256 bytes that would be needed to
1708  *	store the last key - if this will ever become an issue, it will be
1709  *	easy to change.
1710  *	The other advantage of this is, that the queries can skip all duplicates
1711  *	at once when they are not relevant to them.
1712  */
1713 
1714 status_t
1715 TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength, uint16 maxLength,
1716 	off_t *value, uint16 *duplicate)
1717 {
1718 	if (fTree == NULL)
1719 		return B_INTERRUPTED;
1720 	if (fCurrentNodeOffset == BPLUSTREE_NULL
1721 		&& Goto(direction == BPLUSTREE_FORWARD ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
1722 		RETURN_ERROR(B_ERROR);
1723 
1724 	// if the tree was emptied since the last call
1725 	if (fCurrentNodeOffset == BPLUSTREE_FREE)
1726 		return B_ENTRY_NOT_FOUND;
1727 
1728 	// lock access to stream
1729 	ReadLocked locked(fTree->fStream->Lock());
1730 
1731 	CachedNode cached(fTree);
1732 	bplustree_node *node;
1733 
1734 	if (fDuplicateNode != BPLUSTREE_NULL)
1735 	{
1736 		// regardless of traverse direction the duplicates are always presented in
1737 		// the same order; since they are all considered as equal, this shouldn't
1738 		// cause any problems
1739 
1740 		if (!fIsFragment || fDuplicate < fNumDuplicates)
1741 			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
1742 		else
1743 			node = NULL;
1744 
1745 		if (node != NULL)
1746 		{
1747 			if (!fIsFragment && fDuplicate >= fNumDuplicates)
1748 			{
1749 				// if the node is out of duplicates, we go directly to the next one
1750 				fDuplicateNode = node->RightLink();
1751 				if (fDuplicateNode != BPLUSTREE_NULL
1752 					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL)
1753 				{
1754 					fNumDuplicates = node->CountDuplicates(fDuplicateNode, false);
1755 					fDuplicate = 0;
1756 				}
1757 			}
1758 			if (fDuplicate < fNumDuplicates)
1759 			{
1760 				*value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++);
1761 				if (duplicate)
1762 					*duplicate = 2;
1763 				return B_OK;
1764 			}
1765 		}
1766 		fDuplicateNode = BPLUSTREE_NULL;
1767 	}
1768 
1769 	off_t savedNodeOffset = fCurrentNodeOffset;
1770 	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
1771 		RETURN_ERROR(B_ERROR);
1772 
1773 	if (duplicate)
1774 		*duplicate = 0;
1775 
1776 	fCurrentKey += direction;
1777 
1778 	// is the current key in the current node?
1779 	while ((direction == BPLUSTREE_FORWARD && fCurrentKey >= node->NumKeys())
1780 		   || (direction == BPLUSTREE_BACKWARD && fCurrentKey < 0))
1781 	{
1782 		fCurrentNodeOffset = direction == BPLUSTREE_FORWARD ? node->RightLink() : node->LeftLink();
1783 
1784 		// are there any more nodes?
1785 		if (fCurrentNodeOffset != BPLUSTREE_NULL)
1786 		{
1787 			node = cached.SetTo(fCurrentNodeOffset);
1788 			if (!node)
1789 				RETURN_ERROR(B_ERROR);
1790 
1791 			// reset current key
1792 			fCurrentKey = direction == BPLUSTREE_FORWARD ? 0 : node->NumKeys();
1793 		}
1794 		else
1795 		{
1796 			// there are no nodes left, so turn back to the last key
1797 			fCurrentNodeOffset = savedNodeOffset;
1798 			fCurrentKey = direction == BPLUSTREE_FORWARD ? node->NumKeys() : -1;
1799 
1800 			return B_ENTRY_NOT_FOUND;
1801 		}
1802 	}
1803 
1804 	if (node->all_key_count == 0)
1805 		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?
1806 
1807 	uint16 length;
1808 	uint8 *keyStart = node->KeyAt(fCurrentKey, &length);
1809 	if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize
1810 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1811 		fTree->fStream->GetVolume()->Panic();
1812 		RETURN_ERROR(B_BAD_DATA);
1813 	}
1814 
1815 	length = min_c(length, maxLength);
1816 	memcpy(key, keyStart, length);
1817 
1818 	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE)	// terminate string type
1819 	{
1820 		if (length == maxLength)
1821 			length--;
1822 		((char *)key)[length] = '\0';
1823 	}
1824 	*keyLength = length;
1825 
1826 	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);
1827 
1828 	// duplicate fragments?
1829 	uint8 type = bplustree_node::LinkType(offset);
1830 	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE)
1831 	{
1832 		fDuplicateNode = offset;
1833 
1834 		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
1835 		if (node == NULL)
1836 			RETURN_ERROR(B_ERROR);
1837 
1838 		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;
1839 
1840 		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
1841 		if (fNumDuplicates)
1842 		{
1843 			offset = node->DuplicateAt(offset, fIsFragment, 0);
1844 			fDuplicate = 1;
1845 			if (duplicate)
1846 				*duplicate = 1;
1847 		}
1848 		else
1849 		{
1850 			// shouldn't happen, but we're dealing here with potentially corrupt disks...
1851 			fDuplicateNode = BPLUSTREE_NULL;
1852 			offset = 0;
1853 		}
1854 	}
1855 	*value = offset;
1856 
1857 	return B_OK;
1858 }
1859 
1860 
1861 /**	This is more or less a copy of BPlusTree::Find() - but it just
1862  *	sets the current position in the iterator, regardless of if the
1863  *	key could be found or not.
1864  */
1865 
1866 status_t
1867 TreeIterator::Find(const uint8 *key, uint16 keyLength)
1868 {
1869 	if (fTree == NULL)
1870 		return B_INTERRUPTED;
1871 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
1872 		|| key == NULL)
1873 		RETURN_ERROR(B_BAD_VALUE);
1874 
1875 	// lock access to stream
1876 	ReadLocked locked(fTree->fStream->Lock());
1877 
1878 	off_t nodeOffset = fTree->fHeader->RootNode();
1879 
1880 	CachedNode cached(fTree);
1881 	bplustree_node *node;
1882 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1883 		uint16 keyIndex = 0;
1884 		off_t nextOffset;
1885 		status_t status = fTree->FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1886 
1887 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1888 			fCurrentNodeOffset = nodeOffset;
1889 			fCurrentKey = keyIndex - 1;
1890 			fDuplicateNode = BPLUSTREE_NULL;
1891 
1892 			return status;
1893 		} else if (nextOffset == nodeOffset)
1894 			RETURN_ERROR(B_ERROR);
1895 
1896 		nodeOffset = nextOffset;
1897 	}
1898 	RETURN_ERROR(B_ERROR);
1899 }
1900 
1901 
1902 void
1903 TreeIterator::SkipDuplicates()
1904 {
1905 	fDuplicateNode = BPLUSTREE_NULL;
1906 }
1907 
1908 
1909 void
1910 TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change)
1911 {
1912 	if (offset != fCurrentNodeOffset)
1913 		return;
1914 
1915 	if (nextOffset != BPLUSTREE_NULL) {
1916 		fCurrentNodeOffset = nextOffset;
1917 		if (splitAt <= fCurrentKey) {
1918 			fCurrentKey -= splitAt;
1919 			keyIndex -= splitAt;
1920 		}
1921 	}
1922 
1923 	// Adjust fCurrentKey to point to the same key as before.
1924 	// Note, that if a key is inserted at the current position
1925 	// it won't be included in this tree transition.
1926 	if (keyIndex <= fCurrentKey)
1927 		fCurrentKey += change;
1928 
1929 	// ToDo: duplicate handling!
1930 }
1931 
1932 
1933 void
1934 TreeIterator::Stop()
1935 {
1936 	fTree = NULL;
1937 }
1938 
1939 
1940 #ifdef DEBUG
1941 void
1942 TreeIterator::Dump()
1943 {
1944 	__out("TreeIterator at %p:\n", this);
1945 	__out("\tfTree = %p\n", fTree);
1946 	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
1947 	__out("\tfCurrentKey = %ld\n", fCurrentKey);
1948 	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode);
1949 	__out("\tfDuplicate = %u\n", fDuplicate);
1950 	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
1951 	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
1952 }
1953 #endif
1954 
1955 
1956 //	#pragma mark -
1957 
1958 
1959 void
1960 bplustree_node::Initialize()
1961 {
1962 	left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1963 	all_key_count = 0;
1964 	all_key_length = 0;
1965 }
1966 
1967 
1968 uint8 *
1969 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const
1970 {
1971 	if (index < 0 || index > NumKeys())
1972 		return NULL;
1973 
1974 	uint8 *keyStart = Keys();
1975 	uint16 *keyLengths = KeyLengths();
1976 
1977 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
1978 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
1979 	if (index > 0)
1980 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
1981 
1982 	return keyStart;
1983 }
1984 
1985 
1986 uint8
1987 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
1988 {
1989 	// the duplicate fragment handling is currently hard-coded to a node size
1990 	// of 1024 bytes - with future versions of BFS, this may be a problem
1991 
1992 	if (isFragment) {
1993 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
1994 
1995 		return ((off_t *)this)[fragment];
1996 	}
1997 	return OverflowLink();
1998 }
1999 
2000 
2001 off_t
2002 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2003 {
2004 	uint32 start;
2005 	if (isFragment)
2006 		start = 8 * ((uint64)offset & 0x3ff);
2007 	else
2008 		start = 2;
2009 
2010 	return ((off_t *)this)[start + 1 + index];
2011 }
2012 
2013 
2014 /**	Although the name suggests it, this function doesn't return the real
2015  *	used fragment count; at least, it can only count to two: it returns
2016  *	0, if there is no fragment used, 1 if there is only one fragment
2017  *	used, and 2 if there are at least 2 fragments used.
2018  */
2019 
2020 int32
2021 bplustree_node::FragmentsUsed(uint32 nodeSize)
2022 {
2023 	uint32 used = 0;
2024 	for (uint32 i = 0; i < nodeSize / ((NUM_FRAGMENT_VALUES + 1) * sizeof(off_t)); i++) {
2025 		duplicate_array *array = FragmentAt(i);
2026 		if (array->count > 0 && ++used > 1)
2027 			return used;
2028 	}
2029 	return used;
2030 }
2031 
2032 
2033 #ifdef DEBUG
2034 void
2035 bplustree_node::CheckIntegrity(uint32 nodeSize)
2036 {
2037 	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
2038 		DEBUGGER(("invalid node: key/length count"));
2039 
2040 	for (int32 i = 0; i < NumKeys(); i++) {
2041 		uint16 length;
2042 		uint8 *key = KeyAt(i, &length);
2043 		if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize
2044 			|| length > BPLUSTREE_MAX_KEY_LENGTH)
2045 			DEBUGGER(("invalid node: keys corrupted"));
2046 	}
2047 }
2048 #endif
2049 
2050 
2051 //	#pragma mark -
2052 
2053 
2054 int32
2055 compareKeys(type_code type,const void *key1, int keyLength1, const void *key2, int keyLength2)
2056 {
2057 	// if one of the keys is NULL, bail out gracefully
2058 	if (key1 == NULL || key2 == NULL)
2059 		return -1;
2060 
2061 	switch (type)
2062 	{
2063 	    case B_STRING_TYPE:
2064     	{
2065 			int len = min_c(keyLength1, keyLength2);
2066 			int result = strncmp((const char *)key1, (const char *)key2, len);
2067 
2068 			if (result == 0
2069 				&& !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0'))
2070 				result = keyLength1 - keyLength2;
2071 
2072 			return result;
2073 		}
2074 
2075 		case B_SSIZE_T_TYPE:
2076 		case B_INT32_TYPE:
2077 			return *(int32 *)key1 - *(int32 *)key2;
2078 
2079 		case B_SIZE_T_TYPE:
2080 		case B_UINT32_TYPE:
2081 			if (*(uint32 *)key1 == *(uint32 *)key2)
2082 				return 0;
2083 			else if (*(uint32 *)key1 > *(uint32 *)key2)
2084 				return 1;
2085 
2086 			return -1;
2087 
2088 		case B_OFF_T_TYPE:
2089 		case B_INT64_TYPE:
2090 			if (*(int64 *)key1 == *(int64 *)key2)
2091 				return 0;
2092 			else if (*(int64 *)key1 > *(int64 *)key2)
2093 				return 1;
2094 
2095 			return -1;
2096 
2097 		case B_UINT64_TYPE:
2098 			if (*(uint64 *)key1 == *(uint64 *)key2)
2099 				return 0;
2100 			else if (*(uint64 *)key1 > *(uint64 *)key2)
2101 				return 1;
2102 
2103 			return -1;
2104 
2105 		case B_FLOAT_TYPE:
2106 		{
2107 			float result = *(float *)key1 - *(float *)key2;
2108 			if (result == 0.0f)
2109 				return 0;
2110 
2111 			return (result < 0.0f) ? -1 : 1;
2112 		}
2113 
2114 		case B_DOUBLE_TYPE:
2115 		{
2116 			double result = *(double *)key1 - *(double *)key2;
2117 			if (result == 0.0)
2118 				return 0;
2119 
2120 			return (result < 0.0) ? -1 : 1;
2121 		}
2122 	}
2123 
2124 	// if the type is unknown, the entries don't match...
2125 	return -1;
2126 }
2127 
2128 
2129