xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision b6b0567fbd186f8ce8a0c90bdc7a7b5b4c649678)
1 /*
2  * Copyright 2001-2009, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 //! B+Tree implementation
10 
11 
12 #include "Debug.h"
13 #include "BPlusTree.h"
14 #include "Inode.h"
15 #include "Utility.h"
16 
17 
18 #ifdef DEBUG
19 class NodeChecker {
20 public:
21 	NodeChecker(const bplustree_node* node, int32 nodeSize, const char* text)
22 		:
23 		fNode(node),
24 		fSize(nodeSize),
25 		fText(text)
26 	{
27 		Check("integrity check failed on construction.");
28 	}
29 
30 	~NodeChecker()
31 	{
32 		Check("integrity check failed on destruction.");
33 	}
34 
35 	void
36 	Check(const char* message)
37 	{
38 		if (fNode->CheckIntegrity(fSize) < B_OK)
39 			dprintf("%s: %s\n", fText, message);
40 	}
41 
42 private:
43 	const bplustree_node*	fNode;
44 	int32					fSize;
45 	const char*				fText;
46 };
47 #endif
48 
49 
50 // Node Caching for the BPlusTree class
51 //
52 // With write support, there is the need for a function that allocates new
53 // nodes by either returning empty nodes, or by growing the file's data stream
54 //
55 // !! The CachedNode class assumes that you have properly locked the stream
56 // !! before asking for nodes.
57 //
58 // Note: This code will fail if the block size is smaller than the node size!
59 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
60 // is hard-coded to 1024 bytes, that's not an issue now.
61 
62 void
63 CachedNode::UnsetUnchanged(Transaction& transaction)
64 {
65 	if (fTree == NULL || fTree->fStream == NULL)
66 		return;
67 
68 	if (fNode != NULL) {
69 		void* cache = fTree->fStream->GetVolume()->BlockCache();
70 
71 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
72 		block_cache_put(cache, fBlockNumber);
73 		fNode = NULL;
74 	}
75 }
76 
77 
78 void
79 CachedNode::Unset()
80 {
81 	if (fTree == NULL || fTree->fStream == NULL)
82 		return;
83 
84 	if (fNode != NULL) {
85 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(),
86 			fBlockNumber);
87 		fNode = NULL;
88 	}
89 }
90 
91 
92 const bplustree_node*
93 CachedNode::SetTo(off_t offset, bool check)
94 {
95 	if (fTree == NULL || fTree->fStream == NULL) {
96 		REPORT_ERROR(B_BAD_VALUE);
97 		return NULL;
98 	}
99 
100 	Unset();
101 
102 	// You can only ask for nodes at valid positions - you can't
103 	// even access the b+tree header with this method (use SetToHeader()
104 	// instead)
105 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
106 		|| offset <= 0
107 		|| (offset % fTree->fNodeSize) != 0)
108 		return NULL;
109 
110 	if (InternalSetTo(NULL, offset) != NULL && check) {
111 		// sanity checks (links, all_key_count)
112 		if (!fTree->fHeader->CheckNode(fNode)) {
113 			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
114 				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
115 			return NULL;
116 		}
117 	}
118 	return fNode;
119 }
120 
121 
122 bplustree_node*
123 CachedNode::SetToWritable(Transaction& transaction, off_t offset, bool check)
124 {
125 	if (fTree == NULL || fTree->fStream == NULL) {
126 		REPORT_ERROR(B_BAD_VALUE);
127 		return NULL;
128 	}
129 
130 	Unset();
131 
132 	// You can only ask for nodes at valid positions - you can't
133 	// even access the b+tree header with this method (use SetToHeader()
134 	// instead)
135 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
136 		|| offset <= 0
137 		|| (offset % fTree->fNodeSize) != 0)
138 		return NULL;
139 
140 	if (InternalSetTo(&transaction, offset) != NULL && check) {
141 		// sanity checks (links, all_key_count)
142 		if (!fTree->fHeader->CheckNode(fNode)) {
143 			FATAL(("invalid node [%p] read from offset %Ld (block %Ld), inode "
144 				"at %Ld\n", fNode, offset, fBlockNumber, fTree->fStream->ID()));
145 			return NULL;
146 		}
147 	}
148 	return fNode;
149 }
150 
151 
152 bplustree_node*
153 CachedNode::MakeWritable(Transaction& transaction)
154 {
155 	if (fNode == NULL)
156 		return NULL;
157 
158 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
159 			fBlockNumber, transaction.ID()) == B_OK)
160 		return fNode;
161 
162 	return NULL;
163 }
164 
165 
166 bplustree_header*
167 CachedNode::MakeWritableHeader(Transaction& transaction)
168 {
169 	return (bplustree_header*)MakeWritable(transaction);
170 }
171 
172 
173 const bplustree_header*
174 CachedNode::SetToHeader()
175 {
176 	if (fTree == NULL || fTree->fStream == NULL) {
177 		REPORT_ERROR(B_BAD_VALUE);
178 		return NULL;
179 	}
180 
181 	Unset();
182 
183 	InternalSetTo(NULL, 0LL);
184 	return (bplustree_header*)fNode;
185 }
186 
187 
188 bplustree_header*
189 CachedNode::SetToWritableHeader(Transaction& transaction)
190 {
191 	if (fTree == NULL || fTree->fStream == NULL) {
192 		REPORT_ERROR(B_BAD_VALUE);
193 		return NULL;
194 	}
195 
196 	Unset();
197 
198 	InternalSetTo(&transaction, 0LL);
199 	return (bplustree_header*)fNode;
200 }
201 
202 
bplustree_node*
CachedNode::InternalSetTo(Transaction* transaction, off_t offset)
{
	// Maps the node at file position "offset" into memory via the block
	// cache. If "transaction" is non-NULL the block is fetched writable and
	// attached to that transaction, otherwise read-only. Returns the node
	// pointer, or NULL on failure; fNode/fOffset/fBlockNumber/fWritable are
	// updated accordingly.
	fNode = NULL;
	fOffset = offset;

	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume* volume = fTree->fStream->GetVolume();

		// "fileOffset" is the file position at which the found block_run
		// starts - translate the node offset into a disk block number
		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8* block;

		if (transaction != NULL) {
			block = (uint8*)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8*)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block) {
			// The node is somewhere in that block...
			// (confusing offset calculation: the subtraction yields the
			// node's byte offset within the cached block, since one block
			// may hold more than one node)
			fNode = (bplustree_node*)(block + offset
				- (fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}
238 
239 
240 status_t
241 CachedNode::Free(Transaction& transaction, off_t offset)
242 {
243 	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
244 		RETURN_ERROR(B_BAD_VALUE);
245 
246 	// TODO: scan the free nodes list and remove all nodes at the end
247 	// of the tree - perhaps that shouldn't be done everytime that
248 	// function is called, perhaps it should be done when the directory
249 	// inode is closed or based on some calculation or whatever...
250 
251 	bplustree_header* header
252 		= fTree->fCachedHeader.MakeWritableHeader(transaction);
253 	if (header == NULL)
254 		return B_IO_ERROR;
255 
256 #if 0
257 	// TODO: temporarily disabled because CheckNode() doesn't like this...
258 	// 		Also, it's such an edge case that it's almost useless, anyway.
259 	// if the node is the last one in the tree, we shrink
260 	// the tree and file size by one node
261 	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
262 	if (offset == lastOffset) {
263 		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
264 		if (status < B_OK)
265 			return status;
266 
267 		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
268 		return B_OK;
269 	}
270 #endif
271 
272 	// add the node to the free nodes list
273 	fNode->left_link = header->free_node_pointer;
274 	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);
275 
276 	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
277 	return B_OK;
278 }
279 
280 
status_t
CachedNode::Allocate(Transaction& transaction, bplustree_node** _node,
	off_t* _offset)
{
	// Hands out an initialized, writable node in "_node" and its position
	// within the tree stream in "_offset". A node from the free list is
	// recycled when available; otherwise the stream grows by one node.
	// The caller must have the stream properly locked.
	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header* header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer (the free list is chained via the
		// nodes' left_link)
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode* stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	if (SetToWritable(transaction, offset, false) != NULL) {
		fNode->Initialize();

		*_offset = offset;
		*_node = fNode;
		return B_OK;
	}

	// revert header size to old value
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		- fTree->fNodeSize);
	RETURN_ERROR(B_ERROR);
}
336 
337 
338 //	#pragma mark -
339 
340 
// Creates a new, empty B+tree on "stream" within "transaction"; call
// InitCheck() to get the result of the initialization.
BPlusTree::BPlusTree(Transaction& transaction, Inode* stream, int32 nodeSize)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(transaction, stream);
}
350 
351 
// Opens an existing B+tree from "stream" (read-only access to the header);
// call InitCheck() to get the result of the initialization.
BPlusTree::BPlusTree(Inode* stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
	SetTo(stream);
}
361 
362 
// Creates an uninitialized B+tree object (InitCheck() returns B_NO_INIT
// until one of the SetTo() methods has been called successfully).
BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
	mutex_init(&fIteratorLock, "bfs b+tree iterator");
}
374 
375 
BPlusTree::~BPlusTree()
{
	// if there are any TreeIterators left, we need to stop them
	// (can happen when the tree's inode gets deleted while
	// traversing the tree - a TreeIterator doesn't lock the inode)
	mutex_lock(&fIteratorLock);

	SinglyLinkedList<TreeIterator>::Iterator iterator
		= fIterators.GetIterator();
	while (iterator.HasNext())
		iterator.Next()->Stop();

	// the mutex is destroyed while held - the tree is going away, so no
	// new iterators can register anymore
	mutex_destroy(&fIteratorLock);
}
390 
391 
/*! Creates a new, empty B+Tree on the specified stream: the stream is
	resized to hold the header plus one (root) node, both of which are
	initialized on disk within "transaction". Sets and returns fStatus.
*/
status_t
BPlusTree::SetTo(Transaction& transaction, Inode* stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	bplustree_header* header = fCachedHeader.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = fCachedHeader.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fHeader = header;
	// indices always allow duplicates; plain directories only when the
	// S_ALLOW_DUPS mode flag is set
	fAllowDuplicates = stream->IsIndex()
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header (all fields are stored little-endian on disk)
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer
		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	// initialize b+tree root node
	CachedNode cached(this);
	cached.SetToWritable(transaction, header->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}
438 
439 
// Opens the existing B+tree in "stream" and validates its on-disk header
// against the inode; sets and returns fStatus.
status_t
BPlusTree::SetTo(Inode* stream)
{
	if (stream == NULL)
		RETURN_ERROR(fStatus = B_BAD_VALUE);

	// get on-disk B+Tree header

	fCachedHeader.Unset();
	fStream = stream;

	fHeader = fCachedHeader.SetToHeader();
	if (fHeader == NULL)
		RETURN_ERROR(fStatus = B_NO_INIT);

	// is header valid?

	if (fHeader->MaximumSize() != stream->Size()) {
		dprintf("B+tree header size %Ld doesn't fit file size %Ld!\n",
			fHeader->MaximumSize(), stream->Size());
		// we can't change the header since we don't have a transaction
		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
	}
	if (fHeader->Magic() != BPLUSTREE_MAGIC
		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
		|| !fHeader->IsValidLink(fHeader->RootNode())
		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
#ifdef DEBUG
		dump_bplustree_header(fHeader);
		dump_block((const char*)fHeader, 128);
#endif
		RETURN_ERROR(fStatus = B_BAD_DATA);
	}

	fNodeSize = fHeader->NodeSize();

	// validity check: the header's key data type must agree with the index
	// type encoded in the inode's mode bits (kToMode is indexed by the
	// BPLUSTREE_*_TYPE constants)
	static const uint32 kToMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
		S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
		S_DOUBLE_INDEX};
	uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
		| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
		| S_FLOAT_INDEX | S_DOUBLE_INDEX);

	if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
		|| ((stream->Mode() & S_INDEX_DIR) != 0
			&& kToMode[fHeader->DataType()] != mode)
		|| !stream->IsContainer()) {
		D(	dump_bplustree_header(fHeader);
			dump_inode(&stream->Node());
		);
		RETURN_ERROR(fStatus = B_BAD_TYPE);
	}

	// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
	// in the original BFS code - we will honour it nevertheless
	fAllowDuplicates = stream->IsIndex()
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	// finally, make sure the root node can be read in
	CachedNode cached(this, fHeader->RootNode());
	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
}
502 
503 
// Returns the status of the last initialization attempt (B_NO_INIT if the
// tree has never been set to a stream).
status_t
BPlusTree::InitCheck()
{
	return fStatus;
}
509 
510 
511 int32
512 BPlusTree::TypeCodeToKeyType(type_code code)
513 {
514 	switch (code) {
515 		case B_STRING_TYPE:
516 			return BPLUSTREE_STRING_TYPE;
517 		case B_SSIZE_T_TYPE:
518 		case B_INT32_TYPE:
519 			return BPLUSTREE_INT32_TYPE;
520 		case B_SIZE_T_TYPE:
521 		case B_UINT32_TYPE:
522 			return BPLUSTREE_UINT32_TYPE;
523 		case B_OFF_T_TYPE:
524 		case B_INT64_TYPE:
525 			return BPLUSTREE_INT64_TYPE;
526 		case B_UINT64_TYPE:
527 			return BPLUSTREE_UINT64_TYPE;
528 		case B_FLOAT_TYPE:
529 			return BPLUSTREE_FLOAT_TYPE;
530 		case B_DOUBLE_TYPE:
531 			return BPLUSTREE_DOUBLE_TYPE;
532 	}
533 	return -1;
534 }
535 
536 
537 int32
538 BPlusTree::ModeToKeyType(mode_t mode)
539 {
540 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
541 			| S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
542 		case S_INT_INDEX:
543 			return BPLUSTREE_INT32_TYPE;
544 		case S_UINT_INDEX:
545 			return BPLUSTREE_UINT32_TYPE;
546 		case S_LONG_LONG_INDEX:
547 			return BPLUSTREE_INT64_TYPE;
548 		case S_ULONG_LONG_INDEX:
549 			return BPLUSTREE_UINT64_TYPE;
550 		case S_FLOAT_INDEX:
551 			return BPLUSTREE_FLOAT_TYPE;
552 		case S_DOUBLE_INDEX:
553 			return BPLUSTREE_DOUBLE_TYPE;
554 		case S_STR_INDEX:
555 		default:
556 			// default is for standard directories
557 			return BPLUSTREE_STRING_TYPE;
558 	}
559 }
560 
561 
562 //	#pragma mark -
563 
564 
565 void
566 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
567 	uint16 splitAt, int8 change)
568 {
569 	// Although every iterator which is affected by this update currently
570 	// waits on a semaphore, other iterators could be added/removed at
571 	// any time, so we need to protect this loop
572 	MutexLocker _(fIteratorLock);
573 
574 	SinglyLinkedList<TreeIterator>::Iterator iterator
575 		= fIterators.GetIterator();
576 	while (iterator.HasNext())
577 		iterator.Next()->Update(offset, nextOffset, keyIndex, splitAt, change);
578 }
579 
580 
581 void
582 BPlusTree::_AddIterator(TreeIterator* iterator)
583 {
584 	MutexLocker _(fIteratorLock);
585 	fIterators.Add(iterator);
586 }
587 
588 
589 void
590 BPlusTree::_RemoveIterator(TreeIterator* iterator)
591 {
592 	MutexLocker _(fIteratorLock);
593 	fIterators.Remove(iterator);
594 }
595 
596 
597 int32
598 BPlusTree::_CompareKeys(const void* key1, int keyLength1, const void* key2,
599 	int keyLength2)
600 {
601 	type_code type = 0;
602 	switch (fHeader->data_type) {
603 	    case BPLUSTREE_STRING_TYPE:
604 	    	type = B_STRING_TYPE;
605 	    	break;
606 		case BPLUSTREE_INT32_TYPE:
607 	    	type = B_INT32_TYPE;
608 	    	break;
609 		case BPLUSTREE_UINT32_TYPE:
610 	    	type = B_UINT32_TYPE;
611 	    	break;
612 		case BPLUSTREE_INT64_TYPE:
613 	    	type = B_INT64_TYPE;
614 	    	break;
615 		case BPLUSTREE_UINT64_TYPE:
616 	    	type = B_UINT64_TYPE;
617 	    	break;
618 		case BPLUSTREE_FLOAT_TYPE:
619 	    	type = B_FLOAT_TYPE;
620 	    	break;
621 		case BPLUSTREE_DOUBLE_TYPE:
622 	    	type = B_DOUBLE_TYPE;
623 	    	break;
624 	}
625    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
626 }
627 
628 
// Searches for "key" in "node" using binary search. On success, "_index"
// receives the key's position and "_next" its value (on index nodes that
// is the child node offset). On B_ENTRY_NOT_FOUND, "_index" receives the
// position where the key would have to be inserted, and "_next" the child
// to descend into.
status_t
BPlusTree::_FindKey(const bplustree_node* node, const uint8* key,
	uint16 keyLength, uint16* _index, off_t* _next)
{
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "find");
#endif

	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t* values = node->Values();
	int16 saveIndex = -1;
		// tracks the insert position while narrowing the range

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8* searchKey = node->KeyAt(i, &searchLength);
		// guard against corrupted on-disk data before touching the key
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
				> (uint8*)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	if (_index)
		*_index = saveIndex;
	if (_next) {
		// past the last key, the overflow link is the right child
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}
686 
687 
688 /*!	Prepares the stack to contain all nodes that were passed while
689 	following the key, from the root node to the leaf node that could
690 	or should contain that key.
691 */
692 status_t
693 BPlusTree::_SeekDown(Stack<node_and_key>& stack, const uint8* key,
694 	uint16 keyLength)
695 {
696 	// set the root node to begin with
697 	node_and_key nodeAndKey;
698 	nodeAndKey.nodeOffset = fHeader->RootNode();
699 
700 	CachedNode cached(this);
701 	const bplustree_node* node;
702 	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
703 		// if we are already on leaf level, we're done
704 		if (node->OverflowLink() == BPLUSTREE_NULL) {
705 			// node that the keyIndex is not properly set here (but it's not
706 			// needed in the calling functions anyway)!
707 			nodeAndKey.keyIndex = 0;
708 			stack.Push(nodeAndKey);
709 			return B_OK;
710 		}
711 
712 		off_t nextOffset;
713 		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
714 			&nextOffset);
715 
716 		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
717 			RETURN_ERROR(B_ERROR);
718 
719 		if ((uint32)stack.CountItems() > fHeader->MaxNumberOfLevels()) {
720 			dprintf("BPlusTree::_SeekDown() node walked too deep.\n");
721 			break;
722 		}
723 
724 		// put the node offset & the correct keyIndex on the stack
725 		stack.Push(nodeAndKey);
726 
727 		nodeAndKey.nodeOffset = nextOffset;
728 	}
729 
730 	FATAL(("BPlusTree::_SeekDown() could not open node %Ld\n",
731 		nodeAndKey.nodeOffset));
732 	return B_ERROR;
733 }
734 
735 
736 /*!	This will find a free duplicate fragment in the given bplustree_node.
737 	The CachedNode will be set to the writable fragment on success.
738 */
739 status_t
740 BPlusTree::_FindFreeDuplicateFragment(Transaction& transaction,
741 	const bplustree_node* node, CachedNode& cached,
742 	off_t* _offset, bplustree_node** _fragment, uint32* _index)
743 {
744 	off_t* values = node->Values();
745 	for (int32 i = 0; i < node->NumKeys(); i++) {
746 		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);
747 
748 		// does the value link to a duplicate fragment?
749 		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
750 			continue;
751 
752 		const bplustree_node* fragment = cached.SetTo(
753 			bplustree_node::FragmentOffset(value), false);
754 		if (fragment == NULL) {
755 			FATAL(("Could not get duplicate fragment at %Ld\n", value));
756 			continue;
757 		}
758 
759 		// see if there is some space left for us
760 		uint32 num = bplustree_node::MaxFragments(fNodeSize);
761 		for (uint32 j = 0; j < num; j++) {
762 			duplicate_array* array = fragment->FragmentAt(j);
763 
764 			if (array->count == 0) {
765 				// found an unused fragment
766 				*_fragment = cached.MakeWritable(transaction);
767 				if (*_fragment == NULL)
768 					return B_IO_ERROR;
769 
770 				*_offset = bplustree_node::FragmentOffset(value);
771 				*_index = j;
772 				return B_OK;
773 			}
774 		}
775 	}
776 	return B_ENTRY_NOT_FOUND;
777 }
778 
779 
// Adds "value" as a duplicate of the key at "index" in "node" ("cached"
// must hold that node). Duplicates are stored either in a shared fragment
// node (several small arrays per node) or, once a fragment overflows, in a
// linked list of dedicated duplicate nodes.
status_t
BPlusTree::_InsertDuplicate(Transaction& transaction, CachedNode& cached,
	const bplustree_node* node, uint16 index, off_t value)
{
	CachedNode cachedDuplicate(this);
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
	status_t status;
	off_t offset;

	if (bplustree_node::IsDuplicate(oldValue)) {
		// If it's a duplicate fragment, try to insert it into that, or if it
		// doesn't fit anymore, create a new duplicate node

		if (bplustree_node::LinkType(oldValue)
				== BPLUSTREE_DUPLICATE_FRAGMENT) {
			bplustree_node* duplicate
				= cachedDuplicate.SetToWritable(transaction,
				bplustree_node::FragmentOffset(oldValue), false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			duplicate_array* array = duplicate->FragmentAt(
				bplustree_node::FragmentIndex(oldValue));
			// a fragment in use must hold 1..NUM_FRAGMENT_VALUES entries
			if (array->count > NUM_FRAGMENT_VALUES
				|| array->count < 1) {
				FATAL(("insertDuplicate: Invalid array[%d] size in fragment "
					"%Ld == %Ld!\n",
					(int)bplustree_node::FragmentIndex(oldValue),
					bplustree_node::FragmentOffset(oldValue), array->count));
				return B_BAD_DATA;
			}

			if (array->count < NUM_FRAGMENT_VALUES) {
				array->Insert(value);
			} else {
				// Test if the fragment will be empty if we remove this key's
				// values
				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
					// The node will be empty without our values, so let us
					// reuse it as a duplicate node
					offset = bplustree_node::FragmentOffset(oldValue);

					memmove(duplicate->DuplicateArray(), array,
						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
					duplicate->left_link = duplicate->right_link
						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);

					array = duplicate->DuplicateArray();
					array->Insert(value);
				} else {
					// Create a new duplicate node

					cachedDuplicate.UnsetUnchanged(transaction);
						// The old duplicate has not been touched, so we can
						// reuse it

					bplustree_node* newDuplicate;
					status = cachedDuplicate.Allocate(transaction,
						&newDuplicate, &offset);
					if (status < B_OK)
						RETURN_ERROR(status);

					// Copy the array from the fragment node to the duplicate
					// node and free the old entry (by zero'ing all values)
					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
						array->count);
					memcpy(&newDuplicate->all_key_count, &array->values[0],
						array->count * sizeof(off_t));
					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

					array = newDuplicate->DuplicateArray();
					array->Insert(value);
				}

				// Update the main pointer to link to a duplicate node
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				values[index]
					= HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
						BPLUSTREE_DUPLICATE_NODE, offset));
			}

			return B_OK;
		}

		// Put the value into a dedicated duplicate node

		// search for free space in the duplicate nodes of that key
		// (walk the right_link chain until a node with room is found)
		duplicate_array* array;
		const bplustree_node* duplicate;
		off_t duplicateOffset;
		do {
			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
			if (duplicate == NULL)
				return B_IO_ERROR;

			array = duplicate->DuplicateArray();
			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld "
					"== %Ld!\n", duplicateOffset, array->count));
				return B_BAD_DATA;
			}
		} while (array->count >= NUM_DUPLICATE_VALUES
				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);

		bplustree_node* writableDuplicate
			= cachedDuplicate.MakeWritable(transaction);
		if (writableDuplicate == NULL)
			return B_IO_ERROR;

		if (array->count < NUM_DUPLICATE_VALUES) {
			array = writableDuplicate->DuplicateArray();
			array->Insert(value);
		} else {
			// no space left - add a new duplicate node

			bplustree_node* newDuplicate;
			status = cachedDuplicate.Allocate(transaction, &newDuplicate,
				&offset);
			if (status != B_OK)
				RETURN_ERROR(status);

			// link the two nodes together
			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);

			array = newDuplicate->DuplicateArray();
			array->count = 0;
			array->Insert(value);
		}
		return B_OK;
	}

	// Search for a free duplicate fragment or create a new one
	// to insert the duplicate value into

	uint32 fragmentIndex = 0;
	bplustree_node* fragment;
	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
			&offset, &fragment, &fragmentIndex) != B_OK) {
		// allocate a new duplicate fragment node
		status = cachedDuplicate.Allocate(transaction, &fragment, &offset);
		if (status != B_OK)
			RETURN_ERROR(status);

		memset(fragment, 0, fNodeSize);
	}

	// the fragment holds both the old single value and the new one
	duplicate_array* array = fragment->FragmentAt(fragmentIndex);
	array->Insert(oldValue);
	array->Insert(value);

	if (cached.MakeWritable(transaction) == NULL)
		return B_IO_ERROR;

	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));

	return B_OK;
}
943 
944 
// Inserts "key" (with its "value") at position "index" into "node". The
// caller is responsible for making the node writable and for ensuring
// there is enough free space in it.
void
BPlusTree::_InsertKey(bplustree_node* node, uint16 index, uint8* key,
	uint16 keyLength, off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t* values = node->Values();
	uint16* keyLengths = node->KeyLengths();
	uint8* keys = node->Keys();

	// bump the counts first - the positions of the value and key length
	// arrays are recomputed from them right below
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		+ keyLength);

	off_t* newValues = node->Values();
	uint16* newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index,
		sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index (the entries are cumulative
	// end offsets into the key data, so all entries after the insertion
	// point grow by keyLength)
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	int32 keyStart;
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0
			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}
991 
992 
/*!	Splits the \a node into two halves - the other half will be put into
	\a other. It also takes care to create a new overflow link if the node
	to split is an index node.
	On success, the key that must be propagated up into the parent node is
	written back through \a key/\a _keyLength, and \a _value is set to
	\a otherOffset (the block number of the new left sibling), ready to be
	passed to the next _InsertKey()/_SplitNode() round in Insert().
	Note, \a node and \a other are both writable, already allocated nodes;
	this method only redistributes their contents.
*/
status_t
BPlusTree::_SplitNode(bplustree_node* node, off_t nodeOffset,
	bplustree_node* other, off_t otherOffset, uint16* _keyIndex, uint8* key,
	uint16* _keyLength, off_t* _value)
{
	if (*_keyIndex > node->NumKeys() + 1)
		return B_BAD_VALUE;

	// raw views into the on-disk layout of the node to be split ("in")
	// and the new left sibling ("out")
	uint16* inKeyLengths = node->KeyLengths();
	off_t* inKeyValues = node->Values();
	uint8* inKeys = node->Keys();
	uint8* outKeys = other->Keys();
	int32 keyIndex = *_keyIndex;	// can become less than zero!

	if (keyIndex > node->NumKeys()) {
		FATAL(("key index out of bounds: %d, num keys: %u\n", (int)keyIndex,
			node->NumKeys()));
		return B_BAD_VALUE;
	}

	// how many keys will fit in one (half) page?
	// that loop will find the answer to this question and
	// change the key lengths indices for their new home

	// "bytes" is the number of bytes written for the new key,
	// "bytesBefore" are the bytes before that key
	// "bytesAfter" are the bytes after the new key, if any
	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;

	// the new sibling gets roughly half of the node size
	size_t size = fNodeSize >> 1;
	int32 out, in;
	for (in = out = 0; in < node->NumKeys() + 1;) {
		if (!bytes) {
			// "bytesBefore" tracks the cumulative key bytes consumed so far
			// (key lengths are stored as running totals on disk)
			bytesBefore = in > 0
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
		}

		if (in == keyIndex && !bytes) {
			// this slot is where the new key would be inserted
			bytes = *_keyLength;
		} else {
			if (keyIndex < out) {
				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
					- bytesBefore;
			}

			in++;
		}
		out++;

		// stop once the used bytes (header + key data, aligned) plus the
		// length/value arrays would exceed half of the node
		if (key_align(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
			// we have found the number of keys in the new node!
			break;
		}
	}

	// if the new key was not inserted, set the length of the keys
	// that can be copied directly
	if (keyIndex >= out && in > 0)
		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);

	if (bytesBefore < 0 || bytesAfter < 0)
		return B_BAD_DATA;

	// the new node becomes the left sibling of the node that is split
	other->left_link = node->left_link;
	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	uint16* outKeyLengths = other->KeyLengths();
	off_t* outKeyValues = other->Values();
	int32 keys = out > keyIndex ? keyIndex : out;

	if (bytesBefore) {
		// copy the keys
		memcpy(outKeys, inKeys, bytesBefore);
		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
	}
	if (bytes) {
		// copy the newly inserted key
		memcpy(outKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);

		if (bytesAfter) {
			// copy the keys after the new key
			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
				bytesAfter);
			keys = out - keyIndex - 1;
			for (int32 i = 0;i < keys;i++) {
				// shift the running length totals by the inserted key's size
				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i])
						+ bytes);
			}
			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
				keys * sizeof(off_t));
		}
	}

	// if the new key was already inserted, we shouldn't use it again
	if (in != out)
		keyIndex--;

	// "total" is the number of key bytes that have moved to the new node
	int32 total = bytesBefore + bytesAfter;

	// these variables are for the key that will be returned
	// to the parent node
	uint8* newKey = NULL;
	uint16 newLength;
	bool newAllocated = false;

	// If we have split an index node, we have to drop the first key
	// of the next node (which can also be the new key to insert).
	// The dropped key is also the one which has to be inserted in
	// the parent node, so we will set the "newKey" already here.
	if (node->OverflowLink() != BPLUSTREE_NULL) {
		if (in == keyIndex) {
			newKey = key;
			newLength = *_keyLength;

			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
			keyIndex--;
		} else {
			// If a key is dropped (is not the new key), we have to copy
			// it, because it would be lost if not.
			uint8* droppedKey = node->KeyAt(in, &newLength);
			// sanity check against a corrupted on-disk node before copying
			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
					> (uint8*)node + fNodeSize
				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
				fStream->GetVolume()->Panic();
				RETURN_ERROR(B_BAD_DATA);
			}
			newKey = (uint8*)malloc(newLength);
			if (newKey == NULL)
				return B_NO_MEMORY;

			newAllocated = true;
			memcpy(newKey, droppedKey, newLength);

			other->overflow_link = inKeyValues[in];
			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
		}
	}

	// and now the same game for the other page and the rest of the keys
	// (but with memmove() instead of memcpy(), because they may overlap)

	bytesBefore = bytesAfter = bytes = 0;
	out = 0;
	int32 skip = in;
	while (in < node->NumKeys() + 1) {
		if (in == keyIndex && !bytes) {
			// it's enough to set bytesBefore once here, because we do
			// not need to know the exact length of all keys in this
			// loop
			bytesBefore = in > skip
				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
			bytes = *_keyLength;
			out++;
		} else {
			if (in < node->NumKeys()) {
				// rebase the running length total: subtract the bytes that
				// went to the new sibling (and the dropped key, if any)...
				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);

				if (bytes) {
					// ...and add the new key's bytes if it was inserted here
					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);

					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
						- bytesBefore - bytes;
				}
				out++;
			}
			in++;
		}
	}

	// adjust the byte counts (since we were a bit lazy in the loop)
	if (keyIndex < skip)
		bytesBefore = node->AllKeyLength() - total;

	if (bytesBefore < 0 || bytesAfter < 0) {
		if (newAllocated)
			free(newKey);
		return B_BAD_DATA;
	}

	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
		// right link, and overflow link can stay the same
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
		+ bytesAfter);
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);

	// array positions have changed
	outKeyLengths = node->KeyLengths();
	outKeyValues = node->Values();

	// move the keys in the old node: the order is important here,
	// because we don't want to overwrite any contents

	keys = keyIndex <= skip ? out : keyIndex - skip;
	keyIndex -= skip;
	in = out - keyIndex - 1;
		// Note: keyIndex and in will contain invalid values when the new key
		// went to the other node. But in this case bytes and bytesAfter are
		// 0 and subsequently we never use keyIndex and in.

	if (bytesBefore)
		memmove(inKeys, inKeys + total, bytesBefore);
	if (bytesAfter) {
		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
			bytesAfter);
	}

	if (bytesBefore)
		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
	if (bytesAfter) {
		// if byteAfter is > 0, keyIndex is larger than skip
		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
			in * sizeof(uint16));
	}

	if (bytesBefore)
		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
	if (bytesAfter) {
		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
			in * sizeof(off_t));
	}

	if (bytes) {
		// finally, copy the newly inserted key (don't overwrite anything)
		memcpy(inKeys + bytesBefore, key, bytes);
		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
	}

	// Prepare the key that will be inserted in the parent node which
	// is either the dropped key or the last of the other node.
	// If it's the dropped key, "newKey" was already set earlier.

	if (newKey == NULL)
		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);

	memcpy(key, newKey, newLength);
	*_keyLength = newLength;
	*_value = otherOffset;

	if (newAllocated)
		free(newKey);

	return B_OK;
}
1251 
1252 
/*!	This inserts a key into the tree. The changes made to the tree will
	all be part of the \a transaction.
	Returns B_OK on success, B_NAME_IN_USE if the key already exists and
	duplicates are not allowed, or an error code describing the failure.
	You need to have the inode write locked.
*/
status_t
BPlusTree::Insert(Transaction& transaction, const uint8* key, uint16 keyLength,
	off_t value)
{
	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
		RETURN_ERROR(B_BAD_VALUE);
#ifdef DEBUG
	if (value < 0)
		panic("tried to insert invalid value %Ld!\n", value);
#endif

	ASSERT_WRITE_LOCKED_INODE(fStream);

	// collect the path from the root down to the target leaf; we walk it
	// back up while splitting nodes below
	Stack<node_and_key> stack;
	if (_SeekDown(stack, key, keyLength) != B_OK)
		RETURN_ERROR(B_ERROR);

	// local copy of the key - _SplitNode() overwrites it with the key that
	// has to be inserted into the parent node
	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];

	memcpy(keyBuffer, key, keyLength);
	keyBuffer[keyLength] = 0;

	node_and_key nodeAndKey;
	const bplustree_node* node;

	CachedNode cached(this);
	while (stack.Pop(&nodeAndKey)
		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "insert");
#endif
		if (node->IsLeaf()) {
			// first round, check for duplicate entries
			status_t status = _FindKey(node, key, keyLength,
				&nodeAndKey.keyIndex);

			// is this a duplicate entry?
			if (status == B_OK) {
				if (fAllowDuplicates) {
					status = _InsertDuplicate(transaction, cached, node,
						nodeAndKey.keyIndex, value);
					if (status != B_OK)
						RETURN_ERROR(status);
					return B_OK;
				}

				return B_NAME_IN_USE;
			}
		}

		bplustree_node* writableNode = cached.MakeWritable(transaction);
		if (writableNode == NULL)
			return B_IO_ERROR;

		// is the node big enough to hold the pair?
		if (int32(key_align(sizeof(bplustree_node)
				+ writableNode->AllKeyLength() + keyLength)
				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
				+ sizeof(off_t))) < fNodeSize) {
			// it fits - insert here and we're done, no split necessary
			_InsertKey(writableNode, nodeAndKey.keyIndex,
				keyBuffer, keyLength, value);
			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
				nodeAndKey.keyIndex, 0, 1);

			return B_OK;
		} else {
			CachedNode cachedNewRoot(this);
			CachedNode cachedOther(this);

			// do we need to allocate a new root node? if so, then do
			// it now
			off_t newRoot = BPLUSTREE_NULL;
			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
				bplustree_node* root;
				status_t status = cachedNewRoot.Allocate(transaction, &root,
					&newRoot);
				if (status < B_OK) {
					// The tree is most likely corrupted!
					// But it's still sane at leaf level - we could set
					// a flag in the header that forces the tree to be
					// rebuild next time...
					// But since we will have journaling, that's not a big
					// problem anyway.
					RETURN_ERROR(status);
				}
			}

			// reserve space for the other node
			bplustree_node* other;
			off_t otherOffset;
			status_t status = cachedOther.Allocate(transaction, &other,
				&otherOffset);
			if (status < B_OK) {
				cachedNewRoot.Free(transaction, newRoot);
				RETURN_ERROR(status);
			}

			// _SplitNode() rewrites keyBuffer/keyLength/value to the pair
			// that has to be inserted into the parent on the next iteration
			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
					&value) < B_OK) {
				// free root node & other node here
				cachedOther.Free(transaction, otherOffset);
				cachedNewRoot.Free(transaction, newRoot);

				RETURN_ERROR(B_ERROR);
			}
#ifdef DEBUG
			checker.Check("insert split");
			NodeChecker otherChecker(other, fNodeSize, "insert split other");
#endif

			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);

			// update the right link of the node in the left of the new node
			if ((other = cachedOther.SetToWritable(transaction,
					other->LeftLink())) != NULL) {
				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
			}

			// create a new root if necessary
			if (newRoot != BPLUSTREE_NULL) {
				bplustree_node* root = cachedNewRoot.Node();

				_InsertKey(root, 0, keyBuffer, keyLength,
					writableNode->LeftLink());
				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
					nodeAndKey.nodeOffset);

				bplustree_header* header
					= fCachedHeader.MakeWritableHeader(transaction);
				if (header == NULL)
					return B_IO_ERROR;

				// finally, update header to point to the new root
				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
					header->MaxNumberOfLevels() + 1);

				return B_OK;
			}
		}
	}
	RETURN_ERROR(B_ERROR);
}
1403 
1404 
/*!	Removes the duplicate index/value pair from the tree.
	It's part of the private tree interface.
	\a node is the leaf node that contains the duplicate link at \a index,
	\a cached is the CachedNode holding that leaf (made writable on demand),
	and \a value is the entry to remove from the duplicate fragment/node
	chain the link points to.
*/
status_t
BPlusTree::_RemoveDuplicate(Transaction& transaction,
	const bplustree_node* node, CachedNode& cached, uint16 index,
	off_t value)
{
	off_t* values = node->Values();
	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);

	CachedNode cachedDuplicate(this);
	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
	bplustree_node* duplicate = cachedDuplicate.SetToWritable(transaction,
		duplicateOffset, false);
	if (duplicate == NULL)
		return B_IO_ERROR;

	// if it's a duplicate fragment, remove the entry from there
	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
		duplicate_array* array = duplicate->FragmentAt(
			bplustree_node::FragmentIndex(oldValue));

		// sanity check the on-disk count before touching the array
		if (array->count > NUM_FRAGMENT_VALUES
			|| array->count < 1) {
			FATAL(("removeDuplicate: Invalid array[%d] size in fragment %Ld "
				"== %Ld!\n", (int)bplustree_node::FragmentIndex(oldValue),
				duplicateOffset, array->count));
			return B_BAD_DATA;
		}
		if (!array->Remove(value)) {
			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
				value, duplicateOffset));
			return B_ENTRY_NOT_FOUND;
		}

		// remove the array from the fragment node if it is empty
		if (array->count == 1) {
			// set the link to the remaining value
			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = array->values[0];

			// Remove the whole fragment node, if this was the only array,
			// otherwise free just the array
			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
				status_t status = cachedDuplicate.Free(transaction,
					duplicateOffset);
				if (status < B_OK)
					return status;
			} else
				array->count = 0;
		}
		return B_OK;
	}

	// Remove value from a duplicate node!

	duplicate_array* array = NULL;

	// the first node of a duplicate chain must not have a left sibling
	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
		FATAL(("invalid duplicate node: first left link points to %Ld!\n",
			duplicate->LeftLink()));
		return B_BAD_DATA;
	}

	// Search the duplicate nodes until the entry could be found (and removed)
	while (duplicate != NULL) {
		array = duplicate->DuplicateArray();
		if (array->count > NUM_DUPLICATE_VALUES
			|| array->count < 0) {
			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == "
				"%Ld!\n", duplicateOffset, array->count));
			return B_BAD_DATA;
		}

		if (array->Remove(value))
			break;

		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
			RETURN_ERROR(B_ENTRY_NOT_FOUND);

		// this node was not modified - don't leave it dirty in the cache
		cachedDuplicate.UnsetUnchanged(transaction);
		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset,
			false);
	}
	if (duplicate == NULL)
		RETURN_ERROR(B_IO_ERROR);

	// The entry got removed from the duplicate node, but we might want to free
	// it now in case it's empty

	// Note, this loop only repeats via the "continue" statements below, when
	// a neighbouring node has to be rechecked after relinking; all other
	// paths return directly.
	while (true) {
		off_t left = duplicate->LeftLink();
		off_t right = duplicate->RightLink();
		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;

		if ((isLast && array->count == 1) || array->count == 0) {
			// Free empty duplicate page, link their siblings together, and
			// update the duplicate link if needed (ie. when we either remove
			// the last duplicate node or have a new first one)

			if (left == BPLUSTREE_NULL) {
				// the duplicate link points to us
				if (cached.MakeWritable(transaction) == NULL)
					return B_IO_ERROR;

				if (array->count == 1) {
					// This is the last node, and there is only one value left;
					// replace the duplicate link with that value, it's no
					// duplicate anymore
					values[index] = array->values[0];
				} else {
					// Move the duplicate link to the next node
					values[index] = HOST_ENDIAN_TO_BFS_INT64(
						bplustree_node::MakeLink(
							BPLUSTREE_DUPLICATE_NODE, right));
				}
			}

			status_t status = cachedDuplicate.Free(transaction,
				duplicateOffset);
			if (status < B_OK)
				return status;

			if (left != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left,
						false)) != NULL) {
				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);

				// If the next node is the last node, we need to free that node
				// and convert the duplicate entry back into a normal entry
				array = duplicate->DuplicateArray();
				if (right == BPLUSTREE_NULL
					&& duplicate->LeftLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = left;
					continue;
				}
			}
			if (right != BPLUSTREE_NULL
				&& (duplicate = cachedDuplicate.SetToWritable(transaction,
						right, false)) != NULL) {
				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);

				// Again, we may need to turn the duplicate entry back into a
				// normal entry
				array = duplicate->DuplicateArray();
				if (left == BPLUSTREE_NULL
					&& duplicate->RightLink() == BPLUSTREE_NULL
					&& array->count <= NUM_FRAGMENT_VALUES) {
					duplicateOffset = right;
					continue;
				}
			}
			return B_OK;
		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
			// If the number of entries fits in a duplicate fragment, then
			// either find a free fragment node, or convert this node to a
			// fragment node.
			CachedNode cachedOther(this);

			bplustree_node* fragment = NULL;
			uint32 fragmentIndex = 0;
			off_t offset;
			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
					&offset, &fragment, &fragmentIndex) == B_OK) {
				// move to other node
				duplicate_array* target = fragment->FragmentAt(fragmentIndex);
				memcpy(target, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));

				cachedDuplicate.Free(transaction, duplicateOffset);
				duplicateOffset = offset;
			} else {
				// convert node
				memmove(duplicate, array,
					(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
				memset((off_t*)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
			}

			if (cached.MakeWritable(transaction) == NULL)
				return B_IO_ERROR;

			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
		}
		return B_OK;
	}
}
1597 
1598 
1599 /*!	Removes the key with the given index from the specified node.
1600 	Since it has to get the key from the node anyway (to obtain it's
1601 	pointer), it's not needed to pass the key & its length, although
1602 	the calling method (BPlusTree::Remove()) have this data.
1603 */
1604 void
1605 BPlusTree::_RemoveKey(bplustree_node* node, uint16 index)
1606 {
1607 	// should never happen, but who knows?
1608 	if (index > node->NumKeys() && node->NumKeys() > 0) {
1609 		FATAL(("Asked me to remove key outer limits: %u\n", index));
1610 		return;
1611 	}
1612 
1613 	off_t* values = node->Values();
1614 
1615 	// if we would have to drop the overflow link, drop
1616 	// the last key instead and update the overflow link
1617 	// to the value of that one
1618 	if (!node->IsLeaf() && index == node->NumKeys())
1619 		node->overflow_link = values[--index];
1620 
1621 	uint16 length;
1622 	uint8* key = node->KeyAt(index, &length);
1623 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8*)node + fNodeSize
1624 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1625 		FATAL(("Key length to long: %s, %u (inode at %d,%u)\n", key, length,
1626 			(int)fStream->BlockRun().allocation_group,
1627 			fStream->BlockRun().start));
1628 		fStream->GetVolume()->Panic();
1629 		return;
1630 	}
1631 
1632 	uint16* keyLengths = node->KeyLengths();
1633 	uint8* keys = node->Keys();
1634 
1635 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
1636 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(
1637 		node->AllKeyLength() - length);
1638 
1639 	off_t* newValues = node->Values();
1640 	uint16* newKeyLengths = node->KeyLengths();
1641 
1642 	// move key data
1643 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
1644 
1645 	// move and update key lengths
1646 	if (index > 0 && newKeyLengths != keyLengths)
1647 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
1648 	for (uint16 i = index; i < node->NumKeys(); i++) {
1649 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1650 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
1651 	}
1652 
1653 	// move values
1654 	if (index > 0)
1655 		memmove(newValues, values, index * sizeof(off_t));
1656 	if (node->NumKeys() > index) {
1657 		memmove(newValues + index, values + index + 1,
1658 			(node->NumKeys() - index) * sizeof(off_t));
1659 	}
1660 }
1661 
1662 
1663 /*!	Removes the specified key from the tree. The "value" parameter is only used
1664 	for trees which allow duplicates, so you may safely ignore it.
1665 	It's not an optional parameter, so at least you have to think about it.
1666 	You need to have the inode write locked.
1667 */
1668 status_t
1669 BPlusTree::Remove(Transaction& transaction, const uint8* key, uint16 keyLength,
1670 	off_t value)
1671 {
1672 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1673 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1674 		RETURN_ERROR(B_BAD_VALUE);
1675 
1676 	ASSERT_WRITE_LOCKED_INODE(fStream);
1677 
1678 	Stack<node_and_key> stack;
1679 	if (_SeekDown(stack, key, keyLength) != B_OK)
1680 		RETURN_ERROR(B_ERROR);
1681 
1682 	node_and_key nodeAndKey;
1683 	const bplustree_node* node;
1684 
1685 	CachedNode cached(this);
1686 	while (stack.Pop(&nodeAndKey)
1687 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1688 #ifdef DEBUG
1689 		NodeChecker checker(node, fNodeSize, "remove");
1690 #endif
1691 		if (node->IsLeaf()) {
1692 			// first round, check for duplicate entries
1693 			status_t status = _FindKey(node, key, keyLength,
1694 				&nodeAndKey.keyIndex);
1695 			if (status != B_OK)
1696 				RETURN_ERROR(status);
1697 
1698 			// Is this a duplicate entry?
1699 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
1700 					node->Values()[nodeAndKey.keyIndex]))) {
1701 				if (fAllowDuplicates) {
1702 					return _RemoveDuplicate(transaction, node, cached,
1703 						nodeAndKey.keyIndex, value);
1704 				}
1705 
1706 				FATAL(("dupliate node found where no duplicates are "
1707 					"allowed!\n"));
1708 				RETURN_ERROR(B_ERROR);
1709 			} else {
1710 				if (node->Values()[nodeAndKey.keyIndex] != value)
1711 					return B_ENTRY_NOT_FOUND;
1712 
1713 				// If we will remove the last key, the iterator will be set
1714 				// to the next node after the current - if there aren't any
1715 				// more nodes, we need a way to prevent the TreeIterators to
1716 				// touch the old node again, we use BPLUSTREE_FREE for this
1717 				off_t next = node->RightLink() == BPLUSTREE_NULL
1718 					? BPLUSTREE_FREE : node->RightLink();
1719 				_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
1720 					? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
1721 			}
1722 		}
1723 
1724 		bplustree_node* writableNode = cached.MakeWritable(transaction);
1725 		if (writableNode == NULL)
1726 			return B_IO_ERROR;
1727 
1728 		// if it's an empty root node, we have to convert it
1729 		// to a leaf node by dropping the overflow link, or,
1730 		// if it's already a leaf node, just empty it
1731 		if (nodeAndKey.nodeOffset == fHeader->RootNode()
1732 			&& (node->NumKeys() == 0
1733 				|| (node->NumKeys() == 1 && node->IsLeaf()))) {
1734 			writableNode->overflow_link
1735 				= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1736 			writableNode->all_key_count = 0;
1737 			writableNode->all_key_length = 0;
1738 
1739 			// if we've made a leaf node out of the root node, we need
1740 			// to reset the maximum number of levels in the header
1741 			if (fHeader->MaxNumberOfLevels() != 1) {
1742 				bplustree_header* header
1743 					= fCachedHeader.MakeWritableHeader(transaction);
1744 				if (header == NULL)
1745 					return B_IO_ERROR;
1746 
1747 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
1748 			}
1749 			return B_OK;
1750 		}
1751 
1752 		// if there is only one key left, we don't have to remove
1753 		// it, we can just dump the node (index nodes still have
1754 		// the overflow link, so we have to drop the last key)
1755 		if (writableNode->NumKeys() > 1
1756 			|| (!writableNode->IsLeaf() && writableNode->NumKeys() == 1)) {
1757 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
1758 			return B_OK;
1759 		}
1760 
1761 		// when we are here, we can just free the node, but
1762 		// we have to update the right/left link of the
1763 		// siblings first
1764 		CachedNode otherCached(this);
1765 		bplustree_node* other = otherCached.SetToWritable(transaction,
1766 			writableNode->LeftLink());
1767 		if (other != NULL)
1768 			other->right_link = writableNode->right_link;
1769 
1770 		if ((other = otherCached.SetToWritable(transaction, node->RightLink()))
1771 				!= NULL) {
1772 			other->left_link = writableNode->left_link;
1773 		}
1774 
1775 		cached.Free(transaction, nodeAndKey.nodeOffset);
1776 	}
1777 	RETURN_ERROR(B_ERROR);
1778 }
1779 
1780 
1781 /*!	Replaces the value for the key in the tree.
1782 	Returns B_OK if the key could be found and its value replaced,
1783 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
1784 	to indicate that something went terribly wrong.
1785 	Note that this doesn't work with duplicates - it will just
1786 	return B_BAD_TYPE if you call this function on a tree where
1787 	duplicates are allowed.
1788 	You need to have the inode write locked.
1789 */
1790 status_t
1791 BPlusTree::Replace(Transaction& transaction, const uint8* key, uint16 keyLength,
1792 	off_t value)
1793 {
1794 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1795 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1796 		|| key == NULL)
1797 		RETURN_ERROR(B_BAD_VALUE);
1798 
1799 	if (fAllowDuplicates)
1800 		RETURN_ERROR(B_BAD_TYPE);
1801 
1802 	ASSERT_WRITE_LOCKED_INODE(fStream);
1803 
1804 	off_t nodeOffset = fHeader->RootNode();
1805 	CachedNode cached(this);
1806 	const bplustree_node* node;
1807 
1808 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1809 		uint16 keyIndex = 0;
1810 		off_t nextOffset;
1811 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
1812 			&nextOffset);
1813 
1814 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1815 			if (status == B_OK) {
1816 				bplustree_node* writableNode = cached.MakeWritable(transaction);
1817 				if (writableNode != NULL) {
1818 					writableNode->Values()[keyIndex]
1819 						= HOST_ENDIAN_TO_BFS_INT64(value);
1820 				} else
1821 					status = B_IO_ERROR;
1822 			}
1823 
1824 			return status;
1825 		} else if (nextOffset == nodeOffset)
1826 			RETURN_ERROR(B_ERROR);
1827 
1828 		nodeOffset = nextOffset;
1829 	}
1830 	RETURN_ERROR(B_ERROR);
1831 }
1832 
1833 
1834 /*!	Searches the key in the tree, and stores the offset found in _value,
1835 	if successful.
1836 	It's very similar to BPlusTree::SeekDown(), but doesn't fill a stack
1837 	while it descends the tree.
1838 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND if not.
1839 	It can also return other errors to indicate that something went wrong.
1840 	Note that this doesn't work with duplicates - it will just return
1841 	B_BAD_TYPE if you call this function on a tree where duplicates are
1842 	allowed.
1843 	You need to have the inode read or write locked.
1844 */
1845 status_t
1846 BPlusTree::Find(const uint8* key, uint16 keyLength, off_t* _value)
1847 {
1848 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1849 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1850 		|| key == NULL)
1851 		RETURN_ERROR(B_BAD_VALUE);
1852 
1853 	if (fAllowDuplicates)
1854 		RETURN_ERROR(B_BAD_TYPE);
1855 
1856 	ASSERT_READ_LOCKED_INODE(fStream);
1857 
1858 	off_t nodeOffset = fHeader->RootNode();
1859 	CachedNode cached(this);
1860 	const bplustree_node* node;
1861 
1862 #ifdef DEBUG
1863 	int32 levels = 0;
1864 #endif
1865 
1866 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1867 		uint16 keyIndex = 0;
1868 		off_t nextOffset;
1869 		status_t status = _FindKey(node, key, keyLength, &keyIndex,
1870 			&nextOffset);
1871 
1872 #ifdef DEBUG
1873 		levels++;
1874 #endif
1875 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1876 			if (status == B_OK && _value != NULL)
1877 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
1878 
1879 #ifdef DEBUG
1880 			if (levels != (int32)fHeader->MaxNumberOfLevels())
1881 				DEBUGGER(("levels don't match"));
1882 #endif
1883 			return status;
1884 		} else if (nextOffset == nodeOffset)
1885 			RETURN_ERROR(B_ERROR);
1886 
1887 		nodeOffset = nextOffset;
1888 	}
1889 	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
1890 	RETURN_ERROR(B_ERROR);
1891 }
1892 
1893 
1894 //	#pragma mark -
1895 
1896 
/*!	Constructs an iterator for \a tree, and registers it with the tree so
	that the tree can keep it up to date (via _UpdateIterators()) while
	entries are inserted or removed. The iterator starts out unpositioned
	(fCurrentNodeOffset is BPLUSTREE_NULL).
*/
TreeIterator::TreeIterator(BPlusTree* tree)
	:
	fTree(tree),
	fCurrentNodeOffset(BPLUSTREE_NULL)
{
	tree->_AddIterator(this);
}
1904 
1905 
1906 TreeIterator::~TreeIterator()
1907 {
1908 	if (fTree)
1909 		fTree->_RemoveIterator(this);
1910 }
1911 
1912 
/*!	Positions the iterator just before the first key (\a to ==
	BPLUSTREE_BEGIN) or just after the last key (\a to == BPLUSTREE_END)
	of the tree, by descending along the leftmost resp. rightmost child
	links down to the leaf level. Also resets any duplicate iteration
	state (fDuplicateNode).
*/
status_t
TreeIterator::Goto(int8 to)
{
	if (fTree == NULL || fTree->fHeader == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	off_t nodeOffset = fTree->fHeader->RootNode();
	CachedNode cached(fTree);
	const bplustree_node* node;

	while ((node = cached.SetTo(nodeOffset)) != NULL) {
		// is the node a leaf node?
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			fCurrentNodeOffset = nodeOffset;
			// -1 resp. NumKeys() so that the first Traverse() step lands
			// on the first resp. last key
			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
			fDuplicateNode = BPLUSTREE_NULL;

			return B_OK;
		}

		// get the next node offset depending on the direction (and if there
		// are any keys in that node at all)
		off_t nextOffset;
		if (to == BPLUSTREE_END || node->all_key_count == 0)
			nextOffset = node->OverflowLink();
		else {
			// sanity check the node before dereferencing its value array
			if (node->AllKeyLength() > fTree->fNodeSize
				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
					- 8 * node->NumKeys())
				RETURN_ERROR(B_ERROR);

			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
		}
		// guard against a self-referencing (corrupted) node
		if (nextOffset == nodeOffset)
			break;

		nodeOffset = nextOffset;
	}
	FATAL(("%s fails\n", __FUNCTION__));

	RETURN_ERROR(B_ERROR);
}
1958 
1959 
/*!	Iterates through the tree in the specified direction.
	When it iterates through duplicates, the "key" is only updated for the
	first entry - if you need to know when this happens, use the "duplicate"
	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
	the other duplicates.
	That's not too nice, but saves the 256 bytes that would be needed to
	store the last key - if this will ever become an issue, it will be
	easy to change.
	The other advantage of this is, that the queries can skip all duplicates
	at once when they are not relevant to them.

	\a direction is used as the key delta, so it is expected to be
	BPLUSTREE_FORWARD or BPLUSTREE_BACKWARD. At most \a maxLength bytes of
	the key are copied into \a key (string keys get null-terminated), the
	actual length is stored in \a keyLength, and the entry's value in
	\a value.
	Returns B_INTERRUPTED if the iterator was stopped, B_ENTRY_NOT_FOUND
	when the end of the tree is reached (the position is kept in that
	case), and B_ERROR/B_BAD_DATA on corruption.
*/
status_t
TreeIterator::Traverse(int8 direction, void* key, uint16* keyLength,
	uint16 maxLength, off_t* value, uint16* duplicate)
{
	if (fTree == NULL)
		return B_INTERRUPTED;

	bool forward = direction == BPLUSTREE_FORWARD;

	// no position yet? start out at the respective end of the tree
	if (fCurrentNodeOffset == BPLUSTREE_NULL
		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
		RETURN_ERROR(B_ERROR);

	// if the tree was emptied since the last call
	if (fCurrentNodeOffset == BPLUSTREE_FREE)
		return B_ENTRY_NOT_FOUND;

	// lock access to stream
	InodeReadLocker locker(fTree->fStream);

	CachedNode cached(fTree);
	const bplustree_node* node;

	if (fDuplicateNode != BPLUSTREE_NULL) {
		// regardless of traverse direction the duplicates are always presented
		// in the same order; since they are all considered as equal, this
		// shouldn't cause any problems

		if (!fIsFragment || fDuplicate < fNumDuplicates) {
			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
				false);
		} else
			node = NULL;

		if (node != NULL) {
			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
				// If the node is out of duplicates, we go directly to the next
				// one
				fDuplicateNode = node->RightLink();
				if (fDuplicateNode != BPLUSTREE_NULL
					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
					fNumDuplicates = node->CountDuplicates(fDuplicateNode,
						false);
					fDuplicate = 0;
				}
			}
			if (fDuplicate < fNumDuplicates) {
				// hand out the next duplicate; the key stays untouched
				*value = node->DuplicateAt(fDuplicateNode, fIsFragment,
					fDuplicate++);
				if (duplicate)
					*duplicate = 2;
				return B_OK;
			}
		}
		// all duplicates of the current key have been delivered - go on
		// with the next key below
		fDuplicateNode = BPLUSTREE_NULL;
	}

	// remember the current position, so that we can turn back if there
	// are no more entries in the chosen direction
	off_t savedNodeOffset = fCurrentNodeOffset;
	int32 savedKey = fCurrentKey;

	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
		RETURN_ERROR(B_ERROR);

	if (duplicate)
		*duplicate = 0;

	// advance the position by the given key delta
	fCurrentKey += direction;

	// is the current key in the current node?
	while ((forward && fCurrentKey >= node->NumKeys())
			|| (!forward && fCurrentKey < 0)) {
		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();

		// are there any more nodes?
		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
			node = cached.SetTo(fCurrentNodeOffset);
			if (!node)
				RETURN_ERROR(B_ERROR);

			// reset current key
			fCurrentKey = forward ? 0 : node->NumKeys();
		} else {
			// there are no nodes left, so turn back to the last key
			fCurrentNodeOffset = savedNodeOffset;
			fCurrentKey = savedKey;

			return B_ENTRY_NOT_FOUND;
		}
	}

	if (node->all_key_count == 0)
		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?

	// copy the key into the caller's buffer, but only after making sure it
	// lies completely within the node (the disk might be corrupted)
	uint16 length;
	uint8* keyStart = node->KeyAt(fCurrentKey, &length);
	if (keyStart + length + sizeof(off_t) + sizeof(uint16)
			> (uint8*)node + fTree->fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		fTree->fStream->GetVolume()->Panic();
		RETURN_ERROR(B_BAD_DATA);
	}

	length = min_c(length, maxLength);
	memcpy(key, keyStart, length);

	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE)	{
		// terminate string type
		if (length == maxLength)
			length--;
		((char*)key)[length] = '\0';
	}
	*keyLength = length;

	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);

	// duplicate fragments?
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT
		|| type == BPLUSTREE_DUPLICATE_NODE) {
		fDuplicateNode = offset;

		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode),
			false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			// hand out the first duplicate - the following ones are
			// delivered by the duplicate handling at the top of this method
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// Shouldn't happen, but we're dealing here with potentially
			// corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}
2116 
2117 
2118 /*!	This is more or less a copy of BPlusTree::Find() - but it just
2119 	sets the current position in the iterator, regardless of if the
2120 	key could be found or not.
2121 */
2122 status_t
2123 TreeIterator::Find(const uint8* key, uint16 keyLength)
2124 {
2125 	if (fTree == NULL)
2126 		return B_INTERRUPTED;
2127 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
2128 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
2129 		|| key == NULL)
2130 		RETURN_ERROR(B_BAD_VALUE);
2131 
2132 	// lock access to stream
2133 	InodeReadLocker locker(fTree->fStream);
2134 
2135 	off_t nodeOffset = fTree->fHeader->RootNode();
2136 
2137 	CachedNode cached(fTree);
2138 	const bplustree_node* node;
2139 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2140 		uint16 keyIndex = 0;
2141 		off_t nextOffset;
2142 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2143 			&nextOffset);
2144 
2145 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2146 			fCurrentNodeOffset = nodeOffset;
2147 			fCurrentKey = keyIndex - 1;
2148 			fDuplicateNode = BPLUSTREE_NULL;
2149 
2150 			return status;
2151 		} else if (nextOffset == nodeOffset)
2152 			RETURN_ERROR(B_ERROR);
2153 
2154 		nodeOffset = nextOffset;
2155 	}
2156 	RETURN_ERROR(B_ERROR);
2157 }
2158 
2159 
/*!	Skips all remaining duplicates of the current key, so that the next
	Traverse() call moves directly on to the next key.
*/
void
TreeIterator::SkipDuplicates()
{
	fDuplicateNode = BPLUSTREE_NULL;
}
2165 
2166 
/*!	Called by the tree to keep this iterator's position in sync with a
	change in the node at \a offset: \a nextOffset names the node that
	took over the keys from index \a splitAt onwards (or BPLUSTREE_NULL
	if the node was not split), and \a change is the key count delta
	(presumably +1/-1 for insert/remove) applied at \a keyIndex.
*/
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex,
	uint16 splitAt, int8 change)
{
	// ignore changes in nodes other than our current one
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// the node has been split - continue in the target node, and
		// renumber positions at/behind the split point
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// TODO: duplicate handling!
}
2190 
2191 
/*!	Called by the tree to detach itself from this iterator (i.e. when the
	tree goes away before the iterator does). Any further use of the
	iterator then fails gracefully: Goto() returns B_BAD_VALUE, and
	Traverse()/Find() return B_INTERRUPTED.
*/
void
TreeIterator::Stop()
{
	fTree = NULL;
}
2197 
2198 
#ifdef DEBUG
//!	Dumps the iterator's complete state to the debug output.
void
TreeIterator::Dump()
{
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n",
		bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode,
		fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif
2215 
2216 
2217 //	#pragma mark -
2218 
2219 
2220 void
2221 bplustree_node::Initialize()
2222 {
2223 	left_link = right_link = overflow_link
2224 		= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2225 	all_key_count = 0;
2226 	all_key_length = 0;
2227 }
2228 
2229 
2230 uint8*
2231 bplustree_node::KeyAt(int32 index, uint16* keyLength) const
2232 {
2233 	if (index < 0 || index > NumKeys())
2234 		return NULL;
2235 
2236 	uint8* keyStart = Keys();
2237 	uint16* keyLengths = KeyLengths();
2238 
2239 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2240 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2241 	if (index > 0)
2242 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2243 
2244 	return keyStart;
2245 }
2246 
2247 
/*!	Returns the number of duplicate values stored in the duplicate
	fragment resp. duplicate node that \a offset refers to.
*/
uint8
bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
{
	// the duplicate fragment handling is currently hard-coded to a node size
	// of 1024 bytes - with future versions of BFS, this may be a problem

	if (isFragment) {
		// The lowest 10 bits of the link select the fragment within this
		// node; each fragment occupies NUM_FRAGMENT_VALUES + 1 off_t
		// values, the first of which holds the count (cf. DuplicateAt()).
		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);

		return ((off_t*)this)[fragment];
	}
	// in a dedicated duplicate node, the count is kept in the overflow link
	return OverflowLink();
}
2261 
2262 
/*!	Returns the \a index'th duplicate value of the duplicate fragment
	resp. duplicate node that \a offset refers to.
*/
off_t
bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
{
	// "start" is counted in off_t values: for a fragment, the lowest
	// 10 bits of the link select the fragment (8 off_t values each, see
	// CountDuplicates()); for a duplicate node, the values follow the
	// left/right link fields. "start + 1" then skips the slot in front of
	// the values (the fragment's count, resp. the node's overflow link
	// which holds the count).
	uint32 start;
	if (isFragment)
		start = 8 * ((uint64)offset & 0x3ff);
	else
		start = 2;

	return ((off_t*)this)[start + 1 + index];
}
2274 
2275 
2276 /*!	Although the name suggests it, this function doesn't return the real
2277 	used fragment count; at least, it can only count to two: it returns
2278 	0, if there is no fragment used, 1 if there is only one fragment
2279 	used, and 2 if there are at least 2 fragments used.
2280 */
2281 uint32
2282 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2283 {
2284 	uint32 used = 0;
2285 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2286 		duplicate_array* array = FragmentAt(i);
2287 		if (array->count > 0 && ++used > 1)
2288 			return used;
2289 	}
2290 	return used;
2291 }
2292 
2293 
#ifdef DEBUG
/*!	Performs a sanity check of this node: the key count and total key
	length must fit into the node, every key (plus its value and length
	entry) must lie completely within the node, and no value may be -1.
	Returns B_BAD_DATA for the first corruption found, B_OK otherwise.
*/
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
		DEBUGGER(("invalid node: key/length count"));

	for (int32 i = 0; i < NumKeys(); i++) {
		uint16 length;
		uint8* key = KeyAt(i, &length);
		// the key, its value, and its length entry must all fit in the node
		if (key + length + sizeof(off_t) + sizeof(uint16)
				> (uint8*)this + nodeSize
			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
			dprintf("node %p, key %ld\n", this, i);
			DEBUGGER(("invalid node: keys corrupted"));
			return B_BAD_DATA;
		}
		if (Values()[i] == -1) {
			dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]);
			DEBUGGER(("invalid node: values corrupted"));
			return B_BAD_DATA;
		}
	}
	return B_OK;
}
#endif
2320 
2321 
2322 //	#pragma mark -
2323 
2324 
2325 int32
2326 compareKeys(type_code type, const void* key1, int keyLength1,
2327 	const void* key2, int keyLength2)
2328 {
2329 	// if one of the keys is NULL, bail out gracefully
2330 	if (key1 == NULL || key2 == NULL) {
2331 		// even if it's most probably a bug in the calling code, we try to
2332 		// give a meaningful result
2333 		if (key1 == NULL && key2 != NULL)
2334 			return 1;
2335 		if (key2 == NULL && key1 != NULL)
2336 			return -1;
2337 
2338 		return 0;
2339 	}
2340 
2341 	switch (type) {
2342 	    case B_STRING_TYPE:
2343     	{
2344 			int result = memcmp(key1, key2, min_c(keyLength1, keyLength2));
2345 			if (result == 0) {
2346 				// ignore trailing null bytes
2347 				if ((keyLength1 == keyLength2 + 1
2348 						&& ((uint8*)key1)[keyLength2] == '\0')
2349 					|| (keyLength2 == keyLength1 + 1
2350 						&& ((uint8*)key2)[keyLength1] == '\0'))
2351 					return 0;
2352 
2353 				result = keyLength1 - keyLength2;
2354 			}
2355 
2356 			return result;
2357 		}
2358 
2359 		case B_SSIZE_T_TYPE:
2360 		case B_INT32_TYPE:
2361 			return *(int32*)key1 - *(int32*)key2;
2362 
2363 		case B_SIZE_T_TYPE:
2364 		case B_UINT32_TYPE:
2365 			if (*(uint32*)key1 == *(uint32*)key2)
2366 				return 0;
2367 			if (*(uint32*)key1 > *(uint32*)key2)
2368 				return 1;
2369 
2370 			return -1;
2371 
2372 		case B_OFF_T_TYPE:
2373 		case B_INT64_TYPE:
2374 			if (*(int64*)key1 == *(int64*)key2)
2375 				return 0;
2376 			if (*(int64*)key1 > *(int64*)key2)
2377 				return 1;
2378 
2379 			return -1;
2380 
2381 		case B_UINT64_TYPE:
2382 			if (*(uint64*)key1 == *(uint64*)key2)
2383 				return 0;
2384 			if (*(uint64*)key1 > *(uint64*)key2)
2385 				return 1;
2386 
2387 			return -1;
2388 
2389 		case B_FLOAT_TYPE:
2390 		{
2391 			float result = *(float*)key1 - *(float*)key2;
2392 			if (result == 0.0f)
2393 				return 0;
2394 
2395 			return (result < 0.0f) ? -1 : 1;
2396 		}
2397 
2398 		case B_DOUBLE_TYPE:
2399 		{
2400 			double result = *(double*)key1 - *(double*)key2;
2401 			if (result == 0.0)
2402 				return 0;
2403 
2404 			return (result < 0.0) ? -1 : 1;
2405 		}
2406 	}
2407 
2408 	// if the type is unknown, the entries don't match...
2409 	return -1;
2410 }
2411 
2412