xref: /haiku/src/add-ons/kernel/file_systems/bfs/BPlusTree.cpp (revision 1214ef1b2100f2b3299fc9d8d6142e46f70a4c3f)
1 /*
2  * Copyright 2001-2007, Axel Dörfler, axeld@pinc-software.de.
3  * This file may be used under the terms of the MIT License.
4  *
5  * Roughly based on 'btlib' written by Marcus J. Ranum - it shares
6  * no code but achieves binary compatibility with the on disk format.
7  */
8 
9 //! B+Tree implementation
10 
11 
12 #include "Debug.h"
13 #include "BPlusTree.h"
14 #include "Inode.h"
15 #include "Utility.h"
16 
17 
#ifdef DEBUG
// Debug helper: verifies a node's integrity both when it comes into
// scope and again when it goes out of scope.
class NodeChecker {
	public:
		NodeChecker(const bplustree_node *node, int32 nodeSize, const char *text)
			:
			fNode(node),
			fSize(nodeSize),
			fText(text)
		{
			Check("integrity check failed on construction.");
		}

		~NodeChecker()
		{
			Check("integrity check failed on destruction.");
		}

		void
		Check(const char *message)
		{
			// only complain if the node fails its integrity check
			if (fNode->CheckIntegrity(fSize) >= B_OK)
				return;

			dprintf("%s: %s\n", fText, message);
		}

	private:
		const bplustree_node	*fNode;
		int32					fSize;
		const char				*fText;
};
#endif
48 
49 
50 // Node Caching for the BPlusTree class
51 //
52 // With write support, there is the need for a function that allocates new
53 // nodes by either returning empty nodes, or by growing the file's data stream
54 //
55 // !! The CachedNode class assumes that you have properly locked the stream
56 // !! before asking for nodes.
57 //
58 // Note: This code will fail if the block size is smaller than the node size!
59 // Since BFS supports block sizes of 1024 bytes or greater, and the node size
60 // is hard-coded to 1024 bytes, that's not an issue now.
61 
62 void
63 CachedNode::UnsetUnchanged(Transaction &transaction)
64 {
65 	if (fTree == NULL || fTree->fStream == NULL)
66 		return;
67 
68 	if (fNode != NULL) {
69 		void *cache = fTree->fStream->GetVolume()->BlockCache();
70 
71 		block_cache_set_dirty(cache, fBlockNumber, false, transaction.ID());
72 		block_cache_put(cache, fBlockNumber);
73 		fNode = NULL;
74 	}
75 }
76 
77 
78 void
79 CachedNode::Unset()
80 {
81 	if (fTree == NULL || fTree->fStream == NULL)
82 		return;
83 
84 	if (fNode != NULL) {
85 		block_cache_put(fTree->fStream->GetVolume()->BlockCache(), fBlockNumber);
86 		fNode = NULL;
87 	}
88 }
89 
90 
91 const bplustree_node *
92 CachedNode::SetTo(off_t offset, bool check)
93 {
94 	if (fTree == NULL || fTree->fStream == NULL) {
95 		REPORT_ERROR(B_BAD_VALUE);
96 		return NULL;
97 	}
98 
99 	Unset();
100 
101 	// You can only ask for nodes at valid positions - you can't
102 	// even access the b+tree header with this method (use SetToHeader()
103 	// instead)
104 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
105 		|| offset <= 0
106 		|| (offset % fTree->fNodeSize) != 0)
107 		return NULL;
108 
109 	if (InternalSetTo(NULL, offset) != NULL && check) {
110 		// sanity checks (links, all_key_count)
111 		if (!fTree->fHeader->CheckNode(fNode)) {
112 			FATAL(("invalid node (%p) read from offset %Ld, inode at %Ld\n",
113 				fNode, offset, fTree->fStream->ID()));
114 			return NULL;
115 		}
116 	}
117 	return fNode;
118 }
119 
120 
121 bplustree_node *
122 CachedNode::SetToWritable(Transaction &transaction, off_t offset, bool check)
123 {
124 	if (fTree == NULL || fTree->fStream == NULL) {
125 		REPORT_ERROR(B_BAD_VALUE);
126 		return NULL;
127 	}
128 
129 	Unset();
130 
131 	// You can only ask for nodes at valid positions - you can't
132 	// even access the b+tree header with this method (use SetToHeader()
133 	// instead)
134 	if (offset > fTree->fHeader->MaximumSize() - fTree->fNodeSize
135 		|| offset <= 0
136 		|| (offset % fTree->fNodeSize) != 0)
137 		return NULL;
138 
139 	if (InternalSetTo(&transaction, offset) != NULL && check) {
140 		// sanity checks (links, all_key_count)
141 		if (!fTree->fHeader->CheckNode(fNode)) {
142 			FATAL(("invalid node [%p] read from offset %Ld, inode at %Ld\n",
143 				fNode, offset, fTree->fStream->ID()));
144 			return NULL;
145 		}
146 	}
147 	return fNode;
148 }
149 
150 
151 bplustree_node *
152 CachedNode::MakeWritable(Transaction &transaction)
153 {
154 	if (fNode == NULL)
155 		return NULL;
156 
157 	if (block_cache_make_writable(transaction.GetVolume()->BlockCache(),
158 			fBlockNumber, transaction.ID()) == B_OK)
159 		return fNode;
160 
161 	return NULL;
162 }
163 
164 
165 bplustree_header *
166 CachedNode::MakeWritableHeader(Transaction &transaction)
167 {
168 	return (bplustree_header *)MakeWritable(transaction);
169 }
170 
171 
172 const bplustree_header *
173 CachedNode::SetToHeader()
174 {
175 	if (fTree == NULL || fTree->fStream == NULL) {
176 		REPORT_ERROR(B_BAD_VALUE);
177 		return NULL;
178 	}
179 
180 	Unset();
181 
182 	InternalSetTo(NULL, 0LL);
183 	return (bplustree_header *)fNode;
184 }
185 
186 
187 bplustree_header *
188 CachedNode::SetToWritableHeader(Transaction &transaction)
189 {
190 	if (fTree == NULL || fTree->fStream == NULL) {
191 		REPORT_ERROR(B_BAD_VALUE);
192 		return NULL;
193 	}
194 
195 	Unset();
196 
197 	InternalSetTo(&transaction, 0LL);
198 	return (bplustree_header *)fNode;
199 }
200 
201 
/*!	Maps the node at the given tree \a offset into memory via the block
	cache and returns a pointer to it; NULL is returned on failure.
	If \a transaction is not NULL, the block is retrieved writable and
	attached to that transaction (fWritable reflects which way it went).
*/
bplustree_node *
CachedNode::InternalSetTo(Transaction *transaction, off_t offset)
{
	fNode = NULL;
	fOffset = offset;

	// translate the tree offset into a disk block number: FindBlockRun()
	// yields the run containing "offset" and the file position where
	// that run starts
	off_t fileOffset;
	block_run run;
	if (offset < fTree->fStream->Size()
		&& fTree->fStream->FindBlockRun(offset, run, fileOffset) == B_OK) {
		Volume *volume = fTree->fStream->GetVolume();

		int32 blockOffset = (offset - fileOffset) / volume->BlockSize();
		fBlockNumber = volume->ToBlock(run) + blockOffset;
		uint8 *block;

		if (transaction != NULL) {
			block = (uint8 *)block_cache_get_writable(volume->BlockCache(),
				fBlockNumber, transaction->ID());
			fWritable = true;
		} else {
			block = (uint8 *)block_cache_get(volume->BlockCache(), fBlockNumber);
			fWritable = false;
		}

		if (block) {
			// the node is somewhere in that block... (confusing offset calculation)
			// the difference is the node's byte offset within the
			// retrieved block (blocks can be larger than nodes)
			fNode = (bplustree_node *)(block + offset -
						(fileOffset + (blockOffset << volume->BlockShift())));
		} else
			REPORT_ERROR(B_IO_ERROR);
	}
	return fNode;
}
236 
237 
/*!	Frees the node at \a offset: if it's the last node in the tree, the
	tree (and its stream) shrinks by one node; otherwise the node is
	prepended to the tree's free-node list.
	NOTE(review): fNode's links are modified below without an explicit
	MakeWritable() - presumably the caller already retrieved the node
	writable; confirm at the call sites.
*/
status_t
CachedNode::Free(Transaction &transaction, off_t offset)
{
	if (fTree == NULL || fTree->fStream == NULL || offset == BPLUSTREE_NULL)
		RETURN_ERROR(B_BAD_VALUE);

	// ToDo: scan the free nodes list and remove all nodes at the end
	// of the tree - perhaps that shouldn't be done every time that
	// function is called, perhaps it should be done when the directory
	// inode is closed or based on some calculation or whatever...

	bplustree_header *header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// if the node is the last one in the tree, we shrink
	// the tree and file size by one node
	off_t lastOffset = header->MaximumSize() - fTree->fNodeSize;
	if (offset == lastOffset) {
		status_t status = fTree->fStream->SetFileSize(transaction, lastOffset);
		if (status < B_OK)
			return status;

		header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(lastOffset);
		return B_OK;
	}

	// add the node to the free nodes list
	// (left_link points to the previous list head, overflow_link marks
	// the node as free)
	fNode->left_link = header->free_node_pointer;
	fNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_FREE);

	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64(offset);
	return B_OK;
}
272 
273 
/*!	Returns a writable, initialized node in \a _node (its tree offset in
	\a _offset): a node from the free list is recycled if one exists,
	otherwise the stream is grown by one node size.
*/
status_t
CachedNode::Allocate(Transaction &transaction, bplustree_node **_node,
	off_t *_offset)
{
	if (fTree == NULL || fTree->fHeader == NULL || fTree->fStream == NULL)
		RETURN_ERROR(B_BAD_VALUE);

	Unset();

	bplustree_header *header;
	status_t status;

	// if there are any free nodes, recycle them
	if (SetToWritable(transaction, fTree->fHeader->FreeNode(), false) != NULL) {
		header = fTree->fCachedHeader.MakeWritableHeader(transaction);
		if (header == NULL)
			return B_IO_ERROR;

		*_offset = header->FreeNode();
		*_node = fNode;

		// set new free node pointer
		// (the free list is chained through the nodes' left_link fields)
		header->free_node_pointer = fNode->left_link;
		fNode->Initialize();
		return B_OK;
	}

	// allocate space for a new node
	Inode *stream = fTree->fStream;
	if ((status = stream->Append(transaction, fTree->fNodeSize)) < B_OK)
		return status;

	header = fTree->fCachedHeader.MakeWritableHeader(transaction);
	if (header == NULL)
		return B_IO_ERROR;

	// the maximum_size has to be changed before the call to SetTo() - or
	// else it will fail because the requested node is out of bounds
	off_t offset = header->MaximumSize();
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		+ fTree->fNodeSize);

	if (SetToWritable(transaction, offset, false) != NULL) {
		fNode->Initialize();

		*_offset = offset;
		*_node = fNode;
		return B_OK;
	}

	// revert header size to old value
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(header->MaximumSize()
		- fTree->fNodeSize);
	RETURN_ERROR(B_ERROR);
}
329 
330 
331 //	#pragma mark -
332 
333 
334 BPlusTree::BPlusTree(Transaction &transaction, Inode *stream, int32 nodeSize)
335 	:
336 	fStream(NULL),
337 	fHeader(NULL),
338 	fCachedHeader(this)
339 {
340 	SetTo(transaction, stream);
341 }
342 
343 
/*!	Attaches the tree to an existing \a stream; see SetTo(Inode *).
	Check InitCheck() for the result.
*/
BPlusTree::BPlusTree(Inode *stream)
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this)
{
	SetTo(stream);
}
352 
353 
/*!	Creates an uninitialized tree object - a later SetTo() is required
	before use; InitCheck() reports B_NO_INIT until then.
*/
BPlusTree::BPlusTree()
	:
	fStream(NULL),
	fHeader(NULL),
	fCachedHeader(this),
	fNodeSize(BPLUSTREE_NODE_SIZE),
	fAllowDuplicates(true),
	fStatus(B_NO_INIT)
{
}
364 
365 
366 BPlusTree::~BPlusTree()
367 {
368 	// if there are any TreeIterators left, we need to stop them
369 	// (can happen when the tree's inode gets deleted while
370 	// traversing the tree - a TreeIterator doesn't lock the inode)
371 	if (fIteratorLock.Lock() < B_OK)
372 		return;
373 
374 	TreeIterator *iterator = NULL;
375 	while ((iterator = fIterators.Next(iterator)) != NULL)
376 		iterator->Stop();
377 
378 	fIteratorLock.Unlock();
379 }
380 
381 
/*!	Creates a new, empty B+Tree on the specified stream: the stream is
	grown to hold the header plus one root node (if needed), and both
	are initialized. fStatus is set to (and returned as) the result.
*/
status_t
BPlusTree::SetTo(Transaction &transaction, Inode *stream, int32 nodeSize)
{
	// initializes in-memory B+Tree

	fStream = stream;

	bplustree_header *header = fCachedHeader.SetToWritableHeader(transaction);
	if (header == NULL) {
		// allocate space for new header + node!
		fStatus = stream->SetFileSize(transaction, nodeSize * 2);
		if (fStatus < B_OK)
			RETURN_ERROR(fStatus);

		header = fCachedHeader.SetToWritableHeader(transaction);
		if (header == NULL)
			RETURN_ERROR(fStatus = B_ERROR);
	}

	fHeader = header;
	// duplicates are allowed for index directories that are not their
	// own parent, or when the stream carries the S_ALLOW_DUPS flag
	fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
			&& stream->BlockRun() != stream->Parent())
		|| (stream->Mode() & S_ALLOW_DUPS) != 0;

	fNodeSize = nodeSize;

	// initialize b+tree header (all fields are stored in BFS endianness)
	header->magic = HOST_ENDIAN_TO_BFS_INT32(BPLUSTREE_MAGIC);
	header->node_size = HOST_ENDIAN_TO_BFS_INT32(fNodeSize);
	header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
	header->data_type = HOST_ENDIAN_TO_BFS_INT32(ModeToKeyType(stream->Mode()));
	header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(nodeSize);
	header->free_node_pointer = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
	header->maximum_size = HOST_ENDIAN_TO_BFS_INT64(nodeSize * 2);

	// initialize b+tree root node
	CachedNode cached(this);
	cached.SetToWritable(transaction, header->RootNode(), false);
	if (cached.Node() == NULL)
		RETURN_ERROR(B_IO_ERROR);

	cached.Node()->Initialize();

	return fStatus = B_OK;
}
428 
429 
430 status_t
431 BPlusTree::SetTo(Inode *stream)
432 {
433 	if (stream == NULL)
434 		RETURN_ERROR(fStatus = B_BAD_VALUE);
435 
436 	// get on-disk B+Tree header
437 
438 	fCachedHeader.Unset();
439 	fStream = stream;
440 
441 	fHeader = fCachedHeader.SetToHeader();
442 	if (fHeader == NULL)
443 		RETURN_ERROR(fStatus = B_NO_INIT);
444 
445 	// is header valid?
446 
447 	if (fHeader->MaximumSize() != stream->Size()) {
448 		FATAL(("B+tree header size %Ld doesn't fit file size %Ld!\n",
449 			fHeader->MaximumSize(), stream->Size()));
450 		// we can't change the header since we don't have a transaction
451 		//fHeader->maximum_size = HOST_ENDIAN_TO_BFS_INT64(stream->Size());
452 	}
453 	if (fHeader->Magic() != BPLUSTREE_MAGIC
454 		|| (fHeader->RootNode() % fHeader->NodeSize()) != 0
455 		|| !fHeader->IsValidLink(fHeader->RootNode())
456 		|| !fHeader->IsValidLink(fHeader->FreeNode())) {
457 #ifdef DEBUG
458 		dump_bplustree_header(fHeader);
459 		dump_block((const char *)fHeader, 128);
460 #endif
461 		RETURN_ERROR(fStatus = B_BAD_DATA);
462 	}
463 
464 	fNodeSize = fHeader->NodeSize();
465 
466 	{
467 		uint32 toMode[] = {S_STR_INDEX, S_INT_INDEX, S_UINT_INDEX,
468 			S_LONG_LONG_INDEX, S_ULONG_LONG_INDEX, S_FLOAT_INDEX,
469 			S_DOUBLE_INDEX};
470 		uint32 mode = stream->Mode() & (S_STR_INDEX | S_INT_INDEX
471 			| S_UINT_INDEX | S_LONG_LONG_INDEX | S_ULONG_LONG_INDEX
472 			| S_FLOAT_INDEX | S_DOUBLE_INDEX);
473 
474 		if (fHeader->DataType() > BPLUSTREE_DOUBLE_TYPE
475 			|| (stream->Mode() & S_INDEX_DIR)
476 				&& toMode[fHeader->DataType()] != mode
477 			|| !stream->IsContainer()) {
478 			D(	dump_bplustree_header(fHeader);
479 				dump_inode(&stream->Node());
480 			);
481 			RETURN_ERROR(fStatus = B_BAD_TYPE);
482 		}
483 
484 		// although it's in stat.h, the S_ALLOW_DUPS flag is obviously unused
485 		// in the original BFS code - we will honour it nevertheless
486 		fAllowDuplicates = ((stream->Mode() & S_INDEX_DIR) == S_INDEX_DIR
487 			&& stream->BlockRun() != stream->Parent())
488 			|| (stream->Mode() & S_ALLOW_DUPS) != 0;
489 	}
490 
491 	CachedNode cached(this, fHeader->RootNode());
492 	RETURN_ERROR(fStatus = cached.Node() ? B_OK : B_BAD_DATA);
493 }
494 
495 
/*!	Returns the status of the last SetTo() call (B_NO_INIT if the tree
	was never initialized).
*/
status_t
BPlusTree::InitCheck()
{
	return fStatus;
}
501 
502 
503 int32
504 BPlusTree::TypeCodeToKeyType(type_code code)
505 {
506 	switch (code) {
507 		case B_STRING_TYPE:
508 			return BPLUSTREE_STRING_TYPE;
509 		case B_SSIZE_T_TYPE:
510 		case B_INT32_TYPE:
511 			return BPLUSTREE_INT32_TYPE;
512 		case B_SIZE_T_TYPE:
513 		case B_UINT32_TYPE:
514 			return BPLUSTREE_UINT32_TYPE;
515 		case B_OFF_T_TYPE:
516 		case B_INT64_TYPE:
517 			return BPLUSTREE_INT64_TYPE;
518 		case B_UINT64_TYPE:
519 			return BPLUSTREE_UINT64_TYPE;
520 		case B_FLOAT_TYPE:
521 			return BPLUSTREE_FLOAT_TYPE;
522 		case B_DOUBLE_TYPE:
523 			return BPLUSTREE_DOUBLE_TYPE;
524 	}
525 	return -1;
526 }
527 
528 
529 int32
530 BPlusTree::ModeToKeyType(mode_t mode)
531 {
532 	switch (mode & (S_STR_INDEX | S_INT_INDEX | S_UINT_INDEX | S_LONG_LONG_INDEX
533 				   | S_ULONG_LONG_INDEX | S_FLOAT_INDEX | S_DOUBLE_INDEX)) {
534 		case S_INT_INDEX:
535 			return BPLUSTREE_INT32_TYPE;
536 		case S_UINT_INDEX:
537 			return BPLUSTREE_UINT32_TYPE;
538 		case S_LONG_LONG_INDEX:
539 			return BPLUSTREE_INT64_TYPE;
540 		case S_ULONG_LONG_INDEX:
541 			return BPLUSTREE_UINT64_TYPE;
542 		case S_FLOAT_INDEX:
543 			return BPLUSTREE_FLOAT_TYPE;
544 		case S_DOUBLE_INDEX:
545 			return BPLUSTREE_DOUBLE_TYPE;
546 		case S_STR_INDEX:
547 		default:
548 			// default is for standard directories
549 			return BPLUSTREE_STRING_TYPE;
550 	}
551 }
552 
553 
554 //	#pragma mark -
555 
556 
557 void
558 BPlusTree::_UpdateIterators(off_t offset, off_t nextOffset, uint16 keyIndex,
559 	uint16 splitAt, int8 change)
560 {
561 	// Although every iterator which is affected by this update currently
562 	// waits on a semaphore, other iterators could be added/removed at
563 	// any time, so we need to protect this loop
564 	if (fIteratorLock.Lock() < B_OK)
565 		return;
566 
567 	TreeIterator *iterator = NULL;
568 	while ((iterator = fIterators.Next(iterator)) != NULL)
569 		iterator->Update(offset, nextOffset, keyIndex, splitAt, change);
570 
571 	fIteratorLock.Unlock();
572 }
573 
574 
/*!	Registers \a iterator with the tree so it gets notified about node
	changes (see _UpdateIterators()); guarded by fIteratorLock.
*/
void
BPlusTree::_AddIterator(TreeIterator *iterator)
{
	if (fIteratorLock.Lock() < B_OK)
		return;

	fIterators.Add(iterator);

	fIteratorLock.Unlock();
}
585 
586 
/*!	Unregisters \a iterator from the tree again; guarded by
	fIteratorLock.
*/
void
BPlusTree::_RemoveIterator(TreeIterator *iterator)
{
	if (fIteratorLock.Lock() < B_OK)
		return;

	fIterators.Remove(iterator);

	fIteratorLock.Unlock();
}
597 
598 
599 int32
600 BPlusTree::_CompareKeys(const void *key1, int keyLength1, const void *key2,
601 	int keyLength2)
602 {
603 	type_code type = 0;
604 	switch (fHeader->data_type) {
605 	    case BPLUSTREE_STRING_TYPE:
606 	    	type = B_STRING_TYPE;
607 	    	break;
608 		case BPLUSTREE_INT32_TYPE:
609 	    	type = B_INT32_TYPE;
610 	    	break;
611 		case BPLUSTREE_UINT32_TYPE:
612 	    	type = B_UINT32_TYPE;
613 	    	break;
614 		case BPLUSTREE_INT64_TYPE:
615 	    	type = B_INT64_TYPE;
616 	    	break;
617 		case BPLUSTREE_UINT64_TYPE:
618 	    	type = B_UINT64_TYPE;
619 	    	break;
620 		case BPLUSTREE_FLOAT_TYPE:
621 	    	type = B_FLOAT_TYPE;
622 	    	break;
623 		case BPLUSTREE_DOUBLE_TYPE:
624 	    	type = B_DOUBLE_TYPE;
625 	    	break;
626 	}
627    	return compareKeys(type, key1, keyLength1, key2, keyLength2);
628 }
629 
630 
/*!	Searches \a key in the given \a node via binary search.
	On an exact match, \a _index receives the key's position, \a _next
	the value/child link stored there, and B_OK is returned.
	Otherwise B_ENTRY_NOT_FOUND is returned, \a _index is the position
	where the key would be inserted, and \a _next refers to the child
	to descend into next.
*/
status_t
BPlusTree::_FindKey(const bplustree_node *node, const uint8 *key,
	uint16 keyLength, uint16 *_index, off_t *_next)
{
#ifdef DEBUG
		NodeChecker checker(node, fNodeSize, "find");
#endif

	// empty node - a zero count is zero in any endianness
	if (node->all_key_count == 0) {
		if (_index)
			*_index = 0;
		if (_next)
			*_next = node->OverflowLink();
		return B_ENTRY_NOT_FOUND;
	}

	off_t *values = node->Values();
	int16 saveIndex = -1;

	// binary search in the key array
	for (int16 first = 0, last = node->NumKeys() - 1; first <= last;) {
		uint16 i = (first + last) >> 1;

		uint16 searchLength;
		uint8 *searchKey = node->KeyAt(i, &searchLength);
		// make sure the key data actually lies within the node before
		// dereferencing it - panic on corruption
		if (searchKey + searchLength + sizeof(off_t) + sizeof(uint16)
				> (uint8 *)node + fNodeSize
			|| searchLength > BPLUSTREE_MAX_KEY_LENGTH) {
			fStream->GetVolume()->Panic();
			RETURN_ERROR(B_BAD_DATA);
		}

		int32 cmp = _CompareKeys(key, keyLength, searchKey, searchLength);
		if (cmp < 0) {
			last = i - 1;
			saveIndex = i;
		} else if (cmp > 0) {
			saveIndex = first = i + 1;
		} else {
			// exact match
			if (_index)
				*_index = i;
			if (_next)
				*_next = BFS_ENDIAN_TO_HOST_INT64(values[i]);
			return B_OK;
		}
	}

	// not found - saveIndex is the would-be insert position
	if (_index)
		*_index = saveIndex;
	if (_next) {
		// past the last key, the overflow link is the next child
		if (saveIndex == node->NumKeys())
			*_next = node->OverflowLink();
		else
			*_next = BFS_ENDIAN_TO_HOST_INT64(values[saveIndex]);
	}
	return B_ENTRY_NOT_FOUND;
}
688 
689 
/*!
	Prepares the stack to contain all nodes that were passed while
	following the key, from the root node to the leaf node that could
	or should contain that key.
*/
status_t
BPlusTree::_SeekDown(Stack<node_and_key> &stack, const uint8 *key,
	uint16 keyLength)
{
	// set the root node to begin with
	node_and_key nodeAndKey;
	nodeAndKey.nodeOffset = fHeader->RootNode();

	CachedNode cached(this);
	const bplustree_node *node;
	while ((node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
		// if we are already on leaf level, we're done
		if (node->OverflowLink() == BPLUSTREE_NULL) {
			// note that the keyIndex is not properly set here (but it's not
			// needed in the calling functions anyway)!
			nodeAndKey.keyIndex = 0;
			stack.Push(nodeAndKey);
			return B_OK;
		}

		off_t nextOffset;
		status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex,
			&nextOffset);

		// a node referring to itself would make us loop forever
		if (status == B_ENTRY_NOT_FOUND && nextOffset == nodeAndKey.nodeOffset)
			RETURN_ERROR(B_ERROR);

		// put the node offset & the correct keyIndex on the stack
		stack.Push(nodeAndKey);

		nodeAndKey.nodeOffset = nextOffset;
	}

	FATAL(("BPlusTree::SeekDown() could not open node %Ld\n",
		nodeAndKey.nodeOffset));
	return B_ERROR;
}
732 
733 
/*!
	This will find a free duplicate fragment in the given bplustree_node.
	The CachedNode will be set to the writable fragment on success.
	Returns B_ENTRY_NOT_FOUND if no fragment referenced by \a node has
	space left.
*/
status_t
BPlusTree::_FindFreeDuplicateFragment(Transaction &transaction,
	const bplustree_node *node, CachedNode &cached,
	off_t *_offset, bplustree_node **_fragment, uint32 *_index)
{
	off_t *values = node->Values();
	for (int32 i = 0; i < node->NumKeys(); i++) {
		off_t value = BFS_ENDIAN_TO_HOST_INT64(values[i]);

		// does the value link to a duplicate fragment?
		if (bplustree_node::LinkType(value) != BPLUSTREE_DUPLICATE_FRAGMENT)
			continue;

		// map the fragment node read-only first; it's only made writable
		// once we know it has a free slot
		const bplustree_node *fragment = cached.SetTo(
			bplustree_node::FragmentOffset(value), false);
		if (fragment == NULL) {
			FATAL(("Could not get duplicate fragment at %Ld\n", value));
			continue;
		}

		// see if there is some space left for us
		uint32 num = bplustree_node::MaxFragments(fNodeSize);
		for (uint32 j = 0; j < num; j++) {
			duplicate_array *array = fragment->FragmentAt(j);

			// a zero count marks an unused fragment slot
			if (array->count == 0) {
				// found an unused fragment
				*_fragment = cached.MakeWritable(transaction);
				if (*_fragment == NULL)
					return B_IO_ERROR;

				*_offset = bplustree_node::FragmentOffset(value);
				*_index = j;
				return B_OK;
			}
		}
	}
	return B_ENTRY_NOT_FOUND;
}
777 
778 
779 status_t
780 BPlusTree::_InsertDuplicate(Transaction &transaction, CachedNode &cached,
781 	const bplustree_node *node, uint16 index, off_t value)
782 {
783 	CachedNode cachedDuplicate(this);
784 	off_t *values = node->Values();
785 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
786 	status_t status;
787 	off_t offset;
788 
789 	if (bplustree_node::IsDuplicate(oldValue)) {
790 		// If it's a duplicate fragment, try to insert it into that, or if it
791 		// doesn't fit anymore, create a new duplicate node
792 
793 		if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
794 			bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
795 				bplustree_node::FragmentOffset(oldValue), false);
796 			if (duplicate == NULL)
797 				return B_IO_ERROR;
798 
799 			duplicate_array *array = duplicate->FragmentAt(
800 				bplustree_node::FragmentIndex(oldValue));
801 			if (array->count > NUM_FRAGMENT_VALUES
802 				|| array->count < 1) {
803 				FATAL(("insertDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n",
804 					(int)bplustree_node::FragmentIndex(oldValue),
805 					bplustree_node::FragmentOffset(oldValue),
806 					array->count));
807 				return B_BAD_DATA;
808 			}
809 
810 			if (array->count < NUM_FRAGMENT_VALUES) {
811 				array->Insert(value);
812 			} else {
813 				// test if the fragment will be empty if we remove this key's values
814 				if (duplicate->FragmentsUsed(fNodeSize) < 2) {
815 					// the node will be empty without our values, so let us
816 					// reuse it as a duplicate node
817 					offset = bplustree_node::FragmentOffset(oldValue);
818 
819 					memmove(duplicate->DuplicateArray(), array,
820 						(NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
821 					duplicate->left_link = duplicate->right_link
822 						= HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
823 
824 					array = duplicate->DuplicateArray();
825 					array->Insert(value);
826 				} else {
827 					// create a new duplicate node
828 
829 					cachedDuplicate.UnsetUnchanged(transaction);
830 						// the old duplicate has not been touched, so we can reuse it
831 
832 					bplustree_node *newDuplicate;
833 					status = cachedDuplicate.Allocate(transaction,
834 						&newDuplicate, &offset);
835 					if (status < B_OK)
836 						RETURN_ERROR(status);
837 
838 					// copy the array from the fragment node to the duplicate node
839 					// and free the old entry (by zero'ing all values)
840 					newDuplicate->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
841 						array->count);
842 					memcpy(&newDuplicate->all_key_count, &array->values[0],
843 						array->count * sizeof(off_t));
844 					memset(array, 0, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
845 
846 					array = newDuplicate->DuplicateArray();
847 					array->Insert(value);
848 				}
849 
850 				// update the main pointer to link to a duplicate node
851 				if (cached.MakeWritable(transaction) == NULL)
852 					return B_IO_ERROR;
853 
854 				values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
855 					BPLUSTREE_DUPLICATE_NODE, offset));
856 			}
857 
858 			return B_OK;
859 		}
860 
861 		// Put the value into a dedicated duplicate node
862 
863 		// search for free space in the duplicate nodes of that key
864 		duplicate_array *array;
865 		const bplustree_node *duplicate;
866 		off_t duplicateOffset;
867 		do {
868 			duplicateOffset = bplustree_node::FragmentOffset(oldValue);
869 			duplicate = cachedDuplicate.SetTo(duplicateOffset, false);
870 			if (duplicate == NULL)
871 				return B_IO_ERROR;
872 
873 			array = duplicate->DuplicateArray();
874 			if (array->count > NUM_DUPLICATE_VALUES || array->count < 0) {
875 				FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
876 					duplicateOffset, array->count));
877 				return B_BAD_DATA;
878 			}
879 		} while (array->count >= NUM_DUPLICATE_VALUES
880 				&& (oldValue = duplicate->RightLink()) != BPLUSTREE_NULL);
881 
882 		bplustree_node *writableDuplicate = cachedDuplicate.MakeWritable(transaction);
883 		if (writableDuplicate == NULL)
884 			return B_IO_ERROR;
885 
886 		if (array->count < NUM_DUPLICATE_VALUES) {
887 			array = writableDuplicate->DuplicateArray();
888 			array->Insert(value);
889 		} else {
890 			// no space left - add a new duplicate node
891 
892 			bplustree_node *newDuplicate;
893 			status = cachedDuplicate.Allocate(transaction, &newDuplicate, &offset);
894 			if (status < B_OK)
895 				RETURN_ERROR(status);
896 
897 			// link the two nodes together
898 			writableDuplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(offset);
899 			newDuplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(duplicateOffset);
900 
901 			array = newDuplicate->DuplicateArray();
902 			array->count = 0;
903 			array->Insert(value);
904 		}
905 		return B_OK;
906 	}
907 
908 	// Search for a free duplicate fragment or create a new one
909 	// to insert the duplicate value into
910 
911 	uint32 fragmentIndex = 0;
912 	bplustree_node *fragment;
913 	if (_FindFreeDuplicateFragment(transaction, node, cachedDuplicate,
914 			&offset, &fragment, &fragmentIndex) != B_OK) {
915 		// allocate a new duplicate fragment node
916 		if ((status = cachedDuplicate.Allocate(transaction, &fragment,
917 				&offset)) < B_OK)
918 			RETURN_ERROR(status);
919 
920 		memset(fragment, 0, fNodeSize);
921 	}
922 
923 	duplicate_array *array = fragment->FragmentAt(fragmentIndex);
924 	array->Insert(oldValue);
925 	array->Insert(value);
926 
927 	if (cached.MakeWritable(transaction) == NULL)
928 		return B_IO_ERROR;
929 
930 	values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
931 		BPLUSTREE_DUPLICATE_FRAGMENT, offset, fragmentIndex));
932 
933 	return B_OK;
934 }
935 
936 
/*!	Inserts the given \a key with its \a value at position \a index into
	\a node (which the caller must have made writable, with enough free
	space - no space check is done here).
*/
void
BPlusTree::_InsertKey(bplustree_node *node, uint16 index, uint8 *key,
	uint16 keyLength, off_t value)
{
	// should never happen, but who knows?
	if (index > node->NumKeys())
		return;

	off_t *values = node->Values();
	uint16 *keyLengths = node->KeyLengths();
	uint8 *keys = node->Keys();

	// bump count and total key length first - the positions returned by
	// Values()/KeyLengths() depend on them, so the arrays move
	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() + 1);
	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(node->AllKeyLength()
		+ keyLength);

	off_t *newValues = node->Values();
	uint16 *newKeyLengths = node->KeyLengths();

	// move values and copy new value into them
	memmove(newValues + index + 1, values + index,
		sizeof(off_t) * (node->NumKeys() - 1 - index));
	memmove(newValues, values, sizeof(off_t) * index);

	newValues[index] = HOST_ENDIAN_TO_BFS_INT64(value);

	// move and update key length index
	// (the lengths are cumulative, so every entry after the new key is
	// shifted by keyLength)
	for (uint16 i = node->NumKeys(); i-- > index + 1;) {
		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i - 1]) + keyLength);
	}
	memmove(newKeyLengths, keyLengths, sizeof(uint16) * index);

	// keyStart is the byte offset where the new key's data begins
	int32 keyStart;
	newKeyLengths[index] = HOST_ENDIAN_TO_BFS_INT16(keyLength
		+ (keyStart = index > 0
			? BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index - 1]) : 0));

	// move keys and copy new key into them
	uint16 length = BFS_ENDIAN_TO_HOST_INT16(newKeyLengths[index]);
	int32 size = node->AllKeyLength() - length;
	if (size > 0)
		memmove(keys + length, keys + length - keyLength, size);

	memcpy(keys + keyStart, key, keyLength);
}
983 
984 
985 /*!
986 	Splits the \a node into two halves - the other half will be put into \a other.
987 	It also takes care to create a new overflow link if the node to split is an
988 	index node.
989 */
990 status_t
991 BPlusTree::_SplitNode(bplustree_node *node, off_t nodeOffset,
992 	bplustree_node *other, off_t otherOffset, uint16 *_keyIndex, uint8 *key,
993 	uint16 *_keyLength, off_t *_value)
994 {
995 	if (*_keyIndex > node->NumKeys() + 1)
996 		return B_BAD_VALUE;
997 
998 	uint16 *inKeyLengths = node->KeyLengths();
999 	off_t *inKeyValues = node->Values();
1000 	uint8 *inKeys = node->Keys();
1001 	uint8 *outKeys = other->Keys();
1002 	int32 keyIndex = *_keyIndex;	// can become less than zero!
1003 
1004 	if (keyIndex > node->NumKeys()) {
1005 		FATAL(("key index out of bounds: %ld, num keys: %u\n", keyIndex,
1006 			node->NumKeys()));
1007 		return B_BAD_VALUE;
1008 	}
1009 
1010 	// how many keys will fit in one (half) page?
1011 	// that loop will find the answer to this question and
1012 	// change the key lengths indices for their new home
1013 
1014 	// "bytes" is the number of bytes written for the new key,
1015 	// "bytesBefore" are the bytes before that key
1016 	// "bytesAfter" are the bytes after the new key, if any
1017 	int32 bytes = 0, bytesBefore = 0, bytesAfter = 0;
1018 
1019 	size_t size = fNodeSize >> 1;
1020 	int32 out, in;
1021 	for (in = out = 0; in < node->NumKeys() + 1;) {
1022 		if (!bytes) {
1023 			bytesBefore = in > 0
1024 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1025 		}
1026 
1027 		if (in == keyIndex && !bytes) {
1028 			bytes = *_keyLength;
1029 		} else {
1030 			if (keyIndex < out) {
1031 				bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1032 					- bytesBefore;
1033 			}
1034 
1035 			in++;
1036 		}
1037 		out++;
1038 
1039 		if (round_up(sizeof(bplustree_node) + bytesBefore + bytesAfter + bytes)
1040 				+ out * (sizeof(uint16) + sizeof(off_t)) >= size) {
1041 			// we have found the number of keys in the new node!
1042 			break;
1043 		}
1044 	}
1045 
1046 	// if the new key was not inserted, set the length of the keys
1047 	// that can be copied directly
1048 	if (keyIndex >= out && in > 0)
1049 		bytesBefore = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]);
1050 
1051 	if (bytesBefore < 0 || bytesAfter < 0)
1052 		return B_BAD_DATA;
1053 
1054 	other->left_link = node->left_link;
1055 	other->right_link = HOST_ENDIAN_TO_BFS_INT64(nodeOffset);
1056 	other->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1057 		+ bytesAfter);
1058 	other->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1059 
1060 	uint16 *outKeyLengths = other->KeyLengths();
1061 	off_t *outKeyValues = other->Values();
1062 	int32 keys = out > keyIndex ? keyIndex : out;
1063 
1064 	if (bytesBefore) {
1065 		// copy the keys
1066 		memcpy(outKeys, inKeys, bytesBefore);
1067 		memcpy(outKeyLengths, inKeyLengths, keys * sizeof(uint16));
1068 		memcpy(outKeyValues, inKeyValues, keys * sizeof(off_t));
1069 	}
1070 	if (bytes) {
1071 		// copy the newly inserted key
1072 		memcpy(outKeys + bytesBefore, key, bytes);
1073 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1074 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1075 
1076 		if (bytesAfter) {
1077 			// copy the keys after the new key
1078 			memcpy(outKeys + bytesBefore + bytes, inKeys + bytesBefore,
1079 				bytesAfter);
1080 			keys = out - keyIndex - 1;
1081 			for (int32 i = 0;i < keys;i++) {
1082 				outKeyLengths[keyIndex + i + 1] = HOST_ENDIAN_TO_BFS_INT16(
1083 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[keyIndex + i]) + bytes);
1084 			}
1085 			memcpy(outKeyValues + keyIndex + 1, inKeyValues + keyIndex,
1086 				keys * sizeof(off_t));
1087 		}
1088 	}
1089 
1090 	// if the new key was already inserted, we shouldn't use it again
1091 	if (in != out)
1092 		keyIndex--;
1093 
1094 	int32 total = bytesBefore + bytesAfter;
1095 
1096 	// these variables are for the key that will be returned
1097 	// to the parent node
1098 	uint8 *newKey = NULL;
1099 	uint16 newLength;
1100 	bool newAllocated = false;
1101 
1102 	// If we have split an index node, we have to drop the first key
1103 	// of the next node (which can also be the new key to insert).
1104 	// The dropped key is also the one which has to be inserted in
1105 	// the parent node, so we will set the "newKey" already here.
1106 	if (node->OverflowLink() != BPLUSTREE_NULL) {
1107 		if (in == keyIndex) {
1108 			newKey = key;
1109 			newLength = *_keyLength;
1110 
1111 			other->overflow_link = HOST_ENDIAN_TO_BFS_INT64(*_value);
1112 			keyIndex--;
1113 		} else {
1114 			// If a key is dropped (is not the new key), we have to copy
1115 			// it, because it would be lost if not.
1116 			uint8 *droppedKey = node->KeyAt(in, &newLength);
1117 			if (droppedKey + newLength + sizeof(off_t) + sizeof(uint16)
1118 					> (uint8 *)node + fNodeSize
1119 				|| newLength > BPLUSTREE_MAX_KEY_LENGTH) {
1120 				fStream->GetVolume()->Panic();
1121 				RETURN_ERROR(B_BAD_DATA);
1122 			}
1123 			newKey = (uint8 *)malloc(newLength);
1124 			if (newKey == NULL)
1125 				return B_NO_MEMORY;
1126 
1127 			newAllocated = true;
1128 			memcpy(newKey, droppedKey, newLength);
1129 
1130 			other->overflow_link = inKeyValues[in];
1131 			total = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in++]);
1132 		}
1133 	}
1134 
1135 	// and now the same game for the other page and the rest of the keys
1136 	// (but with memmove() instead of memcpy(), because they may overlap)
1137 
1138 	bytesBefore = bytesAfter = bytes = 0;
1139 	out = 0;
1140 	int32 skip = in;
1141 	while (in < node->NumKeys() + 1) {
1142 		if (in == keyIndex && !bytes) {
1143 			// it's enough to set bytesBefore once here, because we do
1144 			// not need to know the exact length of all keys in this
1145 			// loop
1146 			bytesBefore = in > skip
1147 				? BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in - 1]) : 0;
1148 			bytes = *_keyLength;
1149 			out++;
1150 		} else {
1151 			if (in < node->NumKeys()) {
1152 				inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1153 					BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) - total);
1154 
1155 				if (bytes) {
1156 					inKeyLengths[in] = HOST_ENDIAN_TO_BFS_INT16(
1157 						BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in]) + bytes);
1158 
1159 					bytesAfter = BFS_ENDIAN_TO_HOST_INT16(inKeyLengths[in])
1160 						- bytesBefore - bytes;
1161 				}
1162 				out++;
1163 			}
1164 			in++;
1165 		}
1166 	}
1167 
1168 	// adjust the byte counts (since we were a bit lazy in the loop)
1169 	if (keyIndex < skip)
1170 		bytesBefore = node->AllKeyLength() - total;
1171 
1172 	if (bytesBefore < 0 || bytesAfter < 0) {
1173 		if (newAllocated)
1174 			free(newKey);
1175 		return B_BAD_DATA;
1176 	}
1177 
1178 	node->left_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1179 		// right link, and overflow link can stay the same
1180 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore
1181 		+ bytesAfter);
1182 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(out);
1183 
1184 	// array positions have changed
1185 	outKeyLengths = node->KeyLengths();
1186 	outKeyValues = node->Values();
1187 
1188 	// move the keys in the old node: the order is important here,
1189 	// because we don't want to overwrite any contents
1190 
1191 	keys = keyIndex <= skip ? out : keyIndex - skip;
1192 	keyIndex -= skip;
1193 	in = out - keyIndex - 1;
1194 		// Note: keyIndex and in will contain invalid values when the new key
1195 		// went to the other node. But in this case bytes and bytesAfter are
1196 		// 0 and subsequently we never use keyIndex and in.
1197 
1198 	if (bytesBefore)
1199 		memmove(inKeys, inKeys + total, bytesBefore);
1200 	if (bytesAfter) {
1201 		memmove(inKeys + bytesBefore + bytes, inKeys + total + bytesBefore,
1202 			bytesAfter);
1203 	}
1204 
1205 	if (bytesBefore)
1206 		memmove(outKeyLengths, inKeyLengths + skip, keys * sizeof(uint16));
1207 	if (bytesAfter) {
1208 		// if byteAfter is > 0, keyIndex is larger than skip
1209 		memmove(outKeyLengths + keyIndex + 1, inKeyLengths + skip + keyIndex,
1210 			in * sizeof(uint16));
1211 	}
1212 
1213 	if (bytesBefore)
1214 		memmove(outKeyValues, inKeyValues + skip, keys * sizeof(off_t));
1215 	if (bytesAfter) {
1216 		memmove(outKeyValues + keyIndex + 1, inKeyValues + skip + keyIndex,
1217 			in * sizeof(off_t));
1218 	}
1219 
1220 	if (bytes) {
1221 		// finally, copy the newly inserted key (don't overwrite anything)
1222 		memcpy(inKeys + bytesBefore, key, bytes);
1223 		outKeyLengths[keyIndex] = HOST_ENDIAN_TO_BFS_INT16(bytes + bytesBefore);
1224 		outKeyValues[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(*_value);
1225 	}
1226 
1227 	// Prepare the key that will be inserted in the parent node which
1228 	// is either the dropped key or the last of the other node.
1229 	// If it's the dropped key, "newKey" was already set earlier.
1230 
1231 	if (newKey == NULL)
1232 		newKey = other->KeyAt(other->NumKeys() - 1, &newLength);
1233 
1234 	memcpy(key, newKey, newLength);
1235 	*_keyLength = newLength;
1236 	*_value = otherOffset;
1237 
1238 	if (newAllocated)
1239 		free(newKey);
1240 
1241 	return B_OK;
1242 }
1243 
1244 
1245 /*!
1246 	This inserts a key into the tree. The changes made to the tree will
1247 	all be part of the \a transaction.
1248 */
1249 status_t
1250 BPlusTree::Insert(Transaction &transaction, const uint8 *key, uint16 keyLength,
1251 	off_t value)
1252 {
1253 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1254 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1255 		RETURN_ERROR(B_BAD_VALUE);
1256 #ifdef DEBUG
1257 	if (value < 0)
1258 		panic("tried to insert invalid value %Ld!\n", value);
1259 #endif
1260 
1261 	// lock access to stream
1262 	WriteLocked locked(fStream->Lock());
1263 
1264 	Stack<node_and_key> stack;
1265 	if (_SeekDown(stack, key, keyLength) != B_OK)
1266 		RETURN_ERROR(B_ERROR);
1267 
1268 	uint8 keyBuffer[BPLUSTREE_MAX_KEY_LENGTH + 1];
1269 
1270 	memcpy(keyBuffer, key, keyLength);
1271 	keyBuffer[keyLength] = 0;
1272 
1273 	node_and_key nodeAndKey;
1274 	const bplustree_node *node;
1275 
1276 	CachedNode cached(this);
1277 	while (stack.Pop(&nodeAndKey)
1278 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1279 #ifdef DEBUG
1280 		NodeChecker checker(node, fNodeSize, "insert");
1281 #endif
1282 		if (node->IsLeaf()) {
1283 			// first round, check for duplicate entries
1284 			status_t status = _FindKey(node, key, keyLength,
1285 				&nodeAndKey.keyIndex);
1286 
1287 			// is this a duplicate entry?
1288 			if (status == B_OK) {
1289 				if (fAllowDuplicates) {
1290 					status = _InsertDuplicate(transaction, cached, node,
1291 						nodeAndKey.keyIndex, value);
1292 					if (status != B_OK)
1293 						RETURN_ERROR(status);
1294 					return B_OK;
1295 				}
1296 
1297 				RETURN_ERROR(B_NAME_IN_USE);
1298 			}
1299 		}
1300 
1301 		bplustree_node *writableNode = cached.MakeWritable(transaction);
1302 		if (writableNode == NULL)
1303 			return B_IO_ERROR;
1304 
1305 		// is the node big enough to hold the pair?
1306 		if (int32(round_up(sizeof(bplustree_node)
1307 				+ writableNode->AllKeyLength() + keyLength)
1308 				+ (writableNode->NumKeys() + 1) * (sizeof(uint16)
1309 				+ sizeof(off_t))) < fNodeSize) {
1310 			_InsertKey(writableNode, nodeAndKey.keyIndex,
1311 				keyBuffer, keyLength, value);
1312 			_UpdateIterators(nodeAndKey.nodeOffset, BPLUSTREE_NULL,
1313 				nodeAndKey.keyIndex, 0, 1);
1314 
1315 			return B_OK;
1316 		} else {
1317 			CachedNode cachedNewRoot(this);
1318 			CachedNode cachedOther(this);
1319 
1320 			// do we need to allocate a new root node? if so, then do
1321 			// it now
1322 			off_t newRoot = BPLUSTREE_NULL;
1323 			if (nodeAndKey.nodeOffset == fHeader->RootNode()) {
1324 				bplustree_node *root;
1325 				status_t status = cachedNewRoot.Allocate(transaction, &root,
1326 					&newRoot);
1327 				if (status < B_OK) {
1328 					// The tree is most likely corrupted!
1329 					// But it's still sane at leaf level - we could set
1330 					// a flag in the header that forces the tree to be
1331 					// rebuild next time...
1332 					// But since we will have journaling, that's not a big
1333 					// problem anyway.
1334 					RETURN_ERROR(status);
1335 				}
1336 			}
1337 
1338 			// reserve space for the other node
1339 			bplustree_node *other;
1340 			off_t otherOffset;
1341 			status_t status = cachedOther.Allocate(transaction, &other,
1342 				&otherOffset);
1343 			if (status < B_OK) {
1344 				cachedNewRoot.Free(transaction, newRoot);
1345 				RETURN_ERROR(status);
1346 			}
1347 
1348 			if (_SplitNode(writableNode, nodeAndKey.nodeOffset, other,
1349 					otherOffset, &nodeAndKey.keyIndex, keyBuffer, &keyLength,
1350 					&value) < B_OK) {
1351 				// free root node & other node here
1352 				cachedOther.Free(transaction, otherOffset);
1353 				cachedNewRoot.Free(transaction, newRoot);
1354 
1355 				RETURN_ERROR(B_ERROR);
1356 			}
1357 #ifdef DEBUG
1358 			checker.Check("insert split");
1359 			NodeChecker otherChecker(other, fNodeSize, "insert split other");
1360 #endif
1361 
1362 			_UpdateIterators(nodeAndKey.nodeOffset, otherOffset,
1363 				nodeAndKey.keyIndex, writableNode->NumKeys(), 1);
1364 
1365 			// update the right link of the node in the left of the new node
1366 			if ((other = cachedOther.SetToWritable(transaction,
1367 					other->LeftLink())) != NULL) {
1368 				other->right_link = HOST_ENDIAN_TO_BFS_INT64(otherOffset);
1369 			}
1370 
1371 			// create a new root if necessary
1372 			if (newRoot != BPLUSTREE_NULL) {
1373 				bplustree_node *root = cachedNewRoot.Node();
1374 
1375 				_InsertKey(root, 0, keyBuffer, keyLength,
1376 					writableNode->LeftLink());
1377 				root->overflow_link = HOST_ENDIAN_TO_BFS_INT64(
1378 					nodeAndKey.nodeOffset);
1379 
1380 				bplustree_header *header
1381 					= fCachedHeader.MakeWritableHeader(transaction);
1382 				if (header == NULL)
1383 					return B_IO_ERROR;
1384 
1385 				// finally, update header to point to the new root
1386 				header->root_node_pointer = HOST_ENDIAN_TO_BFS_INT64(newRoot);
1387 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(
1388 					header->MaxNumberOfLevels() + 1);
1389 
1390 				return B_OK;
1391 			}
1392 		}
1393 	}
1394 	RETURN_ERROR(B_ERROR);
1395 }
1396 
1397 
1398 /*!
1399 	Removes the duplicate index/value pair from the tree.
1400 	It's part of the private tree interface.
1401 */
1402 status_t
1403 BPlusTree::_RemoveDuplicate(Transaction &transaction,
1404 	const bplustree_node *node, CachedNode &cached, uint16 index,
1405 	off_t value)
1406 {
1407 	off_t *values = node->Values();
1408 	off_t oldValue = BFS_ENDIAN_TO_HOST_INT64(values[index]);
1409 
1410 	CachedNode cachedDuplicate(this);
1411 	off_t duplicateOffset = bplustree_node::FragmentOffset(oldValue);
1412 	bplustree_node *duplicate = cachedDuplicate.SetToWritable(transaction,
1413 		duplicateOffset, false);
1414 	if (duplicate == NULL)
1415 		return B_IO_ERROR;
1416 
1417 	// if it's a duplicate fragment, remove the entry from there
1418 	if (bplustree_node::LinkType(oldValue) == BPLUSTREE_DUPLICATE_FRAGMENT) {
1419 		duplicate_array *array = duplicate->FragmentAt(
1420 			bplustree_node::FragmentIndex(oldValue));
1421 
1422 		if (array->count > NUM_FRAGMENT_VALUES
1423 			|| array->count < 1) {
1424 			FATAL(("removeDuplicate: Invalid array[%d] size in fragment %Ld == %Ld!\n",
1425 				(int)bplustree_node::FragmentIndex(oldValue), duplicateOffset, array->count));
1426 			return B_BAD_DATA;
1427 		}
1428 		if (!array->Remove(value)) {
1429 			FATAL(("Oh no, value %Ld not found in fragments of node %Ld...\n",
1430 				value, duplicateOffset));
1431 			return B_ENTRY_NOT_FOUND;
1432 		}
1433 
1434 		// remove the array from the fragment node if it is empty
1435 		if (array->count == 1) {
1436 			// set the link to the remaining value
1437 			if (cached.MakeWritable(transaction) == NULL)
1438 				return B_IO_ERROR;
1439 
1440 			values[index] = array->values[0];
1441 
1442 			// Remove the whole fragment node, if this was the only array,
1443 			// otherwise free just the array
1444 			if (duplicate->FragmentsUsed(fNodeSize) == 1) {
1445 				status_t status = cachedDuplicate.Free(transaction, duplicateOffset);
1446 				if (status < B_OK)
1447 					return status;
1448 			} else
1449 				array->count = 0;
1450 		}
1451 		return B_OK;
1452 	}
1453 
1454 	// Remove value from a duplicate node!
1455 
1456 	duplicate_array *array = NULL;
1457 
1458 	if (duplicate->LeftLink() != BPLUSTREE_NULL) {
1459 		FATAL(("invalid duplicate node: first left link points to %Ld!\n",
1460 			duplicate->LeftLink()));
1461 		return B_BAD_DATA;
1462 	}
1463 
1464 	// Search the duplicate nodes until the entry could be found (and removed)
1465 	while (duplicate != NULL) {
1466 		array = duplicate->DuplicateArray();
1467 		if (array->count > NUM_DUPLICATE_VALUES
1468 			|| array->count < 0) {
1469 			FATAL(("removeDuplicate: Invalid array size in duplicate %Ld == %Ld!\n",
1470 				duplicateOffset, array->count));
1471 			return B_BAD_DATA;
1472 		}
1473 
1474 		if (array->Remove(value))
1475 			break;
1476 
1477 		if ((duplicateOffset = duplicate->RightLink()) == BPLUSTREE_NULL)
1478 			RETURN_ERROR(B_ENTRY_NOT_FOUND);
1479 
1480 		cachedDuplicate.UnsetUnchanged(transaction);
1481 		duplicate = cachedDuplicate.SetToWritable(transaction, duplicateOffset, false);
1482 	}
1483 	if (duplicate == NULL)
1484 		RETURN_ERROR(B_IO_ERROR);
1485 
1486 	// The entry got removed from the duplicate node, but we might want to free
1487 	// it now in case it's empty
1488 
1489 	while (true) {
1490 		off_t left = duplicate->LeftLink();
1491 		off_t right = duplicate->RightLink();
1492 		bool isLast = left == BPLUSTREE_NULL && right == BPLUSTREE_NULL;
1493 
1494 		if ((isLast && array->count == 1) || array->count == 0) {
1495 			// Free empty duplicate page, link their siblings together, and
1496 			// update the duplicate link if needed (ie. when we either remove
1497 			// the last duplicate node or have a new first one)
1498 
1499 			if (left == BPLUSTREE_NULL) {
1500 				// the duplicate link points to us
1501 				if (cached.MakeWritable(transaction) == NULL)
1502 					return B_IO_ERROR;
1503 
1504 				if (array->count == 1) {
1505 					// this is the last node, and there is only one value left;
1506 					// replace the duplicate link with that value, it's no duplicate
1507 					// anymore
1508 					values[index] = array->values[0];
1509 				} else {
1510 					// move the duplicate link to the next node
1511 					values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1512 						BPLUSTREE_DUPLICATE_NODE, right));
1513 				}
1514 			}
1515 
1516 			status_t status;
1517 			if ((status = cachedDuplicate.Free(transaction, duplicateOffset)) < B_OK)
1518 				return status;
1519 
1520 			if (left != BPLUSTREE_NULL
1521 				&& (duplicate = cachedDuplicate.SetToWritable(transaction, left, false)) != NULL) {
1522 				duplicate->right_link = HOST_ENDIAN_TO_BFS_INT64(right);
1523 
1524 				// If the next node is the last node, we need to free that node
1525 				// and convert the duplicate entry back into a normal entry
1526 				array = duplicate->DuplicateArray();
1527 				if (right == BPLUSTREE_NULL && duplicate->LeftLink() == BPLUSTREE_NULL
1528 					&& array->count <= NUM_FRAGMENT_VALUES) {
1529 					duplicateOffset = left;
1530 					continue;
1531 				}
1532 			}
1533 			if (right != BPLUSTREE_NULL
1534 				&& (duplicate = cachedDuplicate.SetToWritable(transaction, right, false)) != NULL) {
1535 				duplicate->left_link = HOST_ENDIAN_TO_BFS_INT64(left);
1536 
1537 				// Again, we may need to turn the duplicate entry back into a normal entry
1538 				array = duplicate->DuplicateArray();
1539 				if (left == BPLUSTREE_NULL && duplicate->RightLink() == BPLUSTREE_NULL
1540 					&& array->count <= NUM_FRAGMENT_VALUES) {
1541 					duplicateOffset = right;
1542 					continue;
1543 				}
1544 			}
1545 			return B_OK;
1546 		} else if (isLast && array->count <= NUM_FRAGMENT_VALUES) {
1547 			// If the number of entries fits in a duplicate fragment, then
1548 			// either find a free fragment node, or convert this node to a
1549 			// fragment node.
1550 			CachedNode cachedOther(this);
1551 
1552 			bplustree_node *fragment = NULL;
1553 			uint32 fragmentIndex = 0;
1554 			off_t offset;
1555 			if (_FindFreeDuplicateFragment(transaction, node, cachedOther,
1556 					&offset, &fragment, &fragmentIndex) == B_OK) {
1557 				// move to other node
1558 				duplicate_array *target = fragment->FragmentAt(fragmentIndex);
1559 				memcpy(target, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1560 
1561 				cachedDuplicate.Free(transaction, duplicateOffset);
1562 				duplicateOffset = offset;
1563 			} else {
1564 				// convert node
1565 				memmove(duplicate, array, (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1566 				memset((off_t *)duplicate + NUM_FRAGMENT_VALUES + 1, 0,
1567 					fNodeSize - (NUM_FRAGMENT_VALUES + 1) * sizeof(off_t));
1568 			}
1569 
1570 			if (cached.MakeWritable(transaction) == NULL)
1571 				return B_IO_ERROR;
1572 
1573 			values[index] = HOST_ENDIAN_TO_BFS_INT64(bplustree_node::MakeLink(
1574 				BPLUSTREE_DUPLICATE_FRAGMENT, duplicateOffset, fragmentIndex));
1575 		}
1576 		return B_OK;
1577 	}
1578 }
1579 
1580 
1581 /*!
1582 	Removes the key with the given index from the specified node.
1583 	Since it has to get the key from the node anyway (to obtain it's
1584 	pointer), it's not needed to pass the key & its length, although
1585 	the calling method (BPlusTree::Remove()) have this data.
1586 */
1587 void
1588 BPlusTree::_RemoveKey(bplustree_node *node, uint16 index)
1589 {
1590 	// should never happen, but who knows?
1591 	if (index > node->NumKeys() && node->NumKeys() > 0) {
1592 		FATAL(("Asked me to remove key outer limits: %u\n", index));
1593 		return;
1594 	}
1595 
1596 	off_t *values = node->Values();
1597 
1598 	// if we would have to drop the overflow link, drop
1599 	// the last key instead and update the overflow link
1600 	// to the value of that one
1601 	if (!node->IsLeaf() && index == node->NumKeys())
1602 		node->overflow_link = values[--index];
1603 
1604 	uint16 length;
1605 	uint8 *key = node->KeyAt(index, &length);
1606 	if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fNodeSize
1607 		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
1608 		FATAL(("Key length to long: %s, %u (inode at %d,%u)\n", key, length,
1609 			(int)fStream->BlockRun().allocation_group, fStream->BlockRun().start));
1610 		fStream->GetVolume()->Panic();
1611 		return;
1612 	}
1613 
1614 	uint16 *keyLengths = node->KeyLengths();
1615 	uint8 *keys = node->Keys();
1616 
1617 	node->all_key_count = HOST_ENDIAN_TO_BFS_INT16(node->NumKeys() - 1);
1618 	node->all_key_length = HOST_ENDIAN_TO_BFS_INT64(node->AllKeyLength() - length);
1619 
1620 	off_t *newValues = node->Values();
1621 	uint16 *newKeyLengths = node->KeyLengths();
1622 
1623 	// move key data
1624 	memmove(key, key + length, node->AllKeyLength() - (key - keys));
1625 
1626 	// move and update key lengths
1627 	if (index > 0 && newKeyLengths != keyLengths)
1628 		memmove(newKeyLengths, keyLengths, index * sizeof(uint16));
1629 	for (uint16 i = index; i < node->NumKeys(); i++) {
1630 		newKeyLengths[i] = HOST_ENDIAN_TO_BFS_INT16(
1631 			BFS_ENDIAN_TO_HOST_INT16(keyLengths[i + 1]) - length);
1632 	}
1633 
1634 	// move values
1635 	if (index > 0)
1636 		memmove(newValues, values, index * sizeof(off_t));
1637 	if (node->NumKeys() > index)
1638 		memmove(newValues + index, values + index + 1, (node->NumKeys() - index) * sizeof(off_t));
1639 }
1640 
1641 
1642 /*!
1643 	Removes the specified key from the tree. The "value" parameter is only used
1644 	for trees which allow duplicates, so you may safely ignore it.
1645 	It's not an optional parameter, so at least you have to think about it.
1646 */
1647 status_t
1648 BPlusTree::Remove(Transaction &transaction, const uint8 *key, uint16 keyLength,
1649 	off_t value)
1650 {
1651 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1652 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH)
1653 		RETURN_ERROR(B_BAD_VALUE);
1654 
1655 	// lock access to stream
1656 	WriteLocked locked(fStream->Lock());
1657 
1658 	Stack<node_and_key> stack;
1659 	if (_SeekDown(stack, key, keyLength) != B_OK)
1660 		RETURN_ERROR(B_ERROR);
1661 
1662 	node_and_key nodeAndKey;
1663 	const bplustree_node *node;
1664 
1665 	CachedNode cached(this);
1666 	while (stack.Pop(&nodeAndKey)
1667 		&& (node = cached.SetTo(nodeAndKey.nodeOffset)) != NULL) {
1668 #ifdef DEBUG
1669 		NodeChecker checker(node, fNodeSize, "remove");
1670 #endif
1671 		if (node->IsLeaf()) {
1672 			// first round, check for duplicate entries
1673 			status_t status = _FindKey(node, key, keyLength, &nodeAndKey.keyIndex);
1674 			if (status < B_OK)
1675 				RETURN_ERROR(status);
1676 
1677 			// If we will remove the last key, the iterator will be set
1678 			// to the next node after the current - if there aren't any
1679 			// more nodes, we need a way to prevent the TreeIterators to
1680 			// touch the old node again, we use BPLUSTREE_FREE for this
1681 			off_t next = node->RightLink() == BPLUSTREE_NULL
1682 				? BPLUSTREE_FREE : node->RightLink();
1683 			_UpdateIterators(nodeAndKey.nodeOffset, node->NumKeys() == 1
1684 				? next : BPLUSTREE_NULL, nodeAndKey.keyIndex, 0 , -1);
1685 
1686 			// is this a duplicate entry?
1687 			if (bplustree_node::IsDuplicate(BFS_ENDIAN_TO_HOST_INT64(
1688 					node->Values()[nodeAndKey.keyIndex]))) {
1689 				if (fAllowDuplicates) {
1690 					return _RemoveDuplicate(transaction, node, cached,
1691 						nodeAndKey.keyIndex, value);
1692 				} else {
1693 					FATAL(("dupliate node found where no duplicates are allowed!\n"));
1694 					RETURN_ERROR(B_ERROR);
1695 				}
1696 			}
1697 		}
1698 
1699 		bplustree_node *writableNode = cached.MakeWritable(transaction);
1700 		if (writableNode == NULL)
1701 			return B_IO_ERROR;
1702 
1703 		// if it's an empty root node, we have to convert it
1704 		// to a leaf node by dropping the overflow link, or,
1705 		// if it's already a leaf node, just empty it
1706 		if (nodeAndKey.nodeOffset == fHeader->RootNode()
1707 			&& (node->NumKeys() == 0 || node->NumKeys() == 1 && node->IsLeaf())) {
1708 			writableNode->overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
1709 			writableNode->all_key_count = 0;
1710 			writableNode->all_key_length = 0;
1711 
1712 			// if we've made a leaf node out of the root node, we need
1713 			// to reset the maximum number of levels in the header
1714 			if (fHeader->MaxNumberOfLevels() != 1) {
1715 				bplustree_header *header = fCachedHeader.MakeWritableHeader(transaction);
1716 				if (header == NULL)
1717 					return B_IO_ERROR;
1718 
1719 				header->max_number_of_levels = HOST_ENDIAN_TO_BFS_INT32(1);
1720 			}
1721 			return B_OK;
1722 		}
1723 
1724 		// if there is only one key left, we don't have to remove
1725 		// it, we can just dump the node (index nodes still have
1726 		// the overflow link, so we have to drop the last key)
1727 		if (writableNode->NumKeys() > 1
1728 			|| !writableNode->IsLeaf() && writableNode->NumKeys() == 1) {
1729 			_RemoveKey(writableNode, nodeAndKey.keyIndex);
1730 			return B_OK;
1731 		}
1732 
1733 		// when we are here, we can just free the node, but
1734 		// we have to update the right/left link of the
1735 		// siblings first
1736 		CachedNode otherCached(this);
1737 		bplustree_node *other = otherCached.SetToWritable(transaction,
1738 			writableNode->LeftLink());
1739 		if (other != NULL)
1740 			other->right_link = writableNode->right_link;
1741 
1742 		if ((other = otherCached.SetToWritable(transaction, node->RightLink())) != NULL)
1743 			other->left_link = writableNode->left_link;
1744 
1745 		cached.Free(transaction, nodeAndKey.nodeOffset);
1746 	}
1747 	RETURN_ERROR(B_ERROR);
1748 }
1749 
1750 
1751 /*!
1752 	Replaces the value for the key in the tree.
1753 	Returns B_OK if the key could be found and its value replaced,
1754 	B_ENTRY_NOT_FOUND if the key couldn't be found, and other errors
1755 	to indicate that something went terribly wrong.
1756 	Note that this doesn't work with duplicates - it will just
1757 	return B_BAD_TYPE if you call this function on a tree where
1758 	duplicates are allowed.
1759 */
1760 status_t
1761 BPlusTree::Replace(Transaction &transaction, const uint8 *key,
1762 	uint16 keyLength, off_t value)
1763 {
1764 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1765 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1766 		|| key == NULL)
1767 		RETURN_ERROR(B_BAD_VALUE);
1768 
1769 	if (fAllowDuplicates)
1770 		RETURN_ERROR(B_BAD_TYPE);
1771 
1772 	// lock access to stream (a read lock is okay for this purpose)
1773 	ReadLocked locked(fStream->Lock());
1774 
1775 	off_t nodeOffset = fHeader->RootNode();
1776 	CachedNode cached(this);
1777 	const bplustree_node *node;
1778 
1779 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1780 		uint16 keyIndex = 0;
1781 		off_t nextOffset;
1782 		status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1783 
1784 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1785 			if (status == B_OK) {
1786 				bplustree_node *writableNode = cached.MakeWritable(transaction);
1787 				if (writableNode != NULL)
1788 					writableNode->Values()[keyIndex] = HOST_ENDIAN_TO_BFS_INT64(value);
1789 				else
1790 					status = B_IO_ERROR;
1791 			}
1792 
1793 			return status;
1794 		} else if (nextOffset == nodeOffset)
1795 			RETURN_ERROR(B_ERROR);
1796 
1797 		nodeOffset = nextOffset;
1798 	}
1799 	RETURN_ERROR(B_ERROR);
1800 }
1801 
1802 
1803 /*!
1804 	Searches the key in the tree, and stores the offset found in
1805 	_value, if successful.
1806 	It's very similar to BPlusTree::SeekDown(), but doesn't fill
1807 	a stack while it descends the tree.
1808 	Returns B_OK when the key could be found, B_ENTRY_NOT_FOUND
1809 	if not. It can also return other errors to indicate that
1810 	something went wrong.
1811 	Note that this doesn't work with duplicates - it will just
1812 	return B_BAD_TYPE if you call this function on a tree where
1813 	duplicates are allowed.
1814 */
1815 status_t
1816 BPlusTree::Find(const uint8 *key, uint16 keyLength, off_t *_value)
1817 {
1818 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH
1819 		|| keyLength > BPLUSTREE_MAX_KEY_LENGTH
1820 		|| key == NULL)
1821 		RETURN_ERROR(B_BAD_VALUE);
1822 
1823 	if (fAllowDuplicates)
1824 		RETURN_ERROR(B_BAD_TYPE);
1825 
1826 	// lock access to stream
1827 	ReadLocked locked(fStream->Lock());
1828 
1829 	off_t nodeOffset = fHeader->RootNode();
1830 	CachedNode cached(this);
1831 	const bplustree_node *node;
1832 
1833 #ifdef DEBUG
1834 	int32 levels = 0;
1835 #endif
1836 
1837 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1838 		uint16 keyIndex = 0;
1839 		off_t nextOffset;
1840 		status_t status = _FindKey(node, key, keyLength, &keyIndex, &nextOffset);
1841 
1842 #ifdef DEBUG
1843 		levels++;
1844 #endif
1845 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1846 			if (status == B_OK && _value != NULL)
1847 				*_value = BFS_ENDIAN_TO_HOST_INT64(node->Values()[keyIndex]);
1848 
1849 #ifdef DEBUG
1850 			if (levels != (int32)fHeader->MaxNumberOfLevels())
1851 				DEBUGGER(("levels don't match"));
1852 #endif
1853 			return status;
1854 		} else if (nextOffset == nodeOffset)
1855 			RETURN_ERROR(B_ERROR);
1856 
1857 		nodeOffset = nextOffset;
1858 	}
1859 	FATAL(("b+tree node at %Ld could not be loaded\n", nodeOffset));
1860 	RETURN_ERROR(B_ERROR);
1861 }
1862 
1863 
1864 //	#pragma mark -
1865 
1866 
// Creates an iterator for \a tree; the iterator is not positioned yet
// (fCurrentNodeOffset is BPLUSTREE_NULL until Goto()/Traverse() is called).
TreeIterator::TreeIterator(BPlusTree *tree)
	:
	fTree(tree),
	fCurrentNodeOffset(BPLUSTREE_NULL),
	fNext(NULL)
{
	// register with the tree, so that it can update this iterator when
	// nodes are split or removed
	tree->_AddIterator(this);
}
1875 
1876 
1877 TreeIterator::~TreeIterator()
1878 {
1879 	if (fTree)
1880 		fTree->_RemoveIterator(this);
1881 }
1882 
1883 
1884 status_t
1885 TreeIterator::Goto(int8 to)
1886 {
1887 	if (fTree == NULL || fTree->fHeader == NULL)
1888 		RETURN_ERROR(B_BAD_VALUE);
1889 
1890 	// lock access to stream
1891 	ReadLocked locked(fTree->fStream->Lock());
1892 
1893 	off_t nodeOffset = fTree->fHeader->RootNode();
1894 	CachedNode cached(fTree);
1895 	const bplustree_node *node;
1896 
1897 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
1898 		// is the node a leaf node?
1899 		if (node->OverflowLink() == BPLUSTREE_NULL) {
1900 			fCurrentNodeOffset = nodeOffset;
1901 			fCurrentKey = to == BPLUSTREE_BEGIN ? -1 : node->NumKeys();
1902 			fDuplicateNode = BPLUSTREE_NULL;
1903 
1904 			return B_OK;
1905 		}
1906 
1907 		// get the next node offset depending on the direction (and if there
1908 		// are any keys in that node at all)
1909 		off_t nextOffset;
1910 		if (to == BPLUSTREE_END || node->all_key_count == 0)
1911 			nextOffset = node->OverflowLink();
1912 		else {
1913 			if (node->AllKeyLength() > fTree->fNodeSize
1914 				|| (addr_t)node->Values() > (addr_t)node + fTree->fNodeSize
1915 					- 8 * node->NumKeys())
1916 				RETURN_ERROR(B_ERROR);
1917 
1918 			nextOffset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[0]);
1919 		}
1920 		if (nextOffset == nodeOffset)
1921 			break;
1922 
1923 		nodeOffset = nextOffset;
1924 	}
1925 	FATAL(("%s fails\n", __FUNCTION__));
1926 
1927 	RETURN_ERROR(B_ERROR);
1928 }
1929 
1930 
1931 /*!
1932 	Iterates through the tree in the specified direction.
1933 	When it iterates through duplicates, the "key" is only updated for the
1934 	first entry - if you need to know when this happens, use the "duplicate"
1935 	parameter which is 0 for no duplicate, 1 for the first, and 2 for all
1936 	the other duplicates.
1937 	That's not too nice, but saves the 256 bytes that would be needed to
1938 	store the last key - if this will ever become an issue, it will be
1939 	easy to change.
1940 	The other advantage of this is, that the queries can skip all duplicates
1941 	at once when they are not relevant to them.
1942 */
status_t
TreeIterator::Traverse(int8 direction, void *key, uint16 *keyLength,
	uint16 maxLength, off_t *value, uint16 *duplicate)
{
	// Moves the iterator one entry in "direction" and returns that entry:
	// the key is copied into "key" (at most "maxLength" bytes), its length
	// into "*keyLength", and its value into "*value". See the comment
	// above for the optional "duplicate" out parameter.

	// Stop() detaches the iterator from its tree
	if (fTree == NULL)
		return B_INTERRUPTED;

	bool forward = direction == BPLUSTREE_FORWARD;

	// not yet positioned - start out at the respective end of the tree
	if (fCurrentNodeOffset == BPLUSTREE_NULL
		&& Goto(forward ? BPLUSTREE_BEGIN : BPLUSTREE_END) < B_OK)
		RETURN_ERROR(B_ERROR);

	// if the tree was emptied since the last call
	if (fCurrentNodeOffset == BPLUSTREE_FREE)
		return B_ENTRY_NOT_FOUND;

	// lock access to stream
	ReadLocked locked(fTree->fStream->Lock());

	CachedNode cached(fTree);
	const bplustree_node *node;

	if (fDuplicateNode != BPLUSTREE_NULL) {
		// regardless of traverse direction the duplicates are always presented in
		// the same order; since they are all considered as equal, this shouldn't
		// cause any problems

		if (!fIsFragment || fDuplicate < fNumDuplicates)
			node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
		else
			node = NULL;

		if (node != NULL) {
			if (!fIsFragment && fDuplicate >= fNumDuplicates) {
				// if the node is out of duplicates, we go directly to the next one
				fDuplicateNode = node->RightLink();
				if (fDuplicateNode != BPLUSTREE_NULL
					&& (node = cached.SetTo(fDuplicateNode, false)) != NULL) {
					fNumDuplicates = node->CountDuplicates(fDuplicateNode, false);
					fDuplicate = 0;
				}
			}
			if (fDuplicate < fNumDuplicates) {
				// hand out the next duplicate of the current key
				*value = node->DuplicateAt(fDuplicateNode, fIsFragment, fDuplicate++);
				if (duplicate)
					*duplicate = 2;
				return B_OK;
			}
		}
		// all duplicates are consumed - fall through to the next real key
		fDuplicateNode = BPLUSTREE_NULL;
	}

	// remember this position so we can turn back if there is no next entry
	off_t savedNodeOffset = fCurrentNodeOffset;
	int32 savedKey = fCurrentKey;

	if ((node = cached.SetTo(fCurrentNodeOffset)) == NULL)
		RETURN_ERROR(B_ERROR);

	if (duplicate)
		*duplicate = 0;

	// "direction" is +1 (BPLUSTREE_FORWARD) or -1, so this advances the
	// key index within the current leaf
	fCurrentKey += direction;

	// is the current key in the current node?
	while ((forward && fCurrentKey >= node->NumKeys())
			|| (!forward && fCurrentKey < 0)) {
		fCurrentNodeOffset = forward ? node->RightLink() : node->LeftLink();

		// are there any more nodes?
		if (fCurrentNodeOffset != BPLUSTREE_NULL) {
			node = cached.SetTo(fCurrentNodeOffset);
			if (!node)
				RETURN_ERROR(B_ERROR);

			// reset current key
			fCurrentKey = forward ? 0 : node->NumKeys();
		} else {
			// there are no nodes left, so turn back to the last key
			fCurrentNodeOffset = savedNodeOffset;
			fCurrentKey = savedKey;

			return B_ENTRY_NOT_FOUND;
		}
	}

	// an empty node at this point indicates corrupt on-disk data
	if (node->all_key_count == 0)
		RETURN_ERROR(B_ERROR);	// B_ENTRY_NOT_FOUND ?

	uint16 length;
	uint8 *keyStart = node->KeyAt(fCurrentKey, &length);
	// make sure the key (plus its value and length entry) lies completely
	// within the node before touching it
	if (keyStart + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)node + fTree->fNodeSize
		|| length > BPLUSTREE_MAX_KEY_LENGTH) {
		fTree->fStream->GetVolume()->Panic();
		RETURN_ERROR(B_BAD_DATA);
	}

	length = min_c(length, maxLength);
	memcpy(key, keyStart, length);

	if (fTree->fHeader->data_type == BPLUSTREE_STRING_TYPE)	{
		// terminate string type
		if (length == maxLength)
			length--;
		((char *)key)[length] = '\0';
	}
	*keyLength = length;

	off_t offset = BFS_ENDIAN_TO_HOST_INT64(node->Values()[fCurrentKey]);

	// duplicate fragments?
	uint8 type = bplustree_node::LinkType(offset);
	if (type == BPLUSTREE_DUPLICATE_FRAGMENT || type == BPLUSTREE_DUPLICATE_NODE) {
		fDuplicateNode = offset;

		node = cached.SetTo(bplustree_node::FragmentOffset(fDuplicateNode), false);
		if (node == NULL)
			RETURN_ERROR(B_ERROR);

		fIsFragment = type == BPLUSTREE_DUPLICATE_FRAGMENT;

		fNumDuplicates = node->CountDuplicates(offset, fIsFragment);
		if (fNumDuplicates) {
			// return the first duplicate value instead of the link to them
			offset = node->DuplicateAt(offset, fIsFragment, 0);
			fDuplicate = 1;
			if (duplicate)
				*duplicate = 1;
		} else {
			// shouldn't happen, but we're dealing here with potentially corrupt disks...
			fDuplicateNode = BPLUSTREE_NULL;
			offset = 0;
		}
	}
	*value = offset;

	return B_OK;
}
2080 
2081 
2082 /*!
2083 	This is more or less a copy of BPlusTree::Find() - but it just
2084 	sets the current position in the iterator, regardless of if the
2085 	key could be found or not.
2086 */
2087 status_t
2088 TreeIterator::Find(const uint8 *key, uint16 keyLength)
2089 {
2090 	if (fTree == NULL)
2091 		return B_INTERRUPTED;
2092 	if (keyLength < BPLUSTREE_MIN_KEY_LENGTH || keyLength > BPLUSTREE_MAX_KEY_LENGTH
2093 		|| key == NULL)
2094 		RETURN_ERROR(B_BAD_VALUE);
2095 
2096 	// lock access to stream
2097 	ReadLocked locked(fTree->fStream->Lock());
2098 
2099 	off_t nodeOffset = fTree->fHeader->RootNode();
2100 
2101 	CachedNode cached(fTree);
2102 	const bplustree_node *node;
2103 	while ((node = cached.SetTo(nodeOffset)) != NULL) {
2104 		uint16 keyIndex = 0;
2105 		off_t nextOffset;
2106 		status_t status = fTree->_FindKey(node, key, keyLength, &keyIndex,
2107 			&nextOffset);
2108 
2109 		if (node->OverflowLink() == BPLUSTREE_NULL) {
2110 			fCurrentNodeOffset = nodeOffset;
2111 			fCurrentKey = keyIndex - 1;
2112 			fDuplicateNode = BPLUSTREE_NULL;
2113 
2114 			return status;
2115 		} else if (nextOffset == nodeOffset)
2116 			RETURN_ERROR(B_ERROR);
2117 
2118 		nodeOffset = nextOffset;
2119 	}
2120 	RETURN_ERROR(B_ERROR);
2121 }
2122 
2123 
void
TreeIterator::SkipDuplicates()
{
	// Makes the next Traverse() call move on to the next real key instead
	// of handing out the remaining duplicates of the current one.
	fDuplicateNode = BPLUSTREE_NULL;
}
2129 
2130 
void
TreeIterator::Update(off_t offset, off_t nextOffset, uint16 keyIndex, uint16 splitAt, int8 change)
{
	// Adjusts the iterator's position after the tree has been modified, so
	// that it keeps pointing at the same key.
	// Parameter meaning inferred from the adjustments below - confirm
	// against the callers in BPlusTree: "offset" is the node that was
	// modified, "nextOffset" the node created by a split (BPLUSTREE_NULL
	// if none), "splitAt" the key index the node was split at, "keyIndex"
	// the index at which keys were inserted/removed, and "change" the
	// amount the key count changed by.

	// we only care about changes to the node the iterator points into
	if (offset != fCurrentNodeOffset)
		return;

	if (nextOffset != BPLUSTREE_NULL) {
		// the node was split - follow it to the new node
		// NOTE(review): the offset is updated even when fCurrentKey stays
		// below "splitAt"; verify this against the split logic
		fCurrentNodeOffset = nextOffset;
		if (splitAt <= fCurrentKey) {
			fCurrentKey -= splitAt;
			keyIndex -= splitAt;
		}
	}

	// Adjust fCurrentKey to point to the same key as before.
	// Note, that if a key is inserted at the current position
	// it won't be included in this tree transition.
	if (keyIndex <= fCurrentKey)
		fCurrentKey += change;

	// ToDo: duplicate handling!
}
2153 
2154 
void
TreeIterator::Stop()
{
	// Detaches the iterator from its tree - subsequent Traverse()/Find()
	// calls will then return B_INTERRUPTED (presumably invoked when the
	// tree goes away while iterators are still alive).
	fTree = NULL;
}
2160 
2161 
#ifdef DEBUG
void
TreeIterator::Dump()
{
	// Dumps the iterator's complete state to the debug output.
	__out("TreeIterator at %p:\n", this);
	__out("\tfTree = %p\n", fTree);
	__out("\tfCurrentNodeOffset = %Ld\n", fCurrentNodeOffset);
	__out("\tfCurrentKey = %ld\n", fCurrentKey);
	__out("\tfDuplicateNode = %Ld (%Ld, 0x%Lx)\n", bplustree_node::FragmentOffset(fDuplicateNode), fDuplicateNode, fDuplicateNode);
	__out("\tfDuplicate = %u\n", fDuplicate);
	__out("\tfNumDuplicates = %u\n", fNumDuplicates);
	__out("\tfIsFragment = %s\n", fIsFragment ? "true" : "false");
}
#endif
2176 
2177 
2178 //	#pragma mark -
2179 
2180 
2181 void
2182 bplustree_node::Initialize()
2183 {
2184 	left_link = right_link = overflow_link = HOST_ENDIAN_TO_BFS_INT64((uint64)BPLUSTREE_NULL);
2185 	all_key_count = 0;
2186 	all_key_length = 0;
2187 }
2188 
2189 
2190 uint8 *
2191 bplustree_node::KeyAt(int32 index, uint16 *keyLength) const
2192 {
2193 	if (index < 0 || index > NumKeys())
2194 		return NULL;
2195 
2196 	uint8 *keyStart = Keys();
2197 	uint16 *keyLengths = KeyLengths();
2198 
2199 	*keyLength = BFS_ENDIAN_TO_HOST_INT16(keyLengths[index])
2200 		- (index != 0 ? BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]) : 0);
2201 	if (index > 0)
2202 		keyStart += BFS_ENDIAN_TO_HOST_INT16(keyLengths[index - 1]);
2203 
2204 	return keyStart;
2205 }
2206 
2207 
2208 uint8
2209 bplustree_node::CountDuplicates(off_t offset, bool isFragment) const
2210 {
2211 	// the duplicate fragment handling is currently hard-coded to a node size
2212 	// of 1024 bytes - with future versions of BFS, this may be a problem
2213 
2214 	if (isFragment) {
2215 		uint32 fragment = (NUM_FRAGMENT_VALUES + 1) * ((uint64)offset & 0x3ff);
2216 
2217 		return ((off_t *)this)[fragment];
2218 	}
2219 	return OverflowLink();
2220 }
2221 
2222 
2223 off_t
2224 bplustree_node::DuplicateAt(off_t offset, bool isFragment, int8 index) const
2225 {
2226 	uint32 start;
2227 	if (isFragment)
2228 		start = 8 * ((uint64)offset & 0x3ff);
2229 	else
2230 		start = 2;
2231 
2232 	return ((off_t *)this)[start + 1 + index];
2233 }
2234 
2235 
2236 /*!	Although the name suggests it, this function doesn't return the real
2237 	used fragment count; at least, it can only count to two: it returns
2238 	0, if there is no fragment used, 1 if there is only one fragment
2239 	used, and 2 if there are at least 2 fragments used.
2240 */
2241 uint32
2242 bplustree_node::FragmentsUsed(uint32 nodeSize) const
2243 {
2244 	uint32 used = 0;
2245 	for (uint32 i = 0; i < MaxFragments(nodeSize); i++) {
2246 		duplicate_array *array = FragmentAt(i);
2247 		if (array->count > 0 && ++used > 1)
2248 			return used;
2249 	}
2250 	return used;
2251 }
2252 
2253 
#ifdef DEBUG
status_t
bplustree_node::CheckIntegrity(uint32 nodeSize) const
{
	// Sanity checks the node: the header counts must fit the node size,
	// every key (plus its value and length entry) must lie completely
	// within the node, and no value may be -1.
	// Note that the first check only invokes the debugger and then falls
	// through without returning an error.
	if (NumKeys() > nodeSize || AllKeyLength() > nodeSize)
		DEBUGGER(("invalid node: key/length count"));

	for (int32 i = 0; i < NumKeys(); i++) {
		uint16 length;
		uint8 *key = KeyAt(i, &length);
		// the key data, its value (off_t), and its length entry (uint16)
		// must all fit into the node
		if (key + length + sizeof(off_t) + sizeof(uint16) > (uint8 *)this + nodeSize
			|| length > BPLUSTREE_MAX_KEY_LENGTH) {
			dprintf("node %p, key %ld\n", this, i);
			DEBUGGER(("invalid node: keys corrupted"));
			return B_BAD_DATA;
		}
		if (Values()[i] == -1) {
			dprintf("node %p, value %ld: %Ld\n", this, i, Values()[i]);
			DEBUGGER(("invalid node: values corrupted"));
			return B_BAD_DATA;
		}
	}
	return B_OK;
}
#endif
2279 
2280 
2281 //	#pragma mark -
2282 
2283 
2284 int32
2285 compareKeys(type_code type, const void *key1, int keyLength1,
2286 	const void *key2, int keyLength2)
2287 {
2288 	// if one of the keys is NULL, bail out gracefully
2289 	if (key1 == NULL || key2 == NULL) {
2290 		// even if it's most probably a bug in the calling code, we try to
2291 		// give a meaningful result
2292 		if (key1 == NULL && key2 != NULL)
2293 			return 1;
2294 		else if (key2 == NULL && key1 != NULL)
2295 			return -1;
2296 
2297 		return 0;
2298 	}
2299 
2300 	switch (type) {
2301 	    case B_STRING_TYPE:
2302     	{
2303 			int len = min_c(keyLength1, keyLength2);
2304 			int result = strncmp((const char *)key1, (const char *)key2, len);
2305 
2306 			if (result == 0
2307 				&& !(((const char *)key1)[len] == '\0' && ((const char *)key2)[len] == '\0'))
2308 				result = keyLength1 - keyLength2;
2309 
2310 			return result;
2311 		}
2312 
2313 		case B_SSIZE_T_TYPE:
2314 		case B_INT32_TYPE:
2315 			return *(int32 *)key1 - *(int32 *)key2;
2316 
2317 		case B_SIZE_T_TYPE:
2318 		case B_UINT32_TYPE:
2319 			if (*(uint32 *)key1 == *(uint32 *)key2)
2320 				return 0;
2321 			else if (*(uint32 *)key1 > *(uint32 *)key2)
2322 				return 1;
2323 
2324 			return -1;
2325 
2326 		case B_OFF_T_TYPE:
2327 		case B_INT64_TYPE:
2328 			if (*(int64 *)key1 == *(int64 *)key2)
2329 				return 0;
2330 			else if (*(int64 *)key1 > *(int64 *)key2)
2331 				return 1;
2332 
2333 			return -1;
2334 
2335 		case B_UINT64_TYPE:
2336 			if (*(uint64 *)key1 == *(uint64 *)key2)
2337 				return 0;
2338 			else if (*(uint64 *)key1 > *(uint64 *)key2)
2339 				return 1;
2340 
2341 			return -1;
2342 
2343 		case B_FLOAT_TYPE:
2344 		{
2345 			float result = *(float *)key1 - *(float *)key2;
2346 			if (result == 0.0f)
2347 				return 0;
2348 
2349 			return (result < 0.0f) ? -1 : 1;
2350 		}
2351 
2352 		case B_DOUBLE_TYPE:
2353 		{
2354 			double result = *(double *)key1 - *(double *)key2;
2355 			if (result == 0.0)
2356 				return 0;
2357 
2358 			return (result < 0.0) ? -1 : 1;
2359 		}
2360 	}
2361 
2362 	// if the type is unknown, the entries don't match...
2363 	return -1;
2364 }
2365 
2366